More build updates:

(1) nccl and cnmem submodules.
(2) MPI ops fallback test.
(3) A bit more Blob interface.
(4) Fixed tests.
(5) caffe2.python.io -> caffe2.python.dataio to avoid name conflicts.
(6) In the build system, autogenerate __init__.py instead of having manual
    rules just to copy over an empty __init__.py.
Author: Yangqing Jia
Date:   2016-07-22 23:58:24 -07:00
Parent: b2c2d0b70c
Commit: 1ede7a7ff0

29 changed files with 156 additions and 1629 deletions

.gitmodules (vendored): 6 lines changed

@@ -1,3 +1,9 @@
 [submodule "third_party/pybind11"]
 	path = third_party/pybind11
 	url = https://github.com/pybind/pybind11.git
+[submodule "third_party/nccl"]
+	path = third_party/nccl
+	url = https://github.com/nvidia/nccl.git
+[submodule "third_party/cnmem"]
+	path = third_party/cnmem
+	url = https://github.com/nvidia/cnmem.git
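
Note for existing checkouts: the new submodules have to be fetched explicitly, e.g. with git submodule update --init third_party/nccl third_party/cnmem (or plain git submodule update --init for everything).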


@@ -16,6 +16,6 @@ lint:
 	@find caffe2 -type f -exec python brewtool/cpplint.py {} \;
 
 linecount:
-	@cloc --read-lang-def=brewtool/caffe.cloc caffe2 pycaffe2 || \
+	@cloc --read-lang-def=brewtool/caffe.cloc caffe2 || \
 		echo "Cloc is not available on the machine. You can install cloc with " && \
 		echo "  sudo apt-get install cloc"


@@ -108,6 +108,7 @@ class Config(object):
         'arch=compute_30,code=sm_30',
         'arch=compute_35,code=sm_35',
         'arch=compute_50,code=sm_50',
+        'arch=compute_61,code=sm_61',
     ]
     # additional CUDA cflags to pass to nvcc.
     CUDA_CFLAGS = []
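
The new compute_61/sm_61 entry targets Pascal-class GPUs (e.g. GTX 1070/1080 and the Pascal Titan X) and needs CUDA 8 or newer to compile; presumably each entry here ends up on the nvcc command line as a -gencode flag, i.e. -gencode arch=compute_61,code=sm_61.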


@@ -1,4 +0,0 @@
filegroup(
    name = "caffe_python",
    srcs = ["__init__.py"],
)


@@ -4,11 +4,3 @@ proto_library(
     name = 'caffe_proto',
     srcs = ['caffe.proto'],
 )
-
-filegroup(
-    name = "caffe_proto_py",
-    srcs = ["__init__.py"],
-    deps = [
-        "//caffe:caffe_python",
-    ]
-)


@@ -26,7 +26,7 @@ cc_library(
     deps = [
         ":core",
         ":core_gpu_cu",
-        "//third_party/cnmem:cnmem",
+        "//third_party:cnmem",
         "//third_party:cuda",
     ],
     whole_archive = True,
@@ -48,6 +48,7 @@ cc_test(
         excludes=["*gpu_test*"]),
     deps = [
         ":core",
+        "//caffe2/operators:core_ops",
         "//third_party:gtest",
         "//caffe2/test:caffe2_gtest_main",
     ],
@@ -63,11 +64,6 @@ cc_test(
     ],
 )
 
-filegroup(
-    name = "caffe2_python",
-    srcs = ["__init__.py"],
-)
-
 cc_library(
     name = "all_available_ops",
     srcs = [],
@@ -79,6 +75,7 @@ cc_library(
     optional_deps = [
         "//caffe2/operators:core_ops_gpu",
         "//caffe2/operators:core_ops_cudnn",
+        "//caffe2/contrib/nccl:nccl_ops",
         "//caffe2/cuda_rtc:rtc_ops",
         "//caffe2/db:db_gpu",
         "//caffe2/image:image_ops",


@@ -1,5 +0,0 @@
"""
Caffe2: A General Tool for Neural Networks.
"""
__author__ = 'Yangqing Jia'

caffe2/contrib/nccl/BREW (new file): 10 additions

@@ -0,0 +1,10 @@
cc_library(
    name = "nccl_ops",
    srcs = Glob(["*.cc"]),
    hdrs = Glob(["*.h"]),
    deps = [
        "//caffe2:core_gpu",
        "//third_party:nccl",
    ],
    whole_archive = True,
)


@@ -58,6 +58,9 @@ class Blob {
     return *static_cast<const T*>(pointer_);
   }
 
+  const void* GetRaw() const { return pointer_; }
+  void* GetRaw() { return pointer_; }
+
   /**
    * @brief Gets a mutable pointer to the stored object.
    *
@@ -73,6 +76,7 @@
       return static_cast<T*>(pointer_);
     } else {
       if (is_new_object) *is_new_object = true;
+      VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
       return Reset<T>(new T());
     }
   }
@@ -87,28 +91,53 @@
    */
   template <class T>
   T* Reset(T* allocated) {
-    if (pointer_) {
-      CHECK_NOTNULL(destroy_)(pointer_);
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
     }
-    VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
     meta_ = TypeMeta::Make<T>();
     pointer_ = static_cast<void*>(allocated);
     destroy_ = &Destroy<T>;
     return allocated;
   }
 
+  /**
+   * Sets the underlying object to the allocated one, but does not take over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed.
+   *
+   * Unlike Reset, this does not take over the ownership of the pointer and the
+   * caller is responsible for making sure that the lifetime of the allocated
+   * blob outlasts the lifetime of any access to this blob, until another Reset
+   * call is made or the blob is destructed.
+   */
+  template <class T>
+  typename std::remove_const<T>::type* ShareExternal(
+      typename std::remove_const<T>::type* allocated) {
+    return static_cast<T*>(
+        ShareExternal(static_cast<void*>(allocated),
+                      TypeMeta::Make<typename std::remove_const<T>::type>()));
+  }
+
+  void* ShareExternal(void* allocated, const TypeMeta& meta) {
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
+    }
+    meta_ = meta;
+    pointer_ = static_cast<void*>(allocated);
+    destroy_ = nullptr;
+    return allocated;
+  }
+
   /**
    * Resets the Blob to an empty one.
    */
   inline void Reset() {
-    if (pointer_) {
-      CHECK_NOTNULL(destroy_)(pointer_);
-      pointer_ = nullptr;
-      meta_ = TypeMeta();
-      destroy_ = nullptr;
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
     }
+    pointer_ = nullptr;
+    meta_ = TypeMeta();
+    destroy_ = nullptr;
   }
 
   /**
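
Since the comment above spells out the new ownership contract, a minimal sketch of the two modes may help (Foo stands in for any stored type; illustrative only, not part of this diff):

  Blob owned, shared;
  owned.Reset(new Foo());       // Blob takes ownership: destroy_ is set, so a later
                                // Reset() or blob destruction deletes the Foo.
  Foo external;
  shared.ShareExternal<Foo>(&external);
  // Blob only points at external: destroy_ stays nullptr, so Reset() merely drops
  // the pointer and the caller must keep external alive while the blob is used.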


@@ -69,6 +69,32 @@ TEST(BlobTest, BlobWrongType) {
   ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
 }
 
+TEST(BlobTest, BlobReset) {
+  Blob blob;
+  std::unique_ptr<Foo> foo(new Foo());
+  EXPECT_TRUE(blob.Reset(foo.release()) != nullptr);
+  // Also test that Reset works.
+  blob.Reset();
+}
+
+TEST(BlobTest, BlobShareExternalPointer) {
+  Blob blob;
+  std::unique_ptr<Foo> foo(new Foo());
+  EXPECT_EQ(blob.ShareExternal<Foo>(foo.get()), foo.get());
+  EXPECT_TRUE(blob.IsType<Foo>());
+  // Also test that Reset works.
+  blob.Reset();
+}
+
+TEST(BlobTest, BlobShareExternalObject) {
+  Blob blob;
+  Foo foo;
+  EXPECT_EQ(blob.ShareExternal<Foo>(&foo), &foo);
+  EXPECT_TRUE(blob.IsType<Foo>());
+  // Also test that Reset works.
+  blob.Reset();
+}
+
 TEST(BlobTest, StringSerialization) {
   const std::string kTestString = "Hello world?";
   Blob blob;
@@ -558,6 +584,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
       "DUMMY_ENGINE");
   Workspace ws;
   auto load_op = CreateOperator(op_def, &ws);
+  EXPECT_TRUE(load_op != nullptr);
   LOG(INFO) << "Running operator";
   load_op->Run();


@@ -1,3 +1,4 @@
+#include <chrono>
 #include <future>
 #include <random>
 #include <thread>
@@ -55,6 +56,8 @@ namespace {
 void TEST_GetStreamAddress(cudaStream_t* ptr) {
   CUDAContext context(0);
   *ptr = context.cuda_stream();
+  // Sleep for a while so we have concurrent thread executions
+  std::this_thread::sleep_for(std::chrono::seconds(1));
 }
 }  // namespace


@@ -35,9 +35,13 @@ class MPIBroadcastOp final : public Operator<Context> {
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
+    CAFFE_ENFORCE(OperatorBase::OutputIsType<Tensor<Context>>(0),
+                  "Output is of wrong type.");
     auto* output = Output(0);
     // Make sure that output is already allocated.
-    CHECK_GT(output->size(), 0);
+    CAFFE_ENFORCE(output->size() > 0,
+                  "Broadcast op uses in-place operation so the output "
+                  "should be already allocated.");
     MPI_CHECK(MPI_Bcast(
         output->raw_mutable_data(),
         output->nbytes(),
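
The new enforce documents the existing contract rather than changing it: MPIBroadcast runs in place, so the output blob must already hold an allocated tensor of the right size (presumably because the same blob is also an input filled by an earlier operator) before the broadcast is issued.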


@@ -54,8 +54,18 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
   bool RunOnDevice() override {
     for (int i = 0; i < InputSize(); ++i) {
-      local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
-          Input(i), &context_);
+      if (OperatorBase::InputIsType<TensorCUDA>(i)) {
+        local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
+            Input(i), &context_);
+      } else {
+        VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
+        // Note(jiayq): This removes a const but conceptually
+        // local_input_blobs will only be used as const blob input for the
+        // base op so we are still fine.
+        local_input_blobs_[i]->ShareExternal(
+            const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
+            OperatorBase::Inputs()[i]->meta());
+      }
     }
     // Sync to make sure copies are done.
     context_.FinishDeviceComputation();
@@ -65,6 +75,9 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
       return false;
     }
     for (int i = 0; i < OutputSize(); ++i) {
+      CAFFE_ENFORCE(local_output_blobs_[i]->IsType<TensorCPU>(),
+                    "GPU fallback op currently does not support non-TensorCPU "
+                    "output type.");
       Output(i)->CopyFrom(
           local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
     }
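
For context, a hedged sketch of how the wrapper is meant to be used, assuming it stays templated on the wrapped CPU operator type as in the rest of this header (the operator name and class below are hypothetical, not part of this diff):

  // Expose a hypothetical CPU-only operator on CUDA via the fallback wrapper.
  REGISTER_CUDA_OPERATOR(MyCPUOnly, GPUFallbackOp<MyCPUOnlyOp>);

At run time the wrapper copies TensorCUDA inputs to TensorCPU, shares every other input into the local workspace through Blob::ShareExternal (the new branch above), runs the CPU implementation, and copies the TensorCPU outputs back.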


@@ -4,11 +4,3 @@ proto_library(
     name = 'caffe2_proto',
     srcs = Glob(['*.proto']),
 )
-
-filegroup(
-    name = "caffe2_proto_py",
-    srcs = ["__init__.py"],
-    deps = [
-        "//caffe2:caffe2_python",
-    ]
-)


@@ -39,8 +39,8 @@ py_library(
     srcs=Glob(["*.py"], excludes=["*_test.py"]),
     deps=[
         ":caffe2_python_cpu",
-        "//caffe/proto:caffe_proto_py",
-        "//caffe2/proto:caffe2_proto_py",
+        "//caffe/proto:caffe_proto",
+        "//caffe2/proto:caffe2_proto",
         "//caffe2/python/mint:mint",
     ],
     optional_deps=[


@@ -1,4 +0,0 @@
import atexit
from . import core, utils, workspace
from caffe2.proto import caffe2_pb2


@@ -14,7 +14,7 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 from caffe2.python import core, workspace
-from caffe2.python.io import Reader, Writer
+from caffe2.python.dataio import Reader, Writer
 from caffe2.python.schema import Struct
 import numpy as np


@@ -1,7 +1,6 @@
 py_library(
     name = "mint",
     srcs = [
-        "__init__.py",
         "app.py",
         "static/css/simple-sidebar.css",
         "templates/index.html",

third_party/BREW (vendored): 47 lines changed

@@ -92,10 +92,49 @@ cc_thirdparty_target(
     ],
 )
 
-cc_thirdparty_target(
-    name="cnmen",
-    deps=["//third_party/cnmem:cnmem"],
-    cc_obj_files = [],
+shell_script(
+    name = "cnmem_header",
+    srcs = ["cnmem/include/cnmem.h"],
+    commands=[
+        "DST=$CAFFE2_GENDIR/third_party/include/",
+        "mkdir -p $DST",
+        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem/include/cnmem.h $DST/",
+    ],
+)
+
+cc_library(
+    name = "cnmem",
+    srcs = [
+        "cnmem/src/cnmem.cpp",
+    ],
+    deps = [
+        ":cnmem_header",
+        ":cuda",
+    ]
+)
+
+shell_script(
+    name = "nccl_header",
+    srcs = ["nccl/src/nccl.h"],
+    commands=[
+        "DST=$CAFFE2_GENDIR/third_party/include/",
+        "mkdir -p $DST",
+        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/nccl/src/nccl.h $DST/",
+    ],
+)
+
+cuda_library(
+    name = "nccl",
+    srcs = Glob(["nccl/src/*.cu"]),
+    deps = [
+        ":nccl_header",
+        ":cuda",
+    ],
+    compiler_flags=[
+        "-Wno-switch",  # NCCL does not follow strict switch enum check.
+        "-DNCCL_MAJOR=1 -DNCCL_MINOR=2 -DNCCL_PATCH=3",
+        "-DCUDA_MAJOR=__CUDACC_VER_MAJOR__ -DCUDA_MINOR=__CUDACC_VER_MINOR__",
+    ],
 )
 
 ###############################################################################

third_party/cnmem (new vendored submodule): 1 addition

@@ -0,0 +1 @@
Subproject commit 28a182d49529da49f4ac4e3941cec3edf16b3540


@@ -1,24 +0,0 @@
# We need to copy over the header to the right folder.
shell_script(
    name = "cnmem_header",
    srcs = ["cnmem.h"],
    commands=[
        "DST=$CAFFE2_GENDIR/third_party/include/",
        "mkdir -p $DST",
        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem.h $DST/",
    ],
)

cuda_library(
    name = "cnmem",
    srcs = [
        "cnmem.cpp",
    ],
    hdrs = [
        "cnmem.h",
    ],
    deps = [
        "cnmem_header",
        "//third_party:cuda",
    ]
)

(File diff suppressed because it is too large.)


@@ -1,263 +0,0 @@
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
#pragma once
#ifdef __cplusplus
#include "cstdio"
#else
#include "stdio.h"
#endif
#include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif
#define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus
extern "C" {
#endif
/* ********************************************************************************************* */
typedef enum
{
CNMEM_STATUS_SUCCESS = 0,
CNMEM_STATUS_CUDA_ERROR,
CNMEM_STATUS_INVALID_ARGUMENT,
CNMEM_STATUS_NOT_INITIALIZED,
CNMEM_STATUS_OUT_OF_MEMORY,
CNMEM_STATUS_UNKNOWN_ERROR
} cnmemStatus_t;
/* ********************************************************************************************* */
typedef enum
{
CNMEM_FLAGS_DEFAULT = 0, /// Default flags.
CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption.
CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
} cnmemManagerFlags_t;
/* ********************************************************************************************* */
typedef struct cnmemDevice_t_
{
/** The device number. */
int device;
/** The size to allocate for that device. If 0, the implementation chooses the size. */
size_t size;
/** The number of named streams associated with the device. The NULL stream is not counted. */
int numStreams;
/** The streams associated with the device. It can be NULL. The NULL stream is managed. */
cudaStream_t *streams;
/** The size reserved for each streams. It can be 0. */
size_t *streamSizes;
} cnmemDevice_t;
/**
* \brief Initialize the library and allocate memory on the listed devices.
*
* For each device, an internal memory manager is created and the specified amount of memory is
* allocated (it is the size defined in device[i].size). For each, named stream an additional
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream.
*
* This function must be called before any other function in the library. It has to be called
* by a single thread since it is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Release all the allocated memory.
*
* This function must be called by a single thread and after all threads that called
* cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFinalize();
/**
* \brief Increase the internal reference counter of the context object.
*
* This function increases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRetain();
/**
* \brief Decrease the internal reference counter of the context object.
*
* This function decreases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* You can use \c cnmemRelease to explicitly finalize the library.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRelease();
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/**
* \brief Allocate memory.
*
* This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
*
* The behavior of that function is the following:
*
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless
* CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CNMEM_STATUS_OUT_OF_MEMORY.
*
* - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned.
* Otherwise, the request is passed to the root node and works as if the request were made on
* the NULL stream.
*
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to
* make sure no kernel uses a given buffer before stealing it) and it the execution is
* sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/**
* \brief Release memory.
*
* This function releases memory and recycles a memory block in the manager. This function is
* thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */
/**
* \brief Returns the amount of memory managed by the memory manager associated with a stream.
*
* The pointers totalMem and freeMem must be valid. At the moment, this function has a comple-
* xity linear in the number of allocated blocks so do not call it in performance critical
* sections.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/**
* \brief Print a list of nodes to a file.
*
* This function is intended to be used in case of complex scenarios to help understand the
* behaviour of the memory managers/application. It is thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
* or free_mem == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/**
* \brief Converts a cnmemStatus_t value to a string.
*/
const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */
#ifdef __cplusplus
} // extern "C"
#endif
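
Since the header above documents the whole call sequence (init, malloc/free on a stream, finalize), a minimal usage sketch may help; it is illustrative only, not part of this commit, and uses only the declarations shown above (single device, default flags, NULL stream):

  #include <cstdio>
  #include <cstring>
  #include "cnmem.h"

  int main() {
    // One managed device; size 0 lets cnmem pick the pool size, and leaving
    // numStreams/streams/streamSizes zeroed means only the NULL stream is managed.
    cnmemDevice_t device;
    std::memset(&device, 0, sizeof(device));
    device.device = 0;

    cnmemStatus_t status = cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT);
    if (status != CNMEM_STATUS_SUCCESS) {
      std::printf("cnmemInit failed: %s\n", cnmemGetErrorString(status));
      return 1;
    }

    // Allocate and release 1 MB from the pool on the NULL stream.
    void* ptr = nullptr;
    cnmemMalloc(&ptr, 1 << 20, /*stream=*/NULL);
    cnmemFree(ptr, /*stream=*/NULL);

    cnmemFinalize();
    return 0;
  }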

third_party/nccl (new vendored submodule): 1 addition

@@ -0,0 +1 @@
Subproject commit b3a9e1333d9e2e1b8553b5843ba1ba4f7c79739d