more build updates:

(1) nccl submodule, cnmem submodule
(2) mpi ops fallback test
(3) a bit more blob interface
(4) fixed tests
(5) caffe2.python.io -> caffe2.python.dataio to avoid name conflicts
(6) In the build system, autogen __init__.py instead of having manual rules just to copy over an empty __init__.py.
This commit is contained in: parent b2c2d0b70c, commit 1ede7a7ff0
6 .gitmodules (vendored)

@@ -1,3 +1,9 @@
 [submodule "third_party/pybind11"]
     path = third_party/pybind11
     url = https://github.com/pybind/pybind11.git
+[submodule "third_party/nccl"]
+    path = third_party/nccl
+    url = https://github.com/nvidia/nccl.git
+[submodule "third_party/cnmem"]
+    path = third_party/cnmem
+    url = https://github.com/nvidia/cnmem.git
2 Makefile

@@ -16,6 +16,6 @@ lint:
 	@find caffe2 -type f -exec python brewtool/cpplint.py {} \;
 
 linecount:
-	@cloc --read-lang-def=brewtool/caffe.cloc caffe2 pycaffe2 || \
+	@cloc --read-lang-def=brewtool/caffe.cloc caffe2 || \
 		echo "Cloc is not available on the machine. You can install cloc with " && \
 		echo "    sudo apt-get install cloc"
1 build.py

@@ -108,6 +108,7 @@ class Config(object):
         'arch=compute_30,code=sm_30',
         'arch=compute_35,code=sm_35',
         'arch=compute_50,code=sm_50',
+        'arch=compute_61,code=sm_61',
     ]
     # additional CUDA cflags to pass to nvcc.
     CUDA_CFLAGS = []
@@ -1,4 +0,0 @@
-filegroup(
-    name = "caffe_python",
-    srcs = ["__init__.py"],
-)
@@ -4,11 +4,3 @@ proto_library(
     name = 'caffe_proto',
     srcs = ['caffe.proto'],
 )
-
-filegroup(
-    name = "caffe_proto_py",
-    srcs = ["__init__.py"],
-    deps = [
-        "//caffe:caffe_python",
-    ]
-)
@@ -26,7 +26,7 @@ cc_library(
     deps = [
         ":core",
         ":core_gpu_cu",
-        "//third_party/cnmem:cnmem",
+        "//third_party:cnmem",
         "//third_party:cuda",
     ],
     whole_archive = True,

@@ -48,6 +48,7 @@ cc_test(
         excludes=["*gpu_test*"]),
     deps = [
         ":core",
+        "//caffe2/operators:core_ops",
         "//third_party:gtest",
         "//caffe2/test:caffe2_gtest_main",
     ],

@@ -63,11 +64,6 @@ cc_test(
     ],
 )
 
-filegroup(
-    name = "caffe2_python",
-    srcs = ["__init__.py"],
-)
-
 cc_library(
     name = "all_available_ops",
     srcs = [],

@@ -79,6 +75,7 @@ cc_library(
     optional_deps = [
         "//caffe2/operators:core_ops_gpu",
         "//caffe2/operators:core_ops_cudnn",
+        "//caffe2/contrib/nccl:nccl_ops",
         "//caffe2/cuda_rtc:rtc_ops",
         "//caffe2/db:db_gpu",
         "//caffe2/image:image_ops",
@@ -1,5 +0,0 @@
-"""
-Caffe2: A General Tool for Neural Networks.
-"""
-
-__author__ = 'Yangqing Jia'
10 caffe2/contrib/nccl/BREW (new file)

@@ -0,0 +1,10 @@
+cc_library(
+    name = "nccl_ops",
+    srcs = Glob(["*.cc"]),
+    hdrs = Glob(["*.h"]),
+    deps = [
+        "//caffe2:core_gpu",
+        "//third_party:nccl",
+    ],
+    whole_archive = True,
+)
@@ -58,6 +58,9 @@ class Blob {
     return *static_cast<const T*>(pointer_);
   }
 
+  const void* GetRaw() const { return pointer_; }
+  void* GetRaw() { return pointer_; }
+
   /**
    * @brief Gets a mutable pointer to the stored object.
    *

@@ -73,6 +76,7 @@ class Blob {
       return static_cast<T*>(pointer_);
     } else {
       if (is_new_object) *is_new_object = true;
+      VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
       return Reset<T>(new T());
     }
   }

@@ -87,28 +91,53 @@ class Blob {
    */
   template <class T>
   T* Reset(T* allocated) {
-    if (pointer_) {
-      CHECK_NOTNULL(destroy_)(pointer_);
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
     }
-    VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
     meta_ = TypeMeta::Make<T>();
     pointer_ = static_cast<void*>(allocated);
     destroy_ = &Destroy<T>;
     return allocated;
   }
 
+  /**
+   * Sets the underlying object to the allocated one, but does not take over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed.
+   *
+   * Unlike Reset, this does not take over the ownership of the pointer and the
+   * caller is responsible for making sure that the lifetime of the allocated
+   * blob outlasts the lifetime of any access to this blob, until another Reset
+   * call is made or the blob is destructed.
+   */
+  template <class T>
+  typename std::remove_const<T>::type* ShareExternal(
+      typename std::remove_const<T>::type* allocated) {
+    return static_cast<T*>(
+        ShareExternal(static_cast<void*>(allocated),
+                      TypeMeta::Make<typename std::remove_const<T>::type>()));
+  }
+
+  void* ShareExternal(void* allocated, const TypeMeta& meta) {
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
+    }
+    meta_ = meta;
+    pointer_ = static_cast<void*>(allocated);
+    destroy_ = nullptr;
+    return allocated;
+  }
+
   /**
    * Resets the Blob to an empty one.
    */
   inline void Reset() {
-    if (pointer_) {
-      CHECK_NOTNULL(destroy_)(pointer_);
-      pointer_ = nullptr;
-      meta_ = TypeMeta();
-      destroy_ = nullptr;
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
     }
     pointer_ = nullptr;
     meta_ = TypeMeta();
+    destroy_ = nullptr;
   }
 
   /**
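Item (3) of the commit message lands here: Blob gains a raw accessor pair and a non-owning ShareExternal alongside the owning Reset. A minimal sketch of the two ownership modes, assuming the usual caffe2 namespace; the Foo payload type is hypothetical:

#include "caffe2/core/blob.h"

struct Foo { int value = 0; };  // hypothetical payload type

void OwnershipSketch() {
  caffe2::Blob blob;

  // Reset takes ownership: destroy_ is set, so the Foo is freed on the next
  // Reset()/ShareExternal() call or when the blob itself is destructed.
  Foo* owned = blob.Reset(new Foo());
  owned->value = 1;

  // ShareExternal does not take ownership: destroy_ becomes nullptr, so the
  // caller must keep stack_foo alive for as long as the blob may be read.
  Foo stack_foo;
  blob.ShareExternal<Foo>(&stack_foo);

  // With the relaxed `pointer_ && destroy_` check above, Reset() simply
  // forgets a shared pointer instead of attempting to free it.
  blob.Reset();
}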
@@ -69,6 +69,32 @@ TEST(BlobTest, BlobWrongType) {
   ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
 }
 
+TEST(BlobTest, BlobReset) {
+  Blob blob;
+  std::unique_ptr<Foo> foo(new Foo());
+  EXPECT_TRUE(blob.Reset(foo.release()) != nullptr);
+  // Also test that Reset works.
+  blob.Reset();
+}
+
+TEST(BlobTest, BlobShareExternalPointer) {
+  Blob blob;
+  std::unique_ptr<Foo> foo(new Foo());
+  EXPECT_EQ(blob.ShareExternal<Foo>(foo.get()), foo.get());
+  EXPECT_TRUE(blob.IsType<Foo>());
+  // Also test that Reset works.
+  blob.Reset();
+}
+
+TEST(BlobTest, BlobShareExternalObject) {
+  Blob blob;
+  Foo foo;
+  EXPECT_EQ(blob.ShareExternal<Foo>(&foo), &foo);
+  EXPECT_TRUE(blob.IsType<Foo>());
+  // Also test that Reset works.
+  blob.Reset();
+}
+
 TEST(BlobTest, StringSerialization) {
   const std::string kTestString = "Hello world?";
   Blob blob;

@@ -558,6 +584,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
       "DUMMY_ENGINE");
   Workspace ws;
   auto load_op = CreateOperator(op_def, &ws);
+  EXPECT_TRUE(load_op != nullptr);
   LOG(INFO) << "Running operator";
 
   load_op->Run();
@@ -1,3 +1,4 @@
+#include <chrono>
 #include <future>
 #include <random>
 #include <thread>

@@ -55,6 +56,8 @@ namespace {
 void TEST_GetStreamAddress(cudaStream_t* ptr) {
   CUDAContext context(0);
   *ptr = context.cuda_stream();
+  // Sleep for a while so we have concurrent thread executions
+  std::this_thread::sleep_for(std::chrono::seconds(1));
 }
 }  // namespace
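The added sleep is what makes the stream-uniqueness check meaningful: without overlapping lifetimes, one thread can destroy its context before the next one starts, and the reused allocation can make two logically distinct streams compare equal. A standalone sketch of the pattern with plain C++ threads; FakeContext is a hypothetical stand-in for CUDAContext, not caffe2 code:

#include <chrono>
#include <cstdio>
#include <thread>
#include <vector>

// Stand-in for CUDAContext: its "stream" is just its own address, which is
// exactly the kind of handle that gets reused once an instance is destroyed.
struct FakeContext {
  const void* stream() const { return this; }
};

int main() {
  constexpr int kThreads = 4;
  std::vector<const void*> streams(kThreads);
  std::vector<std::thread> workers;
  for (int i = 0; i < kThreads; ++i) {
    workers.emplace_back([i, &streams] {
      FakeContext context;
      streams[i] = context.stream();
      // Keep the context alive so all threads' contexts coexist; dropping
      // this sleep allows address reuse and false "duplicate" streams.
      std::this_thread::sleep_for(std::chrono::seconds(1));
    });
  }
  for (auto& t : workers) t.join();
  for (int i = 0; i < kThreads; ++i)
    for (int j = i + 1; j < kThreads; ++j)
      if (streams[i] == streams[j]) std::printf("duplicate stream address\n");
  return 0;
}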
@@ -35,9 +35,13 @@ class MPIBroadcastOp final : public Operator<Context> {
 
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
+    CAFFE_ENFORCE(OperatorBase::OutputIsType<Tensor<Context>>(0),
+                  "Output is of wrong type.");
     auto* output = Output(0);
     // Make sure that output is already allocated.
-    CHECK_GT(output->size(), 0);
+    CAFFE_ENFORCE(output->size() > 0,
+                  "Broadcast op uses in-place operation so the output "
+                  "should be already allocated.");
     MPI_CHECK(MPI_Bcast(
         output->raw_mutable_data(),
         output->nbytes(),
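The new CAFFE_ENFORCE message states the contract explicitly: the broadcast is in place, so every rank must have the output allocated (and sized) before the op runs. The same contract in plain MPI, as a minimal sketch outside caffe2 (assumes an MPI installation):

#include <mpi.h>
#include <vector>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Pre-allocate on every rank; MPI_Bcast fills the buffer in place on
  // non-root ranks, just as MPIBroadcastOp requires a pre-sized output.
  std::vector<float> data(16, rank == 0 ? 1.0f : 0.0f);
  MPI_Bcast(data.data(), static_cast<int>(data.size()), MPI_FLOAT,
            /*root=*/0, MPI_COMM_WORLD);

  MPI_Finalize();
  return 0;
}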
@@ -54,8 +54,18 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
 
   bool RunOnDevice() override {
     for (int i = 0; i < InputSize(); ++i) {
-      local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
-          Input(i), &context_);
+      if (OperatorBase::InputIsType<TensorCUDA>(i)) {
+        local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
+            Input(i), &context_);
+      } else {
+        VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
+        // Note(jiayq): This removes a const but conceptually
+        // local_input_blobs will only be used as const blob input for the
+        // base op so we are still fine.
+        local_input_blobs_[i]->ShareExternal(
+            const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
+            OperatorBase::Inputs()[i]->meta());
+      }
     }
     // Sync to make sure copies are done.
     context_.FinishDeviceComputation();

@@ -65,6 +75,9 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
       return false;
     }
     for (int i = 0; i < OutputSize(); ++i) {
+      CAFFE_ENFORCE(local_output_blobs_[i]->IsType<TensorCPU>(),
+                    "GPU fallback op currently does not support non-TensorCPU "
+                    "output type.");
       Output(i)->CopyFrom(
           local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
     }
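The non-tensor branch above is the first user of the new Blob::ShareExternal/GetRaw pair from this commit. Isolated, the pattern looks like the sketch below (assuming the caffe2 namespace; the const_cast carries the same caveat as the Note(jiayq) comment, so the mirrored blob must be treated as read-only):

#include "caffe2/core/blob.h"

// Mirror a const source blob into a scratch blob without copying. The
// scratch blob does not own the pointer; mutating through it, or letting it
// outlive the source, would break the const promise being cast away here.
void MirrorBlob(const caffe2::Blob& source, caffe2::Blob* scratch) {
  scratch->ShareExternal(
      const_cast<void*>(source.GetRaw()),  // non-owning raw pointer
      source.meta());                      // propagate the type metadata
}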
@@ -4,11 +4,3 @@ proto_library(
     name = 'caffe2_proto',
     srcs = Glob(['*.proto']),
 )
-
-filegroup(
-    name = "caffe2_proto_py",
-    srcs = ["__init__.py"],
-    deps = [
-        "//caffe2:caffe2_python",
-    ]
-)
@@ -39,8 +39,8 @@ py_library(
     srcs=Glob(["*.py"], excludes=["*_test.py"]),
     deps=[
         ":caffe2_python_cpu",
-        "//caffe/proto:caffe_proto_py",
-        "//caffe2/proto:caffe2_proto_py",
+        "//caffe/proto:caffe_proto",
+        "//caffe2/proto:caffe2_proto",
         "//caffe2/python/mint:mint",
     ],
     optional_deps=[
@@ -1,4 +0,0 @@
-import atexit
-
-from . import core, utils, workspace
-from caffe2.proto import caffe2_pb2
@@ -14,7 +14,7 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 from caffe2.python import core, workspace
-from caffe2.python.io import Reader, Writer
+from caffe2.python.dataio import Reader, Writer
 from caffe2.python.schema import Struct
 import numpy as np
@@ -1,7 +1,6 @@
 py_library(
     name = "mint",
     srcs = [
-        "__init__.py",
         "app.py",
         "static/css/simple-sidebar.css",
         "templates/index.html",
47 third_party/BREW (vendored)

@@ -92,10 +92,49 @@ cc_thirdparty_target(
     ],
 )
 
-cc_thirdparty_target(
-    name="cnmen",
-    deps=["//third_party/cnmem:cnmem"],
-    cc_obj_files = [],
+shell_script(
+    name = "cnmem_header",
+    srcs = ["cnmem/include/cnmem.h"],
+    commands=[
+        "DST=$CAFFE2_GENDIR/third_party/include/",
+        "mkdir -p $DST",
+        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem/include/cnmem.h $DST/",
+    ],
+)
+
+cc_library(
+    name = "cnmem",
+    srcs = [
+        "cnmem/src/cnmem.cpp",
+    ],
+    deps = [
+        ":cnmem_header",
+        ":cuda",
+    ]
+)
+
+shell_script(
+    name = "nccl_header",
+    srcs = ["nccl/src/nccl.h"],
+    commands=[
+        "DST=$CAFFE2_GENDIR/third_party/include/",
+        "mkdir -p $DST",
+        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/nccl/src/nccl.h $DST/",
+    ],
+)
+
+cuda_library(
+    name = "nccl",
+    srcs = Glob(["nccl/src/*.cu"]),
+    deps = [
+        ":nccl_header",
+        ":cuda",
+    ],
+    compiler_flags=[
+        "-Wno-switch",  # NCCL does not follow strict switch enum check.
+        "-DNCCL_MAJOR=1 -DNCCL_MINOR=2 -DNCCL_PATCH=3",
+        "-DCUDA_MAJOR=__CUDACC_VER_MAJOR__ -DCUDA_MINOR=__CUDACC_VER_MINOR__",
+    ],
 )
 
 ###############################################################################
1 third_party/cnmem (vendored, new submodule)

@@ -0,0 +1 @@
+Subproject commit 28a182d49529da49f4ac4e3941cec3edf16b3540
24 third_party/cnmem/BREW (vendored, deleted)

@@ -1,24 +0,0 @@
-# We need to copy over the header to the right folder.
-shell_script(
-    name = "cnmem_header",
-    srcs = ["cnmem.h"],
-    commands=[
-        "DST=$CAFFE2_GENDIR/third_party/include/",
-        "mkdir -p $DST",
-        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem.h $DST/",
-    ],
-)
-
-cuda_library(
-    name = "cnmem",
-    srcs = [
-        "cnmem.cpp",
-    ],
-    hdrs = [
-        "cnmem.h",
-    ],
-    deps = [
-        "cnmem_header",
-        "//third_party:cuda",
-    ]
-)
1287 third_party/cnmem/cnmem.cpp (vendored, deleted)

File diff suppressed because it is too large.
263 third_party/cnmem/cnmem.h (vendored, deleted)

@@ -1,263 +0,0 @@
-/* **********************************************************************
- * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- * ********************************************************************** */
-#pragma once
-
-#ifdef __cplusplus
-#include "cstdio"
-#else
-#include "stdio.h"
-#endif
-#include "cuda_runtime_api.h"
-
-#if defined(_MSC_VER) || defined(WIN32)
-#ifdef CNMEM_DLLEXPORT
-#define CNMEM_API __declspec(dllexport)
-#else
-#define CNMEM_API __declspec(dllimport)
-#endif
-#else
-#ifdef CNMEM_DLLEXPORT
-#define CNMEM_API __attribute__((visibility ("default")))
-#else
-#define CNMEM_API
-#endif
-#endif
-
-#define CNMEM_VERSION 100 // It corresponds to 1.0.0
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* ********************************************************************************************* */
-
-typedef enum
-{
-    CNMEM_STATUS_SUCCESS = 0,
-    CNMEM_STATUS_CUDA_ERROR,
-    CNMEM_STATUS_INVALID_ARGUMENT,
-    CNMEM_STATUS_NOT_INITIALIZED,
-    CNMEM_STATUS_OUT_OF_MEMORY,
-    CNMEM_STATUS_UNKNOWN_ERROR
-} cnmemStatus_t;
-
-/* ********************************************************************************************* */
-
-typedef enum
-{
-    CNMEM_FLAGS_DEFAULT = 0,      /// Default flags.
-    CNMEM_FLAGS_CANNOT_GROW = 1,  /// Prevent the manager from growing its memory consumption.
-    CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
-} cnmemManagerFlags_t;
-
-/* ********************************************************************************************* */
-
-typedef struct cnmemDevice_t_
-{
-    /** The device number. */
-    int device;
-    /** The size to allocate for that device. If 0, the implementation chooses the size. */
-    size_t size;
-    /** The number of named streams associated with the device. The NULL stream is not counted. */
-    int numStreams;
-    /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
-    cudaStream_t *streams;
-    /** The size reserved for each streams. It can be 0. */
-    size_t *streamSizes;
-
-} cnmemDevice_t;
-
-/**
- * \brief Initialize the library and allocate memory on the listed devices.
- *
- * For each device, an internal memory manager is created and the specified amount of memory is
- * allocated (it is the size defined in device[i].size). For each, named stream an additional
- * memory manager is created. Currently, it is implemented as a tree of memory managers: A root
- * manager for the device and a list of children, one for each named stream.
- *
- * This function must be called before any other function in the library. It has to be called
- * by a single thread since it is not thread-safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
- * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
- */
-cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
-
-/**
- * \brief Release all the allocated memory.
- *
- * This function must be called by a single thread and after all threads that called
- * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemFinalize();
-
-/**
- * \brief Increase the internal reference counter of the context object.
- *
- * This function increases the internal reference counter of the library. The purpose of that
- * reference counting mechanism is to give more control to the user over the lifetime of the
- * library. It is useful with scoped memory allocation which may be destroyed in a final
- * memory collection after the end of main(). That function is thread-safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- */
-cnmemStatus_t CNMEM_API cnmemRetain();
-
-/**
- * \brief Decrease the internal reference counter of the context object.
- *
- * This function decreases the internal reference counter of the library. The purpose of that
- * reference counting mechanism is to give more control to the user over the lifetime of the
- * library. It is useful with scoped memory allocation which may be destroyed in a final
- * memory collection after the end of main(). That function is thread-safe.
- *
- * You can use \c cnmemRelease to explicitly finalize the library.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- */
-cnmemStatus_t CNMEM_API cnmemRelease();
-
-/**
- * \brief Add a new stream to the pool of managed streams on a device.
- *
- * This function registers a new stream into a device memory manager. It is thread-safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
- */
-cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
-
-/**
- * \brief Allocate memory.
- *
- * This function allocates memory and initializes a pointer to device memory. If no memory
- * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
- *
- * The behavior of that function is the following:
- *
- * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
- *   memory. If there's a buffer of size larger or equal to the requested size in the list of
- *   free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
- *   its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
- *   cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
- *   allowed to grow, the manager attempts to steal memory from one of its children (unless
- *   CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
- *   CNMEM_STATUS_OUT_OF_MEMORY.
- *
- * - If the stream is a named stream, the initial request goes to the memory manager associated
- *   with that stream. If a free node is available in the lists of that manager, it is returned.
- *   Otherwise, the request is passed to the root node and works as if the request were made on
- *   the NULL stream.
- *
- * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
- * mechanism to steal memory from the children induces GPU synchronizations (the manager has to
- * make sure no kernel uses a given buffer before stealing it) and it the execution is
- * sequential (in a multi-threaded context, the code is executed in a critical section inside
- * the cnmem library - no need for the user to wrap cnmemMalloc with locks).
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
- * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
-
-/**
- * \brief Release memory.
- *
- * This function releases memory and recycles a memory block in the manager. This function is
- * thread safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
-
-/* ********************************************************************************************* */
-/* Utility functions. */
-/* ********************************************************************************************* */
-
-/**
- * \brief Returns the amount of memory managed by the memory manager associated with a stream.
- *
- * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple-
- * xity linear in the number of allocated blocks so do not call it in performance critical
- * sections.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
-
-/**
- * \brief Print a list of nodes to a file.
- *
- * This function is intended to be used in case of complex scenarios to help understand the
- * behaviour of the memory managers/application. It is thread safe.
- *
- * \return
- * CNMEM_STATUS_SUCCESS, if everything goes fine,
- * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
- * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
- *   or free_mem == 0,
- * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
- */
-cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
-
-/**
- * \brief Converts a cnmemStatus_t value to a string.
- */
-const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
-
-/* ********************************************************************************************* */
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
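Although the in-tree copy of the header is deleted here (the cnmem submodule added above supersedes it), its doc comments fully describe the allocator API. A minimal usage sketch based only on the signatures shown above (assumes a CUDA toolchain and the cnmem submodule on the include path):

#include <cstdio>
#include "cnmem.h"

int main() {
  // Describe one device pool; size 0 would let cnmem pick the size, and we
  // register no named streams, so only the NULL stream is managed.
  cnmemDevice_t device;
  device.device = 0;
  device.size = 1 << 20;  // 1 MB pool
  device.numStreams = 0;
  device.streams = nullptr;
  device.streamSizes = nullptr;
  if (cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT) != CNMEM_STATUS_SUCCESS)
    return 1;

  // Allocate and release on the NULL stream.
  void* ptr = nullptr;
  if (cnmemMalloc(&ptr, 1024, /*stream=*/nullptr) == CNMEM_STATUS_SUCCESS)
    cnmemFree(ptr, /*stream=*/nullptr);

  cnmemFinalize();
  return 0;
}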
1 third_party/nccl (vendored, new submodule)

@@ -0,0 +1 @@
+Subproject commit b3a9e1333d9e2e1b8553b5843ba1ba4f7c79739d