More build updates:

(1) nccl and cnmem submodules.
(2) MPI ops fallback test.
(3) A bit more Blob interface.
(4) Fixed tests.
(5) caffe2.python.io -> caffe2.python.dataio to avoid name conflicts.
(6) In the build system, autogenerate __init__.py instead of having manual
    rules just to copy over an empty __init__.py.
Author: Yangqing Jia
Date:   2016-07-22 23:58:24 -07:00
Parent: b2c2d0b70c
Commit: 1ede7a7ff0

29 changed files with 156 additions and 1629 deletions

.gitmodules (vendored): 6 lines changed

@@ -1,3 +1,9 @@
 [submodule "third_party/pybind11"]
 	path = third_party/pybind11
 	url = https://github.com/pybind/pybind11.git
+[submodule "third_party/nccl"]
+	path = third_party/nccl
+	url = https://github.com/nvidia/nccl.git
+[submodule "third_party/cnmem"]
+	path = third_party/cnmem
+	url = https://github.com/nvidia/cnmem.git
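
Note for existing checkouts: the new submodules have to be fetched explicitly, e.g. with git submodule update --init third_party/nccl third_party/cnmem (or plain git submodule update --init for everything).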


@@ -16,6 +16,6 @@ lint:
 	@find caffe2 -type f -exec python brewtool/cpplint.py {} \;
 
 linecount:
-	@cloc --read-lang-def=brewtool/caffe.cloc caffe2 pycaffe2 || \
+	@cloc --read-lang-def=brewtool/caffe.cloc caffe2 || \
 		echo "Cloc is not available on the machine. You can install cloc with " && \
 		echo "  sudo apt-get install cloc"


@@ -108,6 +108,7 @@ class Config(object):
         'arch=compute_30,code=sm_30',
         'arch=compute_35,code=sm_35',
         'arch=compute_50,code=sm_50',
+        'arch=compute_61,code=sm_61',
     ]
     # additional CUDA cflags to pass to nvcc.
     CUDA_CFLAGS = []
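
The new compute_61/sm_61 entry targets Pascal-class GPUs (e.g. GTX 1070/1080 and the Pascal Titan X) and needs CUDA 8 or newer to compile; presumably each entry here ends up on the nvcc command line as a -gencode flag, i.e. -gencode arch=compute_61,code=sm_61.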


@@ -1,4 +0,0 @@
filegroup(
    name = "caffe_python",
    srcs = ["__init__.py"],
)


@@ -4,11 +4,3 @@ proto_library(
     name = 'caffe_proto',
     srcs = ['caffe.proto'],
 )
-
-filegroup(
-    name = "caffe_proto_py",
-    srcs = ["__init__.py"],
-    deps = [
-        "//caffe:caffe_python",
-    ]
-)


@@ -26,7 +26,7 @@ cc_library(
     deps = [
         ":core",
         ":core_gpu_cu",
-        "//third_party/cnmem:cnmem",
+        "//third_party:cnmem",
         "//third_party:cuda",
     ],
     whole_archive = True,
@@ -48,6 +48,7 @@ cc_test(
         excludes=["*gpu_test*"]),
     deps = [
         ":core",
+        "//caffe2/operators:core_ops",
         "//third_party:gtest",
         "//caffe2/test:caffe2_gtest_main",
     ],
@@ -63,11 +64,6 @@ cc_test(
     ],
 )
 
-filegroup(
-    name = "caffe2_python",
-    srcs = ["__init__.py"],
-)
-
 cc_library(
     name = "all_available_ops",
     srcs = [],
@@ -79,6 +75,7 @@ cc_library(
     optional_deps = [
         "//caffe2/operators:core_ops_gpu",
         "//caffe2/operators:core_ops_cudnn",
+        "//caffe2/contrib/nccl:nccl_ops",
         "//caffe2/cuda_rtc:rtc_ops",
         "//caffe2/db:db_gpu",
         "//caffe2/image:image_ops",


@@ -1,5 +0,0 @@
"""
Caffe2: A General Tool for Neural Networks.
"""
__author__ = 'Yangqing Jia'

caffe2/contrib/nccl/BREW (new file): 10 additions

@@ -0,0 +1,10 @@
cc_library(
    name = "nccl_ops",
    srcs = Glob(["*.cc"]),
    hdrs = Glob(["*.h"]),
    deps = [
        "//caffe2:core_gpu",
        "//third_party:nccl",
    ],
    whole_archive = True,
)


@@ -58,6 +58,9 @@ class Blob {
     return *static_cast<const T*>(pointer_);
   }
 
+  const void* GetRaw() const { return pointer_; }
+  void* GetRaw() { return pointer_; }
+
   /**
    * @brief Gets a mutable pointer to the stored object.
    *
@@ -73,6 +76,7 @@
       return static_cast<T*>(pointer_);
     } else {
       if (is_new_object) *is_new_object = true;
+      VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
       return Reset<T>(new T());
     }
   }
@@ -87,28 +91,53 @@
    */
   template <class T>
   T* Reset(T* allocated) {
-    if (pointer_) {
-      CHECK_NOTNULL(destroy_)(pointer_);
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
     }
-    VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
     meta_ = TypeMeta::Make<T>();
     pointer_ = static_cast<void*>(allocated);
     destroy_ = &Destroy<T>;
     return allocated;
   }
 
+  /**
+   * Sets the underlying object to the allocated one, but does not take over
+   * the ownership of the passed in pointer. If there is already an object in
+   * the Blob, the old object is freed.
+   *
+   * Unlike Reset, this does not take over the ownership of the pointer and the
+   * caller is responsible for making sure that the lifetime of the allocated
+   * blob outlasts the lifetime of any access to this blob, until another Reset
+   * call is made or the blob is destructed.
+   */
+  template <class T>
+  typename std::remove_const<T>::type* ShareExternal(
+      typename std::remove_const<T>::type* allocated) {
+    return static_cast<T*>(
+        ShareExternal(static_cast<void*>(allocated),
+                      TypeMeta::Make<typename std::remove_const<T>::type>()));
+  }
+
+  void* ShareExternal(void* allocated, const TypeMeta& meta) {
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
+    }
+    meta_ = meta;
+    pointer_ = static_cast<void*>(allocated);
+    destroy_ = nullptr;
+    return allocated;
+  }
+
   /**
    * Resets the Blob to an empty one.
    */
   inline void Reset() {
-    if (pointer_) {
-      CHECK_NOTNULL(destroy_)(pointer_);
-      pointer_ = nullptr;
-      meta_ = TypeMeta();
-      destroy_ = nullptr;
+    if (pointer_ && destroy_) {
+      destroy_(pointer_);
     }
+    pointer_ = nullptr;
+    meta_ = TypeMeta();
+    destroy_ = nullptr;
   }
 
   /**
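
Since the comment above spells out the new ownership contract, a minimal sketch of the two modes may help (Foo stands in for any stored type; illustrative only, not part of this diff):

  Blob owned, shared;
  owned.Reset(new Foo());       // Blob takes ownership: destroy_ is set, so a later
                                // Reset() or blob destruction deletes the Foo.
  Foo external;
  shared.ShareExternal<Foo>(&external);
  // Blob only points at external: destroy_ stays nullptr, so Reset() merely drops
  // the pointer and the caller must keep external alive while the blob is used.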


@@ -69,6 +69,32 @@ TEST(BlobTest, BlobWrongType) {
   ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
 }
 
+TEST(BlobTest, BlobReset) {
+  Blob blob;
+  std::unique_ptr<Foo> foo(new Foo());
+  EXPECT_TRUE(blob.Reset(foo.release()) != nullptr);
+  // Also test that Reset works.
+  blob.Reset();
+}
+
+TEST(BlobTest, BlobShareExternalPointer) {
+  Blob blob;
+  std::unique_ptr<Foo> foo(new Foo());
+  EXPECT_EQ(blob.ShareExternal<Foo>(foo.get()), foo.get());
+  EXPECT_TRUE(blob.IsType<Foo>());
+  // Also test that Reset works.
+  blob.Reset();
+}
+
+TEST(BlobTest, BlobShareExternalObject) {
+  Blob blob;
+  Foo foo;
+  EXPECT_EQ(blob.ShareExternal<Foo>(&foo), &foo);
+  EXPECT_TRUE(blob.IsType<Foo>());
+  // Also test that Reset works.
+  blob.Reset();
+}
+
 TEST(BlobTest, StringSerialization) {
   const std::string kTestString = "Hello world?";
   Blob blob;
@@ -558,6 +584,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
       "DUMMY_ENGINE");
   Workspace ws;
   auto load_op = CreateOperator(op_def, &ws);
+  EXPECT_TRUE(load_op != nullptr);
   LOG(INFO) << "Running operator";
   load_op->Run();


@@ -1,3 +1,4 @@
+#include <chrono>
 #include <future>
 #include <random>
 #include <thread>
@@ -55,6 +56,8 @@ namespace {
 void TEST_GetStreamAddress(cudaStream_t* ptr) {
   CUDAContext context(0);
   *ptr = context.cuda_stream();
+  // Sleep for a while so we have concurrent thread executions
+  std::this_thread::sleep_for(std::chrono::seconds(1));
 }
 }  // namespace


@@ -35,9 +35,13 @@ class MPIBroadcastOp final : public Operator<Context> {
   bool RunOnDevice() override {
     MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
+    CAFFE_ENFORCE(OperatorBase::OutputIsType<Tensor<Context>>(0),
+                  "Output is of wrong type.");
     auto* output = Output(0);
     // Make sure that output is already allocated.
-    CHECK_GT(output->size(), 0);
+    CAFFE_ENFORCE(output->size() > 0,
+                  "Broadcast op uses in-place operation so the output "
+                  "should be already allocated.");
     MPI_CHECK(MPI_Bcast(
         output->raw_mutable_data(),
         output->nbytes(),
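
The new enforce documents the existing contract rather than changing it: MPIBroadcast runs in place, so the output blob must already hold an allocated tensor of the right size (presumably because the same blob is also an input filled by an earlier operator) before the broadcast is issued.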


@@ -54,8 +54,18 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
   bool RunOnDevice() override {
     for (int i = 0; i < InputSize(); ++i) {
-      local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
-          Input(i), &context_);
+      if (OperatorBase::InputIsType<TensorCUDA>(i)) {
+        local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
+            Input(i), &context_);
+      } else {
+        VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
+        // Note(jiayq): This removes a const but conceptually
+        // local_input_blobs will only be used as const blob input for the
+        // base op so we are still fine.
+        local_input_blobs_[i]->ShareExternal(
+            const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
+            OperatorBase::Inputs()[i]->meta());
+      }
     }
     // Sync to make sure copies are done.
     context_.FinishDeviceComputation();
@@ -65,6 +75,9 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
       return false;
     }
     for (int i = 0; i < OutputSize(); ++i) {
+      CAFFE_ENFORCE(local_output_blobs_[i]->IsType<TensorCPU>(),
+                    "GPU fallback op currently does not support non-TensorCPU "
+                    "output type.");
       Output(i)->CopyFrom(
           local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
     }
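
For context, a hedged sketch of how the wrapper is meant to be used, assuming it stays templated on the wrapped CPU operator type as in the rest of this header (the operator name and class below are hypothetical, not part of this diff):

  // Expose a hypothetical CPU-only operator on CUDA via the fallback wrapper.
  REGISTER_CUDA_OPERATOR(MyCPUOnly, GPUFallbackOp<MyCPUOnlyOp>);

At run time the wrapper copies TensorCUDA inputs to TensorCPU, shares every other input into the local workspace through Blob::ShareExternal (the new branch above), runs the CPU implementation, and copies the TensorCPU outputs back.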


@@ -4,11 +4,3 @@ proto_library(
     name = 'caffe2_proto',
     srcs = Glob(['*.proto']),
 )
-
-filegroup(
-    name = "caffe2_proto_py",
-    srcs = ["__init__.py"],
-    deps = [
-        "//caffe2:caffe2_python",
-    ]
-)


@@ -39,8 +39,8 @@ py_library(
     srcs=Glob(["*.py"], excludes=["*_test.py"]),
     deps=[
         ":caffe2_python_cpu",
-        "//caffe/proto:caffe_proto_py",
-        "//caffe2/proto:caffe2_proto_py",
+        "//caffe/proto:caffe_proto",
+        "//caffe2/proto:caffe2_proto",
         "//caffe2/python/mint:mint",
     ],
     optional_deps=[


@@ -1,4 +0,0 @@
import atexit
from . import core, utils, workspace
from caffe2.proto import caffe2_pb2


@@ -14,7 +14,7 @@ from __future__ import print_function
 from __future__ import unicode_literals
 
 from caffe2.python import core, workspace
-from caffe2.python.io import Reader, Writer
+from caffe2.python.dataio import Reader, Writer
 from caffe2.python.schema import Struct
 import numpy as np


@@ -1,7 +1,6 @@
 py_library(
     name = "mint",
     srcs = [
-        "__init__.py",
         "app.py",
         "static/css/simple-sidebar.css",
         "templates/index.html",

third_party/BREW (vendored): 47 lines changed

@@ -92,10 +92,49 @@ cc_thirdparty_target(
     ],
 )
 
-cc_thirdparty_target(
-    name="cnmen",
-    deps=["//third_party/cnmem:cnmem"],
-    cc_obj_files = [],
+shell_script(
+    name = "cnmem_header",
+    srcs = ["cnmem/include/cnmem.h"],
+    commands=[
+        "DST=$CAFFE2_GENDIR/third_party/include/",
+        "mkdir -p $DST",
+        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem/include/cnmem.h $DST/",
+    ],
+)
+
+cc_library(
+    name = "cnmem",
+    srcs = [
+        "cnmem/src/cnmem.cpp",
+    ],
+    deps = [
+        ":cnmem_header",
+        ":cuda",
+    ]
+)
+
+shell_script(
+    name = "nccl_header",
+    srcs = ["nccl/src/nccl.h"],
+    commands=[
+        "DST=$CAFFE2_GENDIR/third_party/include/",
+        "mkdir -p $DST",
+        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/nccl/src/nccl.h $DST/",
+    ],
+)
+
+cuda_library(
+    name = "nccl",
+    srcs = Glob(["nccl/src/*.cu"]),
+    deps = [
+        ":nccl_header",
+        ":cuda",
+    ],
+    compiler_flags=[
+        "-Wno-switch",  # NCCL does not follow strict switch enum check.
+        "-DNCCL_MAJOR=1 -DNCCL_MINOR=2 -DNCCL_PATCH=3",
+        "-DCUDA_MAJOR=__CUDACC_VER_MAJOR__ -DCUDA_MINOR=__CUDACC_VER_MINOR__",
+    ],
 )
 
 ###############################################################################

third_party/cnmem (new vendored submodule): 1 addition

@@ -0,0 +1 @@
Subproject commit 28a182d49529da49f4ac4e3941cec3edf16b3540


@@ -1,24 +0,0 @@
# We need to copy over the header to the right folder.
shell_script(
    name = "cnmem_header",
    srcs = ["cnmem.h"],
    commands=[
        "DST=$CAFFE2_GENDIR/third_party/include/",
        "mkdir -p $DST",
        "cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem.h $DST/",
    ],
)

cuda_library(
    name = "cnmem",
    srcs = [
        "cnmem.cpp",
    ],
    hdrs = [
        "cnmem.h",
    ],
    deps = [
        "cnmem_header",
        "//third_party:cuda",
    ]
)

(File diff suppressed because it is too large.)


@@ -1,263 +0,0 @@
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
#pragma once
#ifdef __cplusplus
#include "cstdio"
#else
#include "stdio.h"
#endif
#include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif
#define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus
extern "C" {
#endif
/* ********************************************************************************************* */
typedef enum
{
CNMEM_STATUS_SUCCESS = 0,
CNMEM_STATUS_CUDA_ERROR,
CNMEM_STATUS_INVALID_ARGUMENT,
CNMEM_STATUS_NOT_INITIALIZED,
CNMEM_STATUS_OUT_OF_MEMORY,
CNMEM_STATUS_UNKNOWN_ERROR
} cnmemStatus_t;
/* ********************************************************************************************* */
typedef enum
{
CNMEM_FLAGS_DEFAULT = 0, /// Default flags.
CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption.
CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
} cnmemManagerFlags_t;
/* ********************************************************************************************* */
typedef struct cnmemDevice_t_
{
/** The device number. */
int device;
/** The size to allocate for that device. If 0, the implementation chooses the size. */
size_t size;
/** The number of named streams associated with the device. The NULL stream is not counted. */
int numStreams;
/** The streams associated with the device. It can be NULL. The NULL stream is managed. */
cudaStream_t *streams;
/** The size reserved for each streams. It can be 0. */
size_t *streamSizes;
} cnmemDevice_t;
/**
* \brief Initialize the library and allocate memory on the listed devices.
*
* For each device, an internal memory manager is created and the specified amount of memory is
* allocated (it is the size defined in device[i].size). For each, named stream an additional
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream.
*
* This function must be called before any other function in the library. It has to be called
* by a single thread since it is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Release all the allocated memory.
*
* This function must be called by a single thread and after all threads that called
* cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFinalize();
/**
* \brief Increase the internal reference counter of the context object.
*
* This function increases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRetain();
/**
* \brief Decrease the internal reference counter of the context object.
*
* This function decreases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* You can use \c cnmemRelease to explicitly finalize the library.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRelease();
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/**
* \brief Allocate memory.
*
* This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
*
* The behavior of that function is the following:
*
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless
* CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CNMEM_STATUS_OUT_OF_MEMORY.
*
* - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned.
* Otherwise, the request is passed to the root node and works as if the request were made on
* the NULL stream.
*
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to
* make sure no kernel uses a given buffer before stealing it) and it the execution is
* sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/**
* \brief Release memory.
*
* This function releases memory and recycles a memory block in the manager. This function is
* thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */
/**
* \brief Returns the amount of memory managed by the memory manager associated with a stream.
*
* The pointers totalMem and freeMem must be valid. At the moment, this function has a comple-
* xity linear in the number of allocated blocks so do not call it in performance critical
* sections.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/**
* \brief Print a list of nodes to a file.
*
* This function is intended to be used in case of complex scenarios to help understand the
* behaviour of the memory managers/application. It is thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
* or free_mem == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/**
* \brief Converts a cnmemStatus_t value to a string.
*/
const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */
#ifdef __cplusplus
} // extern "C"
#endif
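
Since the header above documents the whole call sequence (init, malloc/free on a stream, finalize), a minimal usage sketch may help; it is illustrative only, not part of this commit, and uses only the declarations shown above (single device, default flags, NULL stream):

  #include <cstdio>
  #include <cstring>
  #include "cnmem.h"

  int main() {
    // One managed device; size 0 lets cnmem pick the pool size, and leaving
    // numStreams/streams/streamSizes zeroed means only the NULL stream is managed.
    cnmemDevice_t device;
    std::memset(&device, 0, sizeof(device));
    device.device = 0;

    cnmemStatus_t status = cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT);
    if (status != CNMEM_STATUS_SUCCESS) {
      std::printf("cnmemInit failed: %s\n", cnmemGetErrorString(status));
      return 1;
    }

    // Allocate and release 1 MB from the pool on the NULL stream.
    void* ptr = nullptr;
    cnmemMalloc(&ptr, 1 << 20, /*stream=*/NULL);
    cnmemFree(ptr, /*stream=*/NULL);

    cnmemFinalize();
    return 0;
  }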

third_party/nccl (new vendored submodule): 1 addition

@@ -0,0 +1 @@
Subproject commit b3a9e1333d9e2e1b8553b5843ba1ba4f7c79739d