Move c10d to libtorch(_cuda) (#59563)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59563

ghstack-source-id: 131331264

Test Plan: CI

Reviewed By: malfet

Differential Revision: D28932239

fbshipit-source-id: 5df6cdfa5253b15cbbc97039fe672d6d97321e34
Luca Wehrstedt 2021-06-15 02:00:08 -07:00 committed by Facebook GitHub Bot
parent 8d50a4e326
commit a1780432fa
29 changed files with 183 additions and 239 deletions
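In practice, the change means there is no longer a separate `c10d` static library for downstream targets to link: its sources are compiled into `torch_cpu`/`torch_cuda` and its `<c10d/*.hpp>` headers ship with libtorch. A rough before/after sketch for a downstream CMake target, where the `my_tool` name and source file are hypothetical and only the library names come from this diff:

    # Hypothetical consumer target; compare the test and example CMake changes below.
    add_executable(my_tool my_tool.cpp)

    # Before this change: link the standalone static library built in torch/lib/c10d.
    #   target_link_libraries(my_tool PRIVATE c10d)

    # After this change: c10d is part of libtorch, so torch_cpu (plus torch_cuda
    # for the CUDA pieces such as ProcessGroupNCCL) is enough.
    target_link_libraries(my_tool PRIVATE torch_cpu)
    if(USE_CUDA)
      target_link_libraries(my_tool PRIVATE torch_cuda)
    endif()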

View File

@@ -289,6 +289,12 @@ cmake_dependent_option(
 cmake_dependent_option(
   USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF
   "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF)
+cmake_dependent_option(
+  USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
+cmake_dependent_option(
+  USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
+cmake_dependent_option(
+  USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
 cmake_dependent_option(
   USE_TENSORPIPE "Use TensorPipe. Only available if USE_DISTRIBUTED is on." ON
   "USE_DISTRIBUTED" OFF)

View File

@@ -357,8 +357,8 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp"
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h"
     )
-    target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only)
-    add_dependencies(process_group_agent torch c10d)
+    target_link_libraries(process_group_agent PRIVATE torch fmt::fmt-header-only)
+    add_dependencies(process_group_agent torch)
     if(USE_TENSORPIPE)
       add_library(tensorpipe_agent
@@ -370,8 +370,8 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp"
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h"
     )
-    target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only)
-    add_dependencies(tensorpipe_agent torch c10d)
+    target_link_libraries(tensorpipe_agent PRIVATE torch tensorpipe fmt::fmt-header-only)
+    add_dependencies(tensorpipe_agent torch)
     if(USE_CUDA)
       target_compile_definitions(tensorpipe_agent PUBLIC USE_CUDA)
     endif()
@@ -621,8 +621,11 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
       PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
     )
   endif()
-  if(USE_DISTRIBUTED AND NOT WIN32)
-    append_filelist("libtorch_distributed_sources" TORCH_SRCS)
+  if(USE_DISTRIBUTED)
+    append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
+    if(NOT WIN32)
+      append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
+    endif()
   endif()
 endif()
@@ -653,6 +656,17 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     list(APPEND Caffe2_GPU_SRCS
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
   endif()
+  if(USE_DISTRIBUTED)
+    if(BUILD_SPLIT_CUDA)
+      set(_target "Caffe2_GPU_SRCS_CPP")
+    else()
+      set(_target "Caffe2_GPU_SRCS")
+    endif()
+    append_filelist("libtorch_cuda_distributed_base_sources" ${_target})
+    if(NOT WIN32)
+      append_filelist("libtorch_cuda_distributed_extra_sources" ${_target})
+    endif()
+  endif()
   set_source_files_properties(
     ${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
     PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
@@ -670,6 +684,12 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     list(APPEND Caffe2_HIP_SRCS
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
   endif()
+  if(USE_DISTRIBUTED)
+    append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
+    if(NOT WIN32)
+      append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
+    endif()
+  endif()
   # caffe2_nvrtc's stubs to driver APIs are useful for HIP.
   # See NOTE [ ATen NVRTC Stub and HIP ]
   add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
@@ -1047,6 +1067,9 @@ endif()
 install(DIRECTORY "${TORCH_SRC_DIR}/csrc"
         DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch
         FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY "${TORCH_SRC_DIR}/lib/c10d"
+        DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}
+        FILES_MATCHING PATTERN "*.hpp")
 install(FILES
   "${TORCH_SRC_DIR}/script.h"
   "${TORCH_SRC_DIR}/extension.h"
@@ -1210,9 +1233,31 @@ endif()
 # Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
 # jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
 if(USE_DISTRIBUTED)
-  target_compile_definitions(torch_cpu PRIVATE
-    USE_DISTRIBUTED
-  )
+  # Needed to support the inclusion of c10d/Foo.hpp headers.
+  target_include_directories(torch_cpu PUBLIC ${TORCH_SRC_DIR}/lib)
+  target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
+  endif()
+  if(USE_NCCL AND USE_C10D_NCCL)
+    if(USE_ROCM)
+      target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
+    else()
+      if(BUILD_SPLIT_CUDA)
+        target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_NCCL)
+      else()
+        target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
+      endif()
+    endif()
+  endif()
+  if(USE_MPI AND USE_C10D_MPI)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+      set_source_files_properties(
+        "${TORCH_SRC_DIR}/lib/c10d/ProcessGroupMPI.cpp"
+        PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
+    endif()
+    target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
+  endif()
   # Pass USE_RPC in order to reduce use of
   # #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
   # need to be removed when RPC is supported
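Combined with the `install(DIRECTORY ... lib/c10d ...)` rule and the `PUBLIC` include directory and compile definitions above, an out-of-tree consumer no longer needs any c10d-specific setup: the `<c10d/*.hpp>` headers land next to the other libtorch headers and the `USE_C10D_*` macros are visible to dependents of the torch targets. A sketch of such a consumer's build, assuming the usual `find_package(Torch)` flow (the `ddp_hook` project name is made up):

    cmake_minimum_required(VERSION 3.10)
    project(ddp_hook CXX)

    # TorchConfig.cmake provides TORCH_LIBRARIES and the libtorch include
    # directories, which now also contain the installed c10d headers.
    find_package(Torch REQUIRED)

    add_library(ddp_hook SHARED ddp_hook.cpp)
    target_link_libraries(ddp_hook PRIVATE "${TORCH_LIBRARIES}")
    set_property(TARGET ddp_hook PROPERTY CXX_STANDARD 14)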

View File

@@ -5,7 +5,7 @@ set(TORCH_RPC_TEST_SOURCES
   ${TORCH_RPC_TEST_DIR}/test_wire_serialization.cpp
 )
 set(TORCH_RPC_TEST_DEPENDENCY_LIBS
-  torch c10d gtest process_group_agent
+  torch gtest process_group_agent
 )
 if(USE_GLOO)

View File

@@ -313,7 +313,28 @@ core_sources_full = core_sources_full_mobile + [
 libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources)
-libtorch_distributed_sources = [
+# These files are the only ones that are supported on Windows.
+libtorch_distributed_base_sources = [
+    "torch/lib/c10d/comm.cpp",
+    "torch/lib/c10d/default_comm_hooks.cpp",
+    "torch/lib/c10d/FileStore.cpp",
+    "torch/lib/c10d/GlooDeviceFactory.cpp",
+    "torch/lib/c10d/logger.cpp",
+    "torch/lib/c10d/ParamCommsUtils.cpp",
+    "torch/lib/c10d/PrefixStore.cpp",
+    "torch/lib/c10d/ProcessGroup.cpp",
+    "torch/lib/c10d/ProcessGroupGloo.cpp",
+    "torch/lib/c10d/ProcessGroupMPI.cpp",
+    "torch/lib/c10d/ProcessGroupWrapper.cpp",
+    "torch/lib/c10d/reducer.cpp",
+    "torch/lib/c10d/sequence_num.cpp",
+    "torch/lib/c10d/Store.cpp",
+    "torch/lib/c10d/TCPStore.cpp",
+    "torch/lib/c10d/Utils.cpp",
+]
+# These files are only supported on Linux (and others) but not on Windows.
+libtorch_distributed_extra_sources = [
     "torch/csrc/distributed/autograd/autograd.cpp",
     "torch/csrc/distributed/autograd/utils.cpp",
     "torch/csrc/distributed/autograd/context/container.cpp",
@@ -350,8 +371,12 @@ libtorch_distributed_sources = [
     "torch/csrc/distributed/rpc/types.cpp",
     "torch/csrc/distributed/rpc/utils.cpp",
     "torch/csrc/distributed/rpc/metrics/registry.cpp",
+    "torch/lib/c10d/HashStore.cpp",
+    "torch/lib/c10d/ProcessGroupRoundRobin.cpp",
 ]
+libtorch_distributed_sources = libtorch_distributed_base_sources + libtorch_distributed_extra_sources
 jit_sources_full = [
     "torch/csrc/jit/codegen/cuda/interface.cpp",
     "torch/csrc/jit/passes/lower_graph.cpp",
@@ -490,7 +515,20 @@ libtorch_cuda_core_sources = [
     "torch/csrc/jit/runtime/register_cuda_ops.cpp",
 ]
-libtorch_cuda_sources = libtorch_cuda_core_sources + [
+# These files are the only ones that are supported on Windows.
+libtorch_cuda_distributed_base_sources = [
+    "torch/lib/c10d/reducer_cuda.cpp",
+]
+# These files are only supported on Linux (and others) but not on Windows.
+libtorch_cuda_distributed_extra_sources = [
+    "torch/lib/c10d/NCCLUtils.cpp",
+    "torch/lib/c10d/ProcessGroupNCCL.cpp",
+]
+libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
+libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [
     "torch/csrc/cuda/nccl.cpp",
 ]
@@ -665,13 +703,9 @@ libtorch_python_core_sources = [
 ]
 libtorch_python_distributed_core_sources = [
-    "torch/lib/c10d/comm.cpp",
-    "torch/lib/c10d/default_comm_hooks.cpp",
-    "torch/lib/c10d/reducer.cpp",
-    "torch/lib/c10d/reducer_cuda.cpp",
-    "torch/lib/c10d/logger.cpp",
     "torch/csrc/distributed/c10d/python_comm_hook.cpp",
     "torch/csrc/distributed/c10d/init.cpp",
-    "torch/lib/c10d/frontend.cpp",
 ]
 libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [

View File

@@ -278,7 +278,17 @@ if(USE_DISTRIBUTED)
     list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe)
     list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE)
   endif()
-  list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
+  # NCCL is a private dependency of libtorch, but libtorch_python includes
+  # some private headers of libtorch, which in turn include NCCL. As a hacky
+  # alternative to making NCCL a public dependency of libtorch, we make it
+  # a private dependency of libtorch_python as well.
+  if(USE_NCCL)
+    list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
+  endif()
+  # Same for MPI.
+  if(USE_MPI)
+    list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES})
+  endif()
   list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
 endif()
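The comment above is really about CMake usage-requirement propagation: a PRIVATE dependency of `torch_cuda` is not re-exported to targets that link against `torch_cuda`, so `libtorch_python`, which compiles libtorch's private headers that in turn pull in `nccl.h`, has to name NCCL (and MPI) itself. A toy illustration of that rule, with made-up target names:

    # 'core' uses 'nccl_stub' only internally, so neither the library nor its
    # include directories propagate to targets that link 'core'.
    add_library(nccl_stub INTERFACE)
    target_include_directories(nccl_stub INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/nccl)

    add_library(core core.cpp)
    target_link_libraries(core PRIVATE nccl_stub)

    # 'bindings' includes one of core's private headers that needs nccl.h, so it
    # must depend on the stub explicitly rather than inheriting it from 'core'.
    add_library(bindings bindings.cpp)
    target_link_libraries(bindings PRIVATE core nccl_stub)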

View File

@@ -2,7 +2,6 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
-#include <THC/THC.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/util/Optional.h>

View File

@@ -1,166 +1,5 @@
 cmake_minimum_required(VERSION 3.2 FATAL_ERROR)
-# Find modules.
-list(APPEND CMAKE_MODULE_PATH
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/public
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules_CUDA_fix)
-if(USE_CUDA)
-  add_definitions(-DUSE_CUDA=1)
-elseif(USE_ROCM)
-  add_definitions(-DUSE_ROCM=1)
-  add_definitions(-D__HIP_PLATFORM_HCC__=1)
-else()
-  message(STATUS "Building c10d without CUDA/ROCm support")
-endif()
-if(USE_TBB)
-  include_directories(${TBB_ROOT_DIR}/include)
-endif()
-if(USE_GLOO)
-  option(USE_C10D_GLOO "USE C10D GLOO" ON)
-endif()
-if(USE_NCCL)
-  option(USE_C10D_NCCL "USE C10D NCCL" ON)
-endif()
-if(USE_MPI)
-  find_package(MPI)
-  if(MPI_FOUND)
-    message(STATUS "MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}")
-    message(STATUS "MPI_LIBRARIES: ${MPI_LIBRARIES}")
-    message(STATUS "MPIEXEC: ${MPIEXEC}")
-    option(USE_C10D_MPI "USE C10D MPI" ON)
-  else()
-    message(STATUS "Not able to find MPI, will compile c10d without MPI support")
-  endif()
-endif()
-function(copy_header file)
-  configure_file(${file} ${CMAKE_BINARY_DIR}/include/c10d/${file} COPYONLY)
-endfunction()
-set(C10D_SRCS
-  frontend.cpp
-  FileStore.cpp
-  ParamCommsUtils.cpp
-  PrefixStore.cpp
-  ProcessGroup.cpp
-  sequence_num.cpp
-  Store.cpp
-  TCPStore.cpp
-  Utils.cpp
-  )
-if(NOT WIN32)
-  list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp)
-endif()
-set(C10D_LIBS torch)
-if(USE_C10D_NCCL)
-  list(APPEND C10D_SRCS ProcessGroupNCCL.cpp NCCLUtils.cpp)
-  list(APPEND C10D_LIBS __caffe2_nccl)
-endif()
-if(USE_C10D_MPI)
-  list(APPEND C10D_SRCS ProcessGroupMPI.cpp)
-  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    set_source_files_properties(ProcessGroupMPI.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
-  endif()
-  list(APPEND C10D_LIBS ${MPI_LIBRARIES})
-endif()
-if(USE_C10D_GLOO)
-  list(APPEND C10D_SRCS ProcessGroupGloo.cpp GlooDeviceFactory.cpp ProcessGroupWrapper.cpp)
-  list(APPEND C10D_LIBS gloo)
-  if(USE_CUDA)
-    list(APPEND C10D_LIBS gloo_cuda)
-  endif()
-endif()
-add_library(c10d STATIC ${C10D_SRCS})
-set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET c10d PROPERTY CXX_STANDARD 14)
-if(NOT MSVC)
-  target_compile_options(c10d PUBLIC
-    -Wall
-    -Wextra
-    -Wno-unused-parameter
-    -Wno-missing-field-initializers
-    -Wno-write-strings
-    -Wno-unknown-pragmas
-    )
-endif()
-add_dependencies(c10d torch)
-if(USE_C10D_GLOO)
-  add_dependencies(c10d gloo)
-  if(USE_CUDA)
-    add_dependencies(c10d gloo_cuda)
-  endif()
-endif()
-target_include_directories(c10d PUBLIC
-  ${CMAKE_BINARY_DIR}/aten/src # provides "ATen/TypeExtendedInterface.h" to ATen.h
-  ${CMAKE_BINARY_DIR}/caffe2/aten/src # provides <TH/THGeneral.h> to THC.h
-  )
-# For <c10d/...>
-target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..)
-if(USE_C10D_NCCL)
-  target_compile_definitions(c10d PUBLIC USE_C10D_NCCL)
-endif()
-if(USE_C10D_MPI)
-  target_compile_definitions(c10d PUBLIC USE_C10D_MPI)
-endif()
-if(USE_C10D_GLOO)
-  target_compile_definitions(c10d PUBLIC USE_C10D_GLOO)
-endif()
-copy_header(FileStore.hpp)
-copy_header(ParamCommsUtils.hpp)
-copy_header(PrefixStore.hpp)
-copy_header(ProcessGroup.hpp)
-copy_header(Store.hpp)
-copy_header(TCPStore.hpp)
-copy_header(Types.hpp)
-copy_header(Utils.hpp)
-copy_header(sequence_num.hpp)
-if(USE_GLOO)
-  copy_header(ProcessGroupGloo.hpp)
-  copy_header(GlooDeviceFactory.hpp)
-  copy_header(ProcessGroupWrapper.hpp)
-endif()
-if(NOT WIN32)
-  copy_header(HashStore.hpp)
-  copy_header(UnixSockUtils.hpp)
-else()
-  copy_header(WinSockUtils.hpp)
-endif()
-if(USE_C10D_NCCL)
-  copy_header(ProcessGroupNCCL.hpp)
-  copy_header(NCCLUtils.hpp)
-endif()
-if(USE_C10D_MPI)
-  target_include_directories(c10d PUBLIC ${MPI_INCLUDE_PATH})
-  copy_header(ProcessGroupMPI.hpp)
-endif()
-target_link_libraries(c10d PUBLIC ${C10D_LIBS})
-install(TARGETS c10d DESTINATION lib)
 option(BUILD_EXAMPLES "Build examples" OFF)
 if(BUILD_EXAMPLES)
   add_subdirectory(example)
@@ -171,6 +10,3 @@ if(BUILD_TEST)
   enable_testing()
   add_subdirectory(test)
 endif()
-# Install all header files that were prepared in the build directory
-install(DIRECTORY ${CMAKE_BINARY_DIR}/include/ DESTINATION include)

View File

@@ -9,7 +9,7 @@
 namespace c10d {
-class FileStore : public Store {
+class TORCH_API FileStore : public Store {
  public:
   explicit FileStore(const std::string& path, int numWorkers);

View File

@@ -10,7 +10,7 @@
 namespace c10d {
-class GlooDeviceFactory {
+class TORCH_API GlooDeviceFactory {
  public:
   // Create new device instance for specific interface.
   static std::shared_ptr<::gloo::transport::Device> makeDeviceForInterface(

View File

@@ -10,7 +10,7 @@
 namespace c10d {
-class HashStore : public Store {
+class TORCH_API HashStore : public Store {
  public:
   ~HashStore() override {}

View File

@@ -2,14 +2,15 @@
 #include <string>
 #include <vector>
+#include <c10/macros/Macros.h>
 #include <c10/util/ThreadLocalDebugInfo.h>
 #include <ATen/core/ivalue.h>
 namespace torch {
-extern const std::string kParamCommsCallName;
+extern TORCH_API const std::string kParamCommsCallName;
-class ParamCommsDebugInfo
+class TORCH_API ParamCommsDebugInfo
     : public c10::DebugInfoBase {
  public:

View File

@@ -5,7 +5,7 @@
 namespace c10d {
-class PrefixStore : public Store {
+class TORCH_API PrefixStore : public Store {
  public:
   explicit PrefixStore(
       const std::string& prefix,

View File

@@ -8,6 +8,7 @@
 #include <vector>
 #include <ATen/ATen.h>
+#include <c10/macros/Macros.h>
 #include <c10d/Types.hpp>
 #include <c10d/Utils.hpp>
@@ -50,10 +51,10 @@ enum class OpType : std::uint8_t {
 };
 // Converts OpType to human readable string.
-std::string opTypeToString(OpType opType);
+TORCH_API std::string opTypeToString(OpType opType);
 // Whether or not an OP is an p2p op (SEND, RECV, RECVANYSOURCE)
-bool isP2POp(OpType opType);
+TORCH_API bool isP2POp(OpType opType);
 // ProcessGroup is a base class that captures collective and point to
 // point communication in a fixed set of processes.
@@ -75,13 +76,13 @@ bool isP2POp(OpType opType);
 // process group to find each other (referred to as rendezvous from
 // hereon)
 //
-class ProcessGroup : public torch::CustomClassHolder {
+class TORCH_API ProcessGroup : public torch::CustomClassHolder {
  public:
   // Please do not use ProcessGroup::Work API, it is going away, to be
   // replaced by ivalue::Future.
   // Python binding for this class might change, please do not assume
   // this will be bound using pybind.
-  class Work : public torch::CustomClassHolder {
+  class TORCH_API Work : public torch::CustomClassHolder {
    public:
     Work(
         int rank = -1,
@@ -176,7 +177,7 @@ class ProcessGroup : public torch::CustomClassHolder {
   // when constructing a ProcessGroup. Each ProcessGroup subclass should
   // extend this struct and define its options if it wants to provide more
   // config options (beyond basic ones defined here) to end user.
-  struct Options : torch::CustomClassHolder {
+  struct TORCH_API Options : torch::CustomClassHolder {
     explicit Options(
         std::string backend,
         std::chrono::milliseconds timeout = kProcessGroupDefaultTimeout)

View File

@@ -50,7 +50,7 @@ constexpr const char* GLOO_BACKEND_NAME = "gloo";
 // number can be automatically tuned, but only if we let a single
 // process take charge, and have it broadcast the limits.
 //
-class ProcessGroupGloo : public ProcessGroup {
+class TORCH_API ProcessGroupGloo : public ProcessGroup {
  public:
   // AsyncWork is the Gloo specific superclass for asynchronous work items.
   // We can split asynchronous work into 3 phases:
@@ -68,7 +68,7 @@ class ProcessGroupGloo : public ProcessGroup {
   //
   // FIXME: This probably should be called WorkGloo since the work is executed in sync mode
   // by a background thread.
-  class AsyncWork : public ProcessGroup::Work {
+  class TORCH_API AsyncWork : public ProcessGroup::Work {
    public:
     explicit AsyncWork(
         std::vector<std::vector<at::Tensor>> outputTensors,
@@ -97,7 +97,7 @@ class ProcessGroupGloo : public ProcessGroup {
   };
   // Wrap c10d store as Gloo store
-  class GlooStore : public ::gloo::rendezvous::Store {
+  class TORCH_API GlooStore : public ::gloo::rendezvous::Store {
    public:
     GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {}
@@ -140,7 +140,7 @@ class ProcessGroupGloo : public ProcessGroup {
   // recv operation. It keeps a reference to the tensor it is
   // operating on to prevent it from being deallocated while the
   // operation is still in flight.
-  class SendWork : public ProcessGroup::Work {
+  class TORCH_API SendWork : public ProcessGroup::Work {
    public:
     explicit SendWork(
         at::Tensor& tensor,
@@ -155,7 +155,7 @@ class ProcessGroupGloo : public ProcessGroup {
     std::unique_ptr<::gloo::transport::UnboundBuffer> buffer_;
   };
-  class RecvWork : public ProcessGroup::Work {
+  class TORCH_API RecvWork : public ProcessGroup::Work {
    public:
     explicit RecvWork(
         at::Tensor& tensor,
@@ -174,7 +174,7 @@ class ProcessGroupGloo : public ProcessGroup {
     int srcRank_;
   };
-  struct Options : public ProcessGroup::Options {
+  struct TORCH_API Options : public ProcessGroup::Options {
     explicit Options(
         std::chrono::milliseconds timeout = kProcessGroupDefaultTimeout);

View File

@@ -78,7 +78,7 @@ struct WorkEntry {
 //
 // CUDA tensor can be supported if the MPI used is CUDA-aware MPI, and
 // ProcessGroupMPI will automatically detect this support.
-class ProcessGroupMPI : public ProcessGroup {
+class TORCH_API ProcessGroupMPI : public ProcessGroup {
  public:
   class WorkMPI : public ProcessGroup::Work {
    public:

View File

@@ -70,7 +70,7 @@ constexpr const char* NCCL_BACKEND_NAME = "nccl";
 //   work->wait()
 //
 //   // Now continue on other work in the current stream.
-class ProcessGroupNCCL : public ProcessGroup {
+class TORCH_API ProcessGroupNCCL : public ProcessGroup {
  public:
   class WorkNCCL : public ProcessGroup::Work,
                    public std::enable_shared_from_this<WorkNCCL> {

View File

@@ -18,7 +18,7 @@ constexpr const char* ROUND_ROBIN_BACKEND_NAME = "round_robin";
 // across all processes in the process group. This is the only way that we
 // can guarantee to match up the same calls among all processes.
 //
-class ProcessGroupRoundRobin final : public ProcessGroup {
+class TORCH_API ProcessGroupRoundRobin final : public ProcessGroup {
  public:
   explicit ProcessGroupRoundRobin(
       int rank,

View File

@@ -9,7 +9,7 @@
 namespace c10d {
-class ProcessGroupWrapper : public ProcessGroup {
+class TORCH_API ProcessGroupWrapper : public ProcessGroup {
  public:
   explicit ProcessGroupWrapper(
       c10::intrusive_ptr<ProcessGroup> pg,

View File

@@ -6,6 +6,7 @@
 #include <string>
 #include <vector>
+#include <c10/macros/Macros.h>
 #include <torch/custom_class.h>
 namespace c10d {
@@ -15,7 +16,7 @@ namespace c10d {
 using WatchKeyCallback =
     std::function<void(c10::optional<std::string>, c10::optional<std::string>)>;
-class Store : public torch::CustomClassHolder {
+class TORCH_API Store : public torch::CustomClassHolder {
  public:
   static constexpr std::chrono::milliseconds kDefaultTimeout =
       std::chrono::seconds(300);

View File

@@ -36,7 +36,7 @@ struct TCPStoreOptions {
   bool multiTenant = false;
 };
-class TCPStore : public Store {
+class TORCH_API TCPStore : public Store {
  public:
   explicit TCPStore(std::string host, const TCPStoreOptions& opts = {});

View File

@@ -45,12 +45,12 @@ extern const char* kDistDebugDetailLogLevel;
 extern const char* kDistDebugInfoLogLevel;
 extern const char* kDistDebugOffLogLevel;
-std::string parse_env(const char* env_var_name);
+TORCH_API std::string parse_env(const char* env_var_name);
-DistributedDebugLevel parseDistDebugLevel();
+TORCH_API DistributedDebugLevel parseDistDebugLevel();
 // Retrieve tensor shapes from a given tensor.
-std::vector<at::Tensor> getTensorShapes(const std::vector<at::Tensor>& tensors);
+TORCH_API std::vector<at::Tensor> getTensorShapes(const std::vector<at::Tensor>& tensors);
 // Turns at::IntArrayRef into "(1, 2, 3, 4)".
 inline std::string toString(at::IntArrayRef l) {

View File

@@ -5,7 +5,6 @@
 #include <ATen/core/functional.h>
 #include <c10/util/irange.h>
 #include <c10d/reducer.hpp>
-#include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/utils/tensor_flatten.h>
 namespace c10d {

View File

@@ -7,14 +7,14 @@
 namespace c10d {
 // Broadcast many tensors to all processes in the process group.
-void broadcast_coalesced(
+TORCH_API void broadcast_coalesced(
     c10::intrusive_ptr<c10d::ProcessGroup> process_group,
     at::TensorList tensors,
     size_t buffer_size,
     int rank = 0);
 // This class passes bucket contents tensor to DDP communication hook.
-class GradBucket {
+class TORCH_API GradBucket {
  public:
   explicit GradBucket(
       size_t index,

View File

@@ -1,3 +1,6 @@
 add_executable(allreduce allreduce.cpp)
 target_include_directories(allreduce PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..)
-target_link_libraries(allreduce pthread c10d)
+target_link_libraries(allreduce pthread torch_cpu)
+if(USE_CUDA)
+  target_link_libraries(allreduce torch_cuda)
+endif()

View File

@@ -3,7 +3,7 @@
 namespace c10d {
-class Logger {
+class TORCH_API Logger {
  public:
   explicit Logger(std::shared_ptr<c10d::Reducer> reducer);
   // Set logging data that can be got during DistributedDataParallel

View File

@@ -9,6 +9,7 @@
 #include <ATen/core/ivalue_inl.h>
 #include <ATen/ThreadLocalState.h>
+#include <c10/macros/Macros.h>
 #include <c10/util/intrusive_ptr.h>
 #include <c10d/ProcessGroup.hpp>
 #include <c10d/Utils.hpp>
@@ -30,7 +31,7 @@ constexpr int kDDPRuntimeLoggingSampleRate = 100;
 // Forward declaration
 class Logger;
-class Timer {
+class TORCH_API Timer {
  public:
   enum class Event {
     kForwardStart,
@@ -52,7 +53,7 @@ class Timer {
 C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device);
-class Reducer {
+class TORCH_API Reducer {
  public:
   // The constructor takes a list of variables for every model replica.
   // The bucket assignment for this reducer is specified as a list of
@@ -492,7 +493,7 @@ class Reducer {
 // The index of tensors[i] assigned to bucket is tensor_indices[i],
 // when tensor_indices is empty, the index of tensors[i] assigned to
 // bucket is i.
-std::vector<std::vector<size_t>> compute_bucket_assignment_by_size(
+TORCH_API std::vector<std::vector<size_t>> compute_bucket_assignment_by_size(
     const std::vector<at::Tensor>& tensors,
     const std::vector<size_t>& bucket_size,
     const std::vector<bool>& expect_sparse_gradient = {},
@@ -500,7 +501,7 @@ std::vector<std::vector<size_t>> compute_bucket_assignment_by_size(
 // Verify models across all processes are the same as model on rank 0 with
 // respect to no. of params and matching dtype/size/layout.
-void verify_replica0_across_processes(
+TORCH_API void verify_replica0_across_processes(
     c10::intrusive_ptr<c10d::ProcessGroup> process_group,
     std::vector<std::vector<at::Tensor>> model_replicas);
 } // namespace c10d

View File

@@ -1,7 +1,5 @@
 #include <c10d/reducer.hpp>
-#ifdef USE_CUDA
 #include <c10/core/DeviceGuard.h>
 #include <ATen/cuda/CUDAEvent.h>
@@ -85,5 +83,3 @@ C10_REGISTER_TYPED_CLASS(TimerRegistry, c10::kCUDA, CudaTimer);
 } // namespace
 } // namespace c10d
-#endif

View File

@@ -1,6 +1,7 @@
 #pragma once
 #include <vector>
+#include <c10/macros/Macros.h>
 #include <c10/util/Optional.h>
 #include <c10/util/irange.h>
@@ -36,7 +37,7 @@ inline uint64_t fromVec(const std::vector<T>& values) {
   return num;
 }
-class SequenceNum {
+class TORCH_API SequenceNum {
  public:
   SequenceNum();
   explicit SequenceNum(const uint64_t num);

View File

@@ -1,7 +1,7 @@
 if(USE_CUDA)
   cuda_add_library(c10d_cuda_test CUDATest.cu)
-  target_link_libraries(c10d_cuda_test c10d)
-  add_dependencies(c10d_cuda_test c10d)
+  target_link_libraries(c10d_cuda_test torch_cuda)
+  add_dependencies(c10d_cuda_test torch_cuda)
 endif()
 function(c10d_add_test test_src)
@@ -16,29 +16,40 @@ function(c10d_add_test test_src)
   add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
 endfunction()
-c10d_add_test(FileStoreTest.cpp c10d gtest_main)
-c10d_add_test(TCPStoreTest.cpp c10d gtest_main)
+c10d_add_test(FileStoreTest.cpp torch_cpu gtest_main)
+c10d_add_test(TCPStoreTest.cpp torch_cpu gtest_main)
 if(NOT WIN32)
-  c10d_add_test(HashStoreTest.cpp c10d gtest_main)
+  c10d_add_test(HashStoreTest.cpp torch_cpu gtest_main)
 endif()
 if(USE_CUDA)
-  if(USE_C10D_GLOO)
-    c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d_cuda_test gtest_main)
-    c10d_add_test(ProcessGroupGlooAsyncTest.cpp c10d c10d_cuda_test gtest_main)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    c10d_add_test(ProcessGroupGlooTest.cpp torch_cpu c10d_cuda_test gtest_main)
+    c10d_add_test(ProcessGroupGlooAsyncTest.cpp torch_cpu c10d_cuda_test gtest_main)
   endif()
-  if(USE_C10D_NCCL)
-    c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test gtest_main)
-    c10d_add_test(ProcessGroupNCCLErrorsTest.cpp c10d c10d_cuda_test
-      gtest_main)
+  if(USE_NCCL AND USE_C10D_NCCL)
+    # NCCL is a private dependency of libtorch, but the tests include some
+    # private headers of libtorch, which in turn include NCCL. As a hacky
+    # alternative to making NCCL a public dependency of libtorch, we make it
+    # a private dependency of the tests as well.
+    c10d_add_test(
+      ProcessGroupNCCLTest.cpp
+      torch_cpu c10d_cuda_test gtest_main __caffe2_nccl)
+    c10d_add_test(
+      ProcessGroupNCCLErrorsTest.cpp
+      torch_cpu c10d_cuda_test gtest_main __caffe2_nccl)
   endif()
 else()
-  if(USE_C10D_GLOO)
-    c10d_add_test(ProcessGroupGlooTest.cpp c10d gtest_main)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    c10d_add_test(ProcessGroupGlooTest.cpp torch_cpu gtest_main)
  endif()
 endif()
-if(USE_C10D_MPI)
+if(USE_MPI AND USE_C10D_MPI)
   add_definitions(-DMPIEXEC=${MPIEXEC})
-  c10d_add_test(ProcessGroupMPITest.cpp c10d)
+  # MPI is a private dependency of libtorch, but the tests include some
+  # private headers of libtorch, which in turn include MPI. As a hacky
+  # alternative to making MPI a public dependency of libtorch, we make it
+  # a private dependency of the tests as well.
+  c10d_add_test(ProcessGroupMPITest.cpp torch_cpu ${MPI_CXX_LIBRARIES})
 endif()
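For context, the `c10d_add_test` helper used throughout this file takes a test source plus the libraries to link, so the edits above only change which libraries each test links. Only the tail of the helper is visible in the hunk; its shape is roughly the following sketch (inferred, not copied from the file):

    function(c10d_add_test test_src)
      get_filename_component(test_name ${test_src} NAME_WE)
      add_executable(${test_name} ${test_src})
      # Every argument after the source file is treated as a library to link,
      # e.g. torch_cpu, gtest_main, c10d_cuda_test, __caffe2_nccl.
      target_link_libraries(${test_name} ${ARGN})
      add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
    endfunction()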