Move c10d to libtorch(_cuda) (#59563)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59563

ghstack-source-id: 131331264

Test Plan: CI

Reviewed By: malfet

Differential Revision: D28932239

fbshipit-source-id: 5df6cdfa5253b15cbbc97039fe672d6d97321e34
Luca Wehrstedt 2021-06-15 02:00:08 -07:00 committed by Facebook GitHub Bot
parent 8d50a4e326
commit a1780432fa
29 changed files with 183 additions and 239 deletions
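In practice, the change means there is no longer a separate `c10d` static library for downstream targets to link: its sources are compiled into `torch_cpu`/`torch_cuda` and its `<c10d/*.hpp>` headers ship with libtorch. A rough before/after sketch for a downstream CMake target, where the `my_tool` name and source file are hypothetical and only the library names come from this diff:

    # Hypothetical consumer target; compare the test and example CMake changes below.
    add_executable(my_tool my_tool.cpp)

    # Before this change: link the standalone static library built in torch/lib/c10d.
    #   target_link_libraries(my_tool PRIVATE c10d)

    # After this change: c10d is part of libtorch, so torch_cpu (plus torch_cuda
    # for the CUDA pieces such as ProcessGroupNCCL) is enough.
    target_link_libraries(my_tool PRIVATE torch_cpu)
    if(USE_CUDA)
      target_link_libraries(my_tool PRIVATE torch_cuda)
    endif()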

View File

@@ -289,6 +289,12 @@ cmake_dependent_option(
 cmake_dependent_option(
   USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF
   "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF)
+cmake_dependent_option(
+  USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
+cmake_dependent_option(
+  USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
+cmake_dependent_option(
+  USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
 cmake_dependent_option(
   USE_TENSORPIPE "Use TensorPipe. Only available if USE_DISTRIBUTED is on." ON
   "USE_DISTRIBUTED" OFF)

View File

@@ -357,8 +357,8 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp"
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h"
     )
-    target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only)
-    add_dependencies(process_group_agent torch c10d)
+    target_link_libraries(process_group_agent PRIVATE torch fmt::fmt-header-only)
+    add_dependencies(process_group_agent torch)
     if(USE_TENSORPIPE)
       add_library(tensorpipe_agent
@@ -370,8 +370,8 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp"
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h"
     )
-    target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only)
-    add_dependencies(tensorpipe_agent torch c10d)
+    target_link_libraries(tensorpipe_agent PRIVATE torch tensorpipe fmt::fmt-header-only)
+    add_dependencies(tensorpipe_agent torch)
     if(USE_CUDA)
       target_compile_definitions(tensorpipe_agent PUBLIC USE_CUDA)
     endif()
@@ -621,8 +621,11 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
       PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
     )
   endif()
-  if(USE_DISTRIBUTED AND NOT WIN32)
-    append_filelist("libtorch_distributed_sources" TORCH_SRCS)
+  if(USE_DISTRIBUTED)
+    append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
+    if(NOT WIN32)
+      append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
+    endif()
   endif()
 endif()
@@ -653,6 +656,17 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     list(APPEND Caffe2_GPU_SRCS
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
   endif()
+  if(USE_DISTRIBUTED)
+    if(BUILD_SPLIT_CUDA)
+      set(_target "Caffe2_GPU_SRCS_CPP")
+    else()
+      set(_target "Caffe2_GPU_SRCS")
+    endif()
+    append_filelist("libtorch_cuda_distributed_base_sources" ${_target})
+    if(NOT WIN32)
+      append_filelist("libtorch_cuda_distributed_extra_sources" ${_target})
+    endif()
+  endif()
   set_source_files_properties(
     ${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
     PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
@@ -670,6 +684,12 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     list(APPEND Caffe2_HIP_SRCS
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
   endif()
+  if(USE_DISTRIBUTED)
+    append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
+    if(NOT WIN32)
+      append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
+    endif()
+  endif()
   # caffe2_nvrtc's stubs to driver APIs are useful for HIP.
   # See NOTE [ ATen NVRTC Stub and HIP ]
   add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
@@ -1047,6 +1067,9 @@ endif()
 install(DIRECTORY "${TORCH_SRC_DIR}/csrc"
         DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch
         FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY "${TORCH_SRC_DIR}/lib/c10d"
+        DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}
+        FILES_MATCHING PATTERN "*.hpp")
 install(FILES
   "${TORCH_SRC_DIR}/script.h"
   "${TORCH_SRC_DIR}/extension.h"
@@ -1210,9 +1233,31 @@ endif()
 # Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
 # jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
 if(USE_DISTRIBUTED)
-  target_compile_definitions(torch_cpu PRIVATE
-    USE_DISTRIBUTED
-  )
+  # Needed to support the inclusion of c10d/Foo.hpp headers.
+  target_include_directories(torch_cpu PUBLIC ${TORCH_SRC_DIR}/lib)
+  target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
+  endif()
+  if(USE_NCCL AND USE_C10D_NCCL)
+    if(USE_ROCM)
+      target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
+    else()
+      if(BUILD_SPLIT_CUDA)
+        target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_NCCL)
+      else()
+        target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
+      endif()
+    endif()
+  endif()
+  if(USE_MPI AND USE_C10D_MPI)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+      set_source_files_properties(
+        "${TORCH_SRC_DIR}/lib/c10d/ProcessGroupMPI.cpp"
+        PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
+    endif()
+    target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
+  endif()
   # Pass USE_RPC in order to reduce use of
   # #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
   # need to be removed when RPC is supported
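Combined with the `install(DIRECTORY ... lib/c10d ...)` rule and the `PUBLIC` include directory and compile definitions above, an out-of-tree consumer no longer needs any c10d-specific setup: the `<c10d/*.hpp>` headers land next to the other libtorch headers and the `USE_C10D_*` macros are visible to dependents of the torch targets. A sketch of such a consumer's build, assuming the usual `find_package(Torch)` flow (the `ddp_hook` project name is made up):

    cmake_minimum_required(VERSION 3.10)
    project(ddp_hook CXX)

    # TorchConfig.cmake provides TORCH_LIBRARIES and the libtorch include
    # directories, which now also contain the installed c10d headers.
    find_package(Torch REQUIRED)

    add_library(ddp_hook SHARED ddp_hook.cpp)
    target_link_libraries(ddp_hook PRIVATE "${TORCH_LIBRARIES}")
    set_property(TARGET ddp_hook PROPERTY CXX_STANDARD 14)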

View File

@@ -5,7 +5,7 @@ set(TORCH_RPC_TEST_SOURCES
   ${TORCH_RPC_TEST_DIR}/test_wire_serialization.cpp
 )
 set(TORCH_RPC_TEST_DEPENDENCY_LIBS
-  torch c10d gtest process_group_agent
+  torch gtest process_group_agent
 )
 if(USE_GLOO)

View File

@@ -313,7 +313,28 @@ core_sources_full = core_sources_full_mobile + [
 libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources)
-libtorch_distributed_sources = [
+# These files are the only ones that are supported on Windows.
+libtorch_distributed_base_sources = [
+    "torch/lib/c10d/comm.cpp",
+    "torch/lib/c10d/default_comm_hooks.cpp",
+    "torch/lib/c10d/FileStore.cpp",
+    "torch/lib/c10d/GlooDeviceFactory.cpp",
+    "torch/lib/c10d/logger.cpp",
+    "torch/lib/c10d/ParamCommsUtils.cpp",
+    "torch/lib/c10d/PrefixStore.cpp",
+    "torch/lib/c10d/ProcessGroup.cpp",
+    "torch/lib/c10d/ProcessGroupGloo.cpp",
+    "torch/lib/c10d/ProcessGroupMPI.cpp",
+    "torch/lib/c10d/ProcessGroupWrapper.cpp",
+    "torch/lib/c10d/reducer.cpp",
+    "torch/lib/c10d/sequence_num.cpp",
+    "torch/lib/c10d/Store.cpp",
+    "torch/lib/c10d/TCPStore.cpp",
+    "torch/lib/c10d/Utils.cpp",
+]
+# These files are only supported on Linux (and others) but not on Windows.
+libtorch_distributed_extra_sources = [
     "torch/csrc/distributed/autograd/autograd.cpp",
     "torch/csrc/distributed/autograd/utils.cpp",
     "torch/csrc/distributed/autograd/context/container.cpp",
@@ -350,8 +371,12 @@ libtorch_distributed_sources = [
     "torch/csrc/distributed/rpc/types.cpp",
     "torch/csrc/distributed/rpc/utils.cpp",
     "torch/csrc/distributed/rpc/metrics/registry.cpp",
+    "torch/lib/c10d/HashStore.cpp",
+    "torch/lib/c10d/ProcessGroupRoundRobin.cpp",
 ]
+libtorch_distributed_sources = libtorch_distributed_base_sources + libtorch_distributed_extra_sources
 jit_sources_full = [
     "torch/csrc/jit/codegen/cuda/interface.cpp",
     "torch/csrc/jit/passes/lower_graph.cpp",
@@ -490,7 +515,20 @@ libtorch_cuda_core_sources = [
     "torch/csrc/jit/runtime/register_cuda_ops.cpp",
 ]
-libtorch_cuda_sources = libtorch_cuda_core_sources + [
+# These files are the only ones that are supported on Windows.
+libtorch_cuda_distributed_base_sources = [
+    "torch/lib/c10d/reducer_cuda.cpp",
+]
+# These files are only supported on Linux (and others) but not on Windows.
+libtorch_cuda_distributed_extra_sources = [
+    "torch/lib/c10d/NCCLUtils.cpp",
+    "torch/lib/c10d/ProcessGroupNCCL.cpp",
+]
+libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
+libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [
     "torch/csrc/cuda/nccl.cpp",
 ]
@@ -665,13 +703,9 @@ libtorch_python_core_sources = [
 ]
 libtorch_python_distributed_core_sources = [
-    "torch/lib/c10d/comm.cpp",
-    "torch/lib/c10d/default_comm_hooks.cpp",
-    "torch/lib/c10d/reducer.cpp",
-    "torch/lib/c10d/reducer_cuda.cpp",
-    "torch/lib/c10d/logger.cpp",
     "torch/csrc/distributed/c10d/python_comm_hook.cpp",
     "torch/csrc/distributed/c10d/init.cpp",
-    "torch/lib/c10d/frontend.cpp",
 ]
 libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [

View File

@@ -278,7 +278,17 @@ if(USE_DISTRIBUTED)
     list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe)
     list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE)
   endif()
-  list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
+  # NCCL is a private dependency of libtorch, but libtorch_python includes
+  # some private headers of libtorch, which in turn include NCCL. As a hacky
+  # alternative to making NCCL a public dependency of libtorch, we make it
+  # a private dependency of libtorch_python as well.
+  if(USE_NCCL)
+    list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
+  endif()
+  # Same for MPI.
+  if(USE_MPI)
+    list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES})
+  endif()
   list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
 endif()
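The comment above is really about CMake usage-requirement propagation: a PRIVATE dependency of `torch_cuda` is not re-exported to targets that link against `torch_cuda`, so `libtorch_python`, which compiles libtorch's private headers that in turn pull in `nccl.h`, has to name NCCL (and MPI) itself. A toy illustration of that rule, with made-up target names:

    # 'core' uses 'nccl_stub' only internally, so neither the library nor its
    # include directories propagate to targets that link 'core'.
    add_library(nccl_stub INTERFACE)
    target_include_directories(nccl_stub INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/nccl)

    add_library(core core.cpp)
    target_link_libraries(core PRIVATE nccl_stub)

    # 'bindings' includes one of core's private headers that needs nccl.h, so it
    # must depend on the stub explicitly rather than inheriting it from 'core'.
    add_library(bindings bindings.cpp)
    target_link_libraries(bindings PRIVATE core nccl_stub)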

View File

@@ -2,7 +2,6 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
-#include <THC/THC.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/util/Optional.h>

View File

@@ -1,166 +1,5 @@
 cmake_minimum_required(VERSION 3.2 FATAL_ERROR)
-# Find modules.
-list(APPEND CMAKE_MODULE_PATH
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/public
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules_CUDA_fix)
-if(USE_CUDA)
-  add_definitions(-DUSE_CUDA=1)
-elseif(USE_ROCM)
-  add_definitions(-DUSE_ROCM=1)
-  add_definitions(-D__HIP_PLATFORM_HCC__=1)
-else()
-  message(STATUS "Building c10d without CUDA/ROCm support")
-endif()
-if(USE_TBB)
-  include_directories(${TBB_ROOT_DIR}/include)
-endif()
-if(USE_GLOO)
-  option(USE_C10D_GLOO "USE C10D GLOO" ON)
-endif()
-if(USE_NCCL)
-  option(USE_C10D_NCCL "USE C10D NCCL" ON)
-endif()
-if(USE_MPI)
-  find_package(MPI)
-  if(MPI_FOUND)
-    message(STATUS "MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}")
-    message(STATUS "MPI_LIBRARIES: ${MPI_LIBRARIES}")
-    message(STATUS "MPIEXEC: ${MPIEXEC}")
-    option(USE_C10D_MPI "USE C10D MPI" ON)
-  else()
-    message(STATUS "Not able to find MPI, will compile c10d without MPI support")
-  endif()
-endif()
-function(copy_header file)
-  configure_file(${file} ${CMAKE_BINARY_DIR}/include/c10d/${file} COPYONLY)
-endfunction()
-set(C10D_SRCS
-  frontend.cpp
-  FileStore.cpp
-  ParamCommsUtils.cpp
-  PrefixStore.cpp
-  ProcessGroup.cpp
-  sequence_num.cpp
-  Store.cpp
-  TCPStore.cpp
-  Utils.cpp
-  )
-if(NOT WIN32)
-  list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp)
-endif()
-set(C10D_LIBS torch)
-if(USE_C10D_NCCL)
-  list(APPEND C10D_SRCS ProcessGroupNCCL.cpp NCCLUtils.cpp)
-  list(APPEND C10D_LIBS __caffe2_nccl)
-endif()
-if(USE_C10D_MPI)
-  list(APPEND C10D_SRCS ProcessGroupMPI.cpp)
-  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    set_source_files_properties(ProcessGroupMPI.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
-  endif()
-  list(APPEND C10D_LIBS ${MPI_LIBRARIES})
-endif()
-if(USE_C10D_GLOO)
-  list(APPEND C10D_SRCS ProcessGroupGloo.cpp GlooDeviceFactory.cpp ProcessGroupWrapper.cpp)
-  list(APPEND C10D_LIBS gloo)
-  if(USE_CUDA)
-    list(APPEND C10D_LIBS gloo_cuda)
-  endif()
-endif()
-add_library(c10d STATIC ${C10D_SRCS})
-set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET c10d PROPERTY CXX_STANDARD 14)
-if(NOT MSVC)
-  target_compile_options(c10d PUBLIC
-    -Wall
-    -Wextra
-    -Wno-unused-parameter
-    -Wno-missing-field-initializers
-    -Wno-write-strings
-    -Wno-unknown-pragmas
-    )
-endif()
-add_dependencies(c10d torch)
-if(USE_C10D_GLOO)
-  add_dependencies(c10d gloo)
-  if(USE_CUDA)
-    add_dependencies(c10d gloo_cuda)
-  endif()
-endif()
-target_include_directories(c10d PUBLIC
-  ${CMAKE_BINARY_DIR}/aten/src # provides "ATen/TypeExtendedInterface.h" to ATen.h
-  ${CMAKE_BINARY_DIR}/caffe2/aten/src # provides <TH/THGeneral.h> to THC.h
-  )
-# For <c10d/...>
-target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..)
-if(USE_C10D_NCCL)
-  target_compile_definitions(c10d PUBLIC USE_C10D_NCCL)
-endif()
-if(USE_C10D_MPI)
-  target_compile_definitions(c10d PUBLIC USE_C10D_MPI)
-endif()
-if(USE_C10D_GLOO)
-  target_compile_definitions(c10d PUBLIC USE_C10D_GLOO)
-endif()
-copy_header(FileStore.hpp)
-copy_header(ParamCommsUtils.hpp)
-copy_header(PrefixStore.hpp)
-copy_header(ProcessGroup.hpp)
-copy_header(Store.hpp)
-copy_header(TCPStore.hpp)
-copy_header(Types.hpp)
-copy_header(Utils.hpp)
-copy_header(sequence_num.hpp)
-if(USE_GLOO)
-  copy_header(ProcessGroupGloo.hpp)
-  copy_header(GlooDeviceFactory.hpp)
-  copy_header(ProcessGroupWrapper.hpp)
-endif()
-if(NOT WIN32)
-  copy_header(HashStore.hpp)
-  copy_header(UnixSockUtils.hpp)
-else()
-  copy_header(WinSockUtils.hpp)
-endif()
-if(USE_C10D_NCCL)
-  copy_header(ProcessGroupNCCL.hpp)
-  copy_header(NCCLUtils.hpp)
-endif()
-if(USE_C10D_MPI)
-  target_include_directories(c10d PUBLIC ${MPI_INCLUDE_PATH})
-  copy_header(ProcessGroupMPI.hpp)
-endif()
-target_link_libraries(c10d PUBLIC ${C10D_LIBS})
-install(TARGETS c10d DESTINATION lib)
 option(BUILD_EXAMPLES "Build examples" OFF)
 if(BUILD_EXAMPLES)
   add_subdirectory(example)
@@ -171,6 +10,3 @@ if(BUILD_TEST)
   enable_testing()
   add_subdirectory(test)
 endif()
-# Install all header files that were prepared in the build directory
-install(DIRECTORY ${CMAKE_BINARY_DIR}/include/ DESTINATION include)

View File

@@ -9,7 +9,7 @@
 namespace c10d {
-class FileStore : public Store {
+class TORCH_API FileStore : public Store {
  public:
   explicit FileStore(const std::string& path, int numWorkers);

View File

@@ -10,7 +10,7 @@
 namespace c10d {
-class GlooDeviceFactory {
+class TORCH_API GlooDeviceFactory {
  public:
   // Create new device instance for specific interface.
   static std::shared_ptr<::gloo::transport::Device> makeDeviceForInterface(

View File

@@ -10,7 +10,7 @@
 namespace c10d {
-class HashStore : public Store {
+class TORCH_API HashStore : public Store {
  public:
   ~HashStore() override {}

View File

@@ -2,14 +2,15 @@
 #include <string>
 #include <vector>
+#include <c10/macros/Macros.h>
 #include <c10/util/ThreadLocalDebugInfo.h>
 #include <ATen/core/ivalue.h>
 namespace torch {
-extern const std::string kParamCommsCallName;
+extern TORCH_API const std::string kParamCommsCallName;
-class ParamCommsDebugInfo
+class TORCH_API ParamCommsDebugInfo
     : public c10::DebugInfoBase {
  public:

View File

@@ -5,7 +5,7 @@
 namespace c10d {
-class PrefixStore : public Store {
+class TORCH_API PrefixStore : public Store {
  public:
   explicit PrefixStore(
       const std::string& prefix,

View File

@@ -8,6 +8,7 @@
 #include <vector>
 #include <ATen/ATen.h>
+#include <c10/macros/Macros.h>
 #include <c10d/Types.hpp>
 #include <c10d/Utils.hpp>
@@ -50,10 +51,10 @@ enum class OpType : std::uint8_t {
 };
 // Converts OpType to human readable string.
-std::string opTypeToString(OpType opType);
+TORCH_API std::string opTypeToString(OpType opType);
 // Whether or not an OP is an p2p op (SEND, RECV, RECVANYSOURCE)
-bool isP2POp(OpType opType);
+TORCH_API bool isP2POp(OpType opType);
 // ProcessGroup is a base class that captures collective and point to
 // point communication in a fixed set of processes.
@@ -75,13 +76,13 @@ bool isP2POp(OpType opType);
 // process group to find each other (referred to as rendezvous from
 // hereon)
 //
-class ProcessGroup : public torch::CustomClassHolder {
+class TORCH_API ProcessGroup : public torch::CustomClassHolder {
  public:
   // Please do not use ProcessGroup::Work API, it is going away, to be
   // replaced by ivalue::Future.
   // Python binding for this class might change, please do not assume
   // this will be bound using pybind.
-  class Work : public torch::CustomClassHolder {
+  class TORCH_API Work : public torch::CustomClassHolder {
    public:
     Work(
         int rank = -1,
@@ -176,7 +177,7 @@ class ProcessGroup : public torch::CustomClassHolder {
   // when constructing a ProcessGroup. Each ProcessGroup subclass should
   // extend this struct and define its options if it wants to provide more
   // config options (beyond basic ones defined here) to end user.
-  struct Options : torch::CustomClassHolder {
+  struct TORCH_API Options : torch::CustomClassHolder {
     explicit Options(
         std::string backend,
         std::chrono::milliseconds timeout = kProcessGroupDefaultTimeout)

View File

@@ -50,7 +50,7 @@ constexpr const char* GLOO_BACKEND_NAME = "gloo";
 // number can be automatically tuned, but only if we let a single
 // process take charge, and have it broadcast the limits.
 //
-class ProcessGroupGloo : public ProcessGroup {
+class TORCH_API ProcessGroupGloo : public ProcessGroup {
  public:
   // AsyncWork is the Gloo specific superclass for asynchronous work items.
   // We can split asynchronous work into 3 phases:
@@ -68,7 +68,7 @@ class ProcessGroupGloo : public ProcessGroup {
   //
   // FIXME: This probably should be called WorkGloo since the work is executed in sync mode
   // by a background thread.
-  class AsyncWork : public ProcessGroup::Work {
+  class TORCH_API AsyncWork : public ProcessGroup::Work {
    public:
     explicit AsyncWork(
         std::vector<std::vector<at::Tensor>> outputTensors,
@@ -97,7 +97,7 @@ class ProcessGroupGloo : public ProcessGroup {
   };
   // Wrap c10d store as Gloo store
-  class GlooStore : public ::gloo::rendezvous::Store {
+  class TORCH_API GlooStore : public ::gloo::rendezvous::Store {
    public:
     GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {}
@@ -140,7 +140,7 @@ class ProcessGroupGloo : public ProcessGroup {
   // recv operation. It keeps a reference to the tensor it is
   // operating on to prevent it from being deallocated while the
   // operation is still in flight.
-  class SendWork : public ProcessGroup::Work {
+  class TORCH_API SendWork : public ProcessGroup::Work {
    public:
     explicit SendWork(
         at::Tensor& tensor,
@@ -155,7 +155,7 @@ class ProcessGroupGloo : public ProcessGroup {
     std::unique_ptr<::gloo::transport::UnboundBuffer> buffer_;
   };
-  class RecvWork : public ProcessGroup::Work {
+  class TORCH_API RecvWork : public ProcessGroup::Work {
    public:
     explicit RecvWork(
         at::Tensor& tensor,
@@ -174,7 +174,7 @@ class ProcessGroupGloo : public ProcessGroup {
     int srcRank_;
   };
-  struct Options : public ProcessGroup::Options {
+  struct TORCH_API Options : public ProcessGroup::Options {
     explicit Options(
         std::chrono::milliseconds timeout = kProcessGroupDefaultTimeout);

View File

@@ -78,7 +78,7 @@ struct WorkEntry {
 //
 // CUDA tensor can be supported if the MPI used is CUDA-aware MPI, and
 // ProcessGroupMPI will automatically detect this support.
-class ProcessGroupMPI : public ProcessGroup {
+class TORCH_API ProcessGroupMPI : public ProcessGroup {
  public:
   class WorkMPI : public ProcessGroup::Work {
    public:

View File

@@ -70,7 +70,7 @@ constexpr const char* NCCL_BACKEND_NAME = "nccl";
 //   work->wait()
 //
 //   // Now continue on other work in the current stream.
-class ProcessGroupNCCL : public ProcessGroup {
+class TORCH_API ProcessGroupNCCL : public ProcessGroup {
  public:
   class WorkNCCL : public ProcessGroup::Work,
                    public std::enable_shared_from_this<WorkNCCL> {

View File

@@ -18,7 +18,7 @@ constexpr const char* ROUND_ROBIN_BACKEND_NAME = "round_robin";
 // across all processes in the process group. This is the only way that we
 // can guarantee to match up the same calls among all processes.
 //
-class ProcessGroupRoundRobin final : public ProcessGroup {
+class TORCH_API ProcessGroupRoundRobin final : public ProcessGroup {
  public:
   explicit ProcessGroupRoundRobin(
       int rank,

View File

@@ -9,7 +9,7 @@
 namespace c10d {
-class ProcessGroupWrapper : public ProcessGroup {
+class TORCH_API ProcessGroupWrapper : public ProcessGroup {
  public:
   explicit ProcessGroupWrapper(
       c10::intrusive_ptr<ProcessGroup> pg,

View File

@@ -6,6 +6,7 @@
 #include <string>
 #include <vector>
+#include <c10/macros/Macros.h>
 #include <torch/custom_class.h>
 namespace c10d {
@@ -15,7 +16,7 @@ namespace c10d {
 using WatchKeyCallback =
     std::function<void(c10::optional<std::string>, c10::optional<std::string>)>;
-class Store : public torch::CustomClassHolder {
+class TORCH_API Store : public torch::CustomClassHolder {
  public:
   static constexpr std::chrono::milliseconds kDefaultTimeout =
       std::chrono::seconds(300);

View File

@@ -36,7 +36,7 @@ struct TCPStoreOptions {
   bool multiTenant = false;
 };
-class TCPStore : public Store {
+class TORCH_API TCPStore : public Store {
  public:
   explicit TCPStore(std::string host, const TCPStoreOptions& opts = {});

View File

@@ -45,12 +45,12 @@ extern const char* kDistDebugDetailLogLevel;
 extern const char* kDistDebugInfoLogLevel;
 extern const char* kDistDebugOffLogLevel;
-std::string parse_env(const char* env_var_name);
+TORCH_API std::string parse_env(const char* env_var_name);
-DistributedDebugLevel parseDistDebugLevel();
+TORCH_API DistributedDebugLevel parseDistDebugLevel();
 // Retrieve tensor shapes from a given tensor.
-std::vector<at::Tensor> getTensorShapes(const std::vector<at::Tensor>& tensors);
+TORCH_API std::vector<at::Tensor> getTensorShapes(const std::vector<at::Tensor>& tensors);
 // Turns at::IntArrayRef into "(1, 2, 3, 4)".
 inline std::string toString(at::IntArrayRef l) {

View File

@@ -5,7 +5,6 @@
 #include <ATen/core/functional.h>
 #include <c10/util/irange.h>
 #include <c10d/reducer.hpp>
-#include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/utils/tensor_flatten.h>
 namespace c10d {

View File

@@ -7,14 +7,14 @@
 namespace c10d {
 // Broadcast many tensors to all processes in the process group.
-void broadcast_coalesced(
+TORCH_API void broadcast_coalesced(
     c10::intrusive_ptr<c10d::ProcessGroup> process_group,
     at::TensorList tensors,
     size_t buffer_size,
     int rank = 0);
 // This class passes bucket contents tensor to DDP communication hook.
-class GradBucket {
+class TORCH_API GradBucket {
  public:
   explicit GradBucket(
       size_t index,

View File

@@ -1,3 +1,6 @@
 add_executable(allreduce allreduce.cpp)
 target_include_directories(allreduce PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..)
-target_link_libraries(allreduce pthread c10d)
+target_link_libraries(allreduce pthread torch_cpu)
+if(USE_CUDA)
+  target_link_libraries(allreduce torch_cuda)
+endif()

View File

@@ -3,7 +3,7 @@
 namespace c10d {
-class Logger {
+class TORCH_API Logger {
  public:
   explicit Logger(std::shared_ptr<c10d::Reducer> reducer);
   // Set logging data that can be got during DistributedDataParallel

View File

@@ -9,6 +9,7 @@
 #include <ATen/core/ivalue_inl.h>
 #include <ATen/ThreadLocalState.h>
+#include <c10/macros/Macros.h>
 #include <c10/util/intrusive_ptr.h>
 #include <c10d/ProcessGroup.hpp>
 #include <c10d/Utils.hpp>
@@ -30,7 +31,7 @@ constexpr int kDDPRuntimeLoggingSampleRate = 100;
 // Forward declaration
 class Logger;
-class Timer {
+class TORCH_API Timer {
  public:
   enum class Event {
     kForwardStart,
@@ -52,7 +53,7 @@ class Timer {
 C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device);
-class Reducer {
+class TORCH_API Reducer {
  public:
   // The constructor takes a list of variables for every model replica.
   // The bucket assignment for this reducer is specified as a list of
@@ -492,7 +493,7 @@ class Reducer {
 // The index of tensors[i] assigned to bucket is tensor_indices[i],
 // when tensor_indices is empty, the index of tensors[i] assigned to
 // bucket is i.
-std::vector<std::vector<size_t>> compute_bucket_assignment_by_size(
+TORCH_API std::vector<std::vector<size_t>> compute_bucket_assignment_by_size(
     const std::vector<at::Tensor>& tensors,
     const std::vector<size_t>& bucket_size,
     const std::vector<bool>& expect_sparse_gradient = {},
@@ -500,7 +501,7 @@ std::vector<std::vector<size_t>> compute_bucket_assignment_by_size(
 // Verify models across all processes are the same as model on rank 0 with
 // respect to no. of params and matching dtype/size/layout.
-void verify_replica0_across_processes(
+TORCH_API void verify_replica0_across_processes(
     c10::intrusive_ptr<c10d::ProcessGroup> process_group,
     std::vector<std::vector<at::Tensor>> model_replicas);
 } // namespace c10d

View File

@@ -1,7 +1,5 @@
 #include <c10d/reducer.hpp>
-#ifdef USE_CUDA
 #include <c10/core/DeviceGuard.h>
 #include <ATen/cuda/CUDAEvent.h>
@@ -85,5 +83,3 @@ C10_REGISTER_TYPED_CLASS(TimerRegistry, c10::kCUDA, CudaTimer);
 } // namespace
 } // namespace c10d
-#endif

View File

@@ -1,6 +1,7 @@
 #pragma once
 #include <vector>
+#include <c10/macros/Macros.h>
 #include <c10/util/Optional.h>
 #include <c10/util/irange.h>
@@ -36,7 +37,7 @@ inline uint64_t fromVec(const std::vector<T>& values) {
   return num;
 }
-class SequenceNum {
+class TORCH_API SequenceNum {
  public:
   SequenceNum();
   explicit SequenceNum(const uint64_t num);

View File

@@ -1,7 +1,7 @@
 if(USE_CUDA)
   cuda_add_library(c10d_cuda_test CUDATest.cu)
-  target_link_libraries(c10d_cuda_test c10d)
-  add_dependencies(c10d_cuda_test c10d)
+  target_link_libraries(c10d_cuda_test torch_cuda)
+  add_dependencies(c10d_cuda_test torch_cuda)
 endif()
 function(c10d_add_test test_src)
@@ -16,29 +16,40 @@ function(c10d_add_test test_src)
   add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
 endfunction()
-c10d_add_test(FileStoreTest.cpp c10d gtest_main)
-c10d_add_test(TCPStoreTest.cpp c10d gtest_main)
+c10d_add_test(FileStoreTest.cpp torch_cpu gtest_main)
+c10d_add_test(TCPStoreTest.cpp torch_cpu gtest_main)
 if(NOT WIN32)
-  c10d_add_test(HashStoreTest.cpp c10d gtest_main)
+  c10d_add_test(HashStoreTest.cpp torch_cpu gtest_main)
 endif()
 if(USE_CUDA)
-  if(USE_C10D_GLOO)
-    c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d_cuda_test gtest_main)
-    c10d_add_test(ProcessGroupGlooAsyncTest.cpp c10d c10d_cuda_test gtest_main)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    c10d_add_test(ProcessGroupGlooTest.cpp torch_cpu c10d_cuda_test gtest_main)
+    c10d_add_test(ProcessGroupGlooAsyncTest.cpp torch_cpu c10d_cuda_test gtest_main)
   endif()
-  if(USE_C10D_NCCL)
-    c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test gtest_main)
-    c10d_add_test(ProcessGroupNCCLErrorsTest.cpp c10d c10d_cuda_test
-      gtest_main)
+  if(USE_NCCL AND USE_C10D_NCCL)
+    # NCCL is a private dependency of libtorch, but the tests include some
+    # private headers of libtorch, which in turn include NCCL. As a hacky
+    # alternative to making NCCL a public dependency of libtorch, we make it
+    # a private dependency of the tests as well.
+    c10d_add_test(
+      ProcessGroupNCCLTest.cpp
+      torch_cpu c10d_cuda_test gtest_main __caffe2_nccl)
+    c10d_add_test(
+      ProcessGroupNCCLErrorsTest.cpp
+      torch_cpu c10d_cuda_test gtest_main __caffe2_nccl)
   endif()
 else()
-  if(USE_C10D_GLOO)
-    c10d_add_test(ProcessGroupGlooTest.cpp c10d gtest_main)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    c10d_add_test(ProcessGroupGlooTest.cpp torch_cpu gtest_main)
  endif()
 endif()
-if(USE_C10D_MPI)
+if(USE_MPI AND USE_C10D_MPI)
   add_definitions(-DMPIEXEC=${MPIEXEC})
-  c10d_add_test(ProcessGroupMPITest.cpp c10d)
+  # MPI is a private dependency of libtorch, but the tests include some
+  # private headers of libtorch, which in turn include MPI. As a hacky
+  # alternative to making MPI a public dependency of libtorch, we make it
+  # a private dependency of the tests as well.
+  c10d_add_test(ProcessGroupMPITest.cpp torch_cpu ${MPI_CXX_LIBRARIES})
 endif()
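For context, the `c10d_add_test` helper used throughout this file takes a test source plus the libraries to link, so the edits above only change which libraries each test links. Only the tail of the helper is visible in the hunk; its shape is roughly the following sketch (inferred, not copied from the file):

    function(c10d_add_test test_src)
      get_filename_component(test_name ${test_src} NAME_WE)
      add_executable(${test_name} ${test_src})
      # Every argument after the source file is treated as a library to link,
      # e.g. torch_cpu, gtest_main, c10d_cuda_test, __caffe2_nccl.
      target_link_libraries(${test_name} ${ARGN})
      add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
    endfunction()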