Move c10d to libtorch(_cuda) (#59563)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/59563

ghstack-source-id: 131331264

Test Plan: CI

Reviewed By: malfet

Differential Revision: D28932239

fbshipit-source-id: 5df6cdfa5253b15cbbc97039fe672d6d97321e34
This commit is contained in:
parent 8d50a4e326
commit a1780432fa
@@ -289,6 +289,12 @@ cmake_dependent_option(
 cmake_dependent_option(
     USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF
     "USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF)
+cmake_dependent_option(
+    USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
+cmake_dependent_option(
+    USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
+cmake_dependent_option(
+    USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
 cmake_dependent_option(
     USE_TENSORPIPE "Use TensorPipe. Only available if USE_DISTRIBUTED is on." ON
     "USE_DISTRIBUTED" OFF)
@@ -357,8 +357,8 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp"
       "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h"
     )
-    target_link_libraries(process_group_agent PRIVATE torch c10d fmt::fmt-header-only)
-    add_dependencies(process_group_agent torch c10d)
+    target_link_libraries(process_group_agent PRIVATE torch fmt::fmt-header-only)
+    add_dependencies(process_group_agent torch)

     if(USE_TENSORPIPE)
       add_library(tensorpipe_agent
@@ -370,8 +370,8 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
         "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp"
         "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h"
       )
-      target_link_libraries(tensorpipe_agent PRIVATE torch c10d tensorpipe fmt::fmt-header-only)
-      add_dependencies(tensorpipe_agent torch c10d)
+      target_link_libraries(tensorpipe_agent PRIVATE torch tensorpipe fmt::fmt-header-only)
+      add_dependencies(tensorpipe_agent torch)
       if(USE_CUDA)
         target_compile_definitions(tensorpipe_agent PUBLIC USE_CUDA)
       endif()
@@ -621,8 +621,11 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
       PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
     )
   endif()
-  if(USE_DISTRIBUTED AND NOT WIN32)
-    append_filelist("libtorch_distributed_sources" TORCH_SRCS)
+  if(USE_DISTRIBUTED)
+    append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
+    if(NOT WIN32)
+      append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
+    endif()
   endif()
 endif()

@@ -653,6 +656,17 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     list(APPEND Caffe2_GPU_SRCS
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
   endif()
+  if(USE_DISTRIBUTED)
+    if(BUILD_SPLIT_CUDA)
+      set(_target "Caffe2_GPU_SRCS_CPP")
+    else()
+      set(_target "Caffe2_GPU_SRCS")
+    endif()
+    append_filelist("libtorch_cuda_distributed_base_sources" ${_target})
+    if(NOT WIN32)
+      append_filelist("libtorch_cuda_distributed_extra_sources" ${_target})
+    endif()
+  endif()
   set_source_files_properties(
     ${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
     PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
@@ -670,6 +684,12 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
     list(APPEND Caffe2_HIP_SRCS
       ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
   endif()
+  if(USE_DISTRIBUTED)
+    append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
+    if(NOT WIN32)
+      append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
+    endif()
+  endif()
   # caffe2_nvrtc's stubs to driver APIs are useful for HIP.
   # See NOTE [ ATen NVRTC Stub and HIP ]
   add_library(caffe2_nvrtc SHARED ${ATen_NVRTC_STUB_SRCS})
@@ -1047,6 +1067,9 @@ endif()
 install(DIRECTORY "${TORCH_SRC_DIR}/csrc"
         DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}/torch
         FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY "${TORCH_SRC_DIR}/lib/c10d"
+        DESTINATION ${TORCH_INSTALL_INCLUDE_DIR}
+        FILES_MATCHING PATTERN "*.hpp")
 install(FILES
   "${TORCH_SRC_DIR}/script.h"
   "${TORCH_SRC_DIR}/extension.h"
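With this change the c10d headers are installed next to the rest of the libtorch headers and the c10d symbols are compiled into torch_cpu/torch_cuda, so an out-of-tree C++ consumer only needs to link against torch to use them. A minimal sketch of such a consumer (the file name and build setup are hypothetical; the FileStore constructor and the Store set/get calls are the ones declared in the c10d headers touched later in this diff):

// consumer_sketch.cpp -- hypothetical out-of-tree code built against an
// installed libtorch, e.g. target_link_libraries(consumer_sketch PRIVATE torch).
// No separate c10d library has to be linked after this change.
#include <c10d/FileStore.hpp>
#include <c10/util/intrusive_ptr.h>

#include <iostream>
#include <string>
#include <vector>

int main() {
  // FileStore(path, numWorkers), as declared in c10d/FileStore.hpp.
  auto store = c10::make_intrusive<c10d::FileStore>(
      "/tmp/c10d_example_store", /*numWorkers=*/1);

  // A c10d::Store maps string keys to byte blobs.
  std::vector<uint8_t> value = {'4', '2'};
  store->set("answer", value);

  std::vector<uint8_t> read = store->get("answer");
  std::cout << std::string(read.begin(), read.end()) << std::endl;  // prints 42
  return 0;
}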
@@ -1210,9 +1233,31 @@ endif()
 # Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
 # jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
 if(USE_DISTRIBUTED)
-  target_compile_definitions(torch_cpu PRIVATE
-    USE_DISTRIBUTED
-  )
+  # Needed to support the inclusion of c10d/Foo.hpp headers.
+  target_include_directories(torch_cpu PUBLIC ${TORCH_SRC_DIR}/lib)
+  target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
+  endif()
+  if(USE_NCCL AND USE_C10D_NCCL)
+    if(USE_ROCM)
+      target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
+    else()
+      if(BUILD_SPLIT_CUDA)
+        target_compile_definitions(torch_cuda_cpp PUBLIC USE_C10D_NCCL)
+      else()
+        target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
+      endif()
+    endif()
+  endif()
+  if(USE_MPI AND USE_C10D_MPI)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+      set_source_files_properties(
+        "${TORCH_SRC_DIR}/lib/c10d/ProcessGroupMPI.cpp"
+        PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
+    endif()
+    target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
+  endif()
   # Pass USE_RPC in order to reduce use of
   # #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
   # need to be removed when RPC is supported
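The USE_C10D_* macros are made PUBLIC compile definitions so that code compiled against these targets sees the same backend availability as libtorch itself; sources then select backends with ordinary preprocessor guards. An illustrative guard (not a specific file from this commit):

// Only pull in a backend's process group when the build that produced
// libtorch actually enabled it.
#ifdef USE_C10D_GLOO
#include <c10d/ProcessGroupGloo.hpp>
#endif

#ifdef USE_C10D_NCCL
#include <c10d/ProcessGroupNCCL.hpp>
#endif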
@@ -5,7 +5,7 @@ set(TORCH_RPC_TEST_SOURCES
   ${TORCH_RPC_TEST_DIR}/test_wire_serialization.cpp
 )
 set(TORCH_RPC_TEST_DEPENDENCY_LIBS
-  torch c10d gtest process_group_agent
+  torch gtest process_group_agent
 )

 if(USE_GLOO)
@@ -313,7 +313,28 @@ core_sources_full = core_sources_full_mobile + [

 libtorch_core_sources = sorted(core_sources_common + core_sources_full + core_trainer_sources)

-libtorch_distributed_sources = [
+# These files are the only ones that are supported on Windows.
+libtorch_distributed_base_sources = [
+    "torch/lib/c10d/comm.cpp",
+    "torch/lib/c10d/default_comm_hooks.cpp",
+    "torch/lib/c10d/FileStore.cpp",
+    "torch/lib/c10d/GlooDeviceFactory.cpp",
+    "torch/lib/c10d/logger.cpp",
+    "torch/lib/c10d/ParamCommsUtils.cpp",
+    "torch/lib/c10d/PrefixStore.cpp",
+    "torch/lib/c10d/ProcessGroup.cpp",
+    "torch/lib/c10d/ProcessGroupGloo.cpp",
+    "torch/lib/c10d/ProcessGroupMPI.cpp",
+    "torch/lib/c10d/ProcessGroupWrapper.cpp",
+    "torch/lib/c10d/reducer.cpp",
+    "torch/lib/c10d/sequence_num.cpp",
+    "torch/lib/c10d/Store.cpp",
+    "torch/lib/c10d/TCPStore.cpp",
+    "torch/lib/c10d/Utils.cpp",
+]
+
+# These files are only supported on Linux (and others) but not on Windows.
+libtorch_distributed_extra_sources = [
     "torch/csrc/distributed/autograd/autograd.cpp",
     "torch/csrc/distributed/autograd/utils.cpp",
     "torch/csrc/distributed/autograd/context/container.cpp",
@@ -350,8 +371,12 @@ libtorch_distributed_sources = [
     "torch/csrc/distributed/rpc/types.cpp",
     "torch/csrc/distributed/rpc/utils.cpp",
     "torch/csrc/distributed/rpc/metrics/registry.cpp",
+    "torch/lib/c10d/HashStore.cpp",
+    "torch/lib/c10d/ProcessGroupRoundRobin.cpp",
 ]

+libtorch_distributed_sources = libtorch_distributed_base_sources + libtorch_distributed_extra_sources
+
 jit_sources_full = [
     "torch/csrc/jit/codegen/cuda/interface.cpp",
     "torch/csrc/jit/passes/lower_graph.cpp",
@@ -490,7 +515,20 @@ libtorch_cuda_core_sources = [
     "torch/csrc/jit/runtime/register_cuda_ops.cpp",
 ]

-libtorch_cuda_sources = libtorch_cuda_core_sources + [
+# These files are the only ones that are supported on Windows.
+libtorch_cuda_distributed_base_sources = [
+    "torch/lib/c10d/reducer_cuda.cpp",
+]
+
+# These files are only supported on Linux (and others) but not on Windows.
+libtorch_cuda_distributed_extra_sources = [
+    "torch/lib/c10d/NCCLUtils.cpp",
+    "torch/lib/c10d/ProcessGroupNCCL.cpp",
+]
+
+libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources
+
+libtorch_cuda_sources = libtorch_cuda_core_sources + libtorch_cuda_distributed_sources + [
     "torch/csrc/cuda/nccl.cpp",
 ]

@@ -665,13 +703,9 @@ libtorch_python_core_sources = [
 ]

 libtorch_python_distributed_core_sources = [
-    "torch/lib/c10d/comm.cpp",
-    "torch/lib/c10d/default_comm_hooks.cpp",
-    "torch/lib/c10d/reducer.cpp",
-    "torch/lib/c10d/reducer_cuda.cpp",
-    "torch/lib/c10d/logger.cpp",
     "torch/csrc/distributed/c10d/python_comm_hook.cpp",
     "torch/csrc/distributed/c10d/init.cpp",
+    "torch/lib/c10d/frontend.cpp",
 ]

 libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [
@@ -278,7 +278,17 @@ if(USE_DISTRIBUTED)
     list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe)
     list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE)
   endif()
-  list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d)
+  # NCCL is a private dependency of libtorch, but libtorch_python includes
+  # some private headers of libtorch, which in turn include NCCL. As a hacky
+  # alternative to making NCCL a public dependency of libtorch, we make it
+  # a private dependency of libtorch_python as well.
+  if(USE_NCCL)
+    list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
+  endif()
+  # Same for MPI.
+  if(USE_MPI)
+    list(APPEND TORCH_PYTHON_LINK_LIBRARIES ${MPI_CXX_LIBRARIES})
+  endif()
   list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
 endif()

@@ -2,7 +2,6 @@

 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
-#include <THC/THC.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/util/Optional.h>

@@ -1,166 +1,5 @@
 cmake_minimum_required(VERSION 3.2 FATAL_ERROR)
-
-# Find modules.
-list(APPEND CMAKE_MODULE_PATH
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/public
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules_CUDA_fix)
-
-if(USE_CUDA)
-  add_definitions(-DUSE_CUDA=1)
-elseif(USE_ROCM)
-  add_definitions(-DUSE_ROCM=1)
-  add_definitions(-D__HIP_PLATFORM_HCC__=1)
-else()
-  message(STATUS "Building c10d without CUDA/ROCm support")
-endif()
-
-if(USE_TBB)
-  include_directories(${TBB_ROOT_DIR}/include)
-endif()
-
-if(USE_GLOO)
-  option(USE_C10D_GLOO "USE C10D GLOO" ON)
-endif()
-
-if(USE_NCCL)
-  option(USE_C10D_NCCL "USE C10D NCCL" ON)
-endif()
-
-if(USE_MPI)
-  find_package(MPI)
-  if(MPI_FOUND)
-    message(STATUS "MPI_INCLUDE_PATH: ${MPI_INCLUDE_PATH}")
-    message(STATUS "MPI_LIBRARIES: ${MPI_LIBRARIES}")
-    message(STATUS "MPIEXEC: ${MPIEXEC}")
-    option(USE_C10D_MPI "USE C10D MPI" ON)
-  else()
-    message(STATUS "Not able to find MPI, will compile c10d without MPI support")
-  endif()
-endif()
-
-function(copy_header file)
-  configure_file(${file} ${CMAKE_BINARY_DIR}/include/c10d/${file} COPYONLY)
-endfunction()
-
-set(C10D_SRCS
-  frontend.cpp
-  FileStore.cpp
-  ParamCommsUtils.cpp
-  PrefixStore.cpp
-  ProcessGroup.cpp
-  sequence_num.cpp
-  Store.cpp
-  TCPStore.cpp
-  Utils.cpp
-  )
-
-if(NOT WIN32)
-  list(APPEND C10D_SRCS HashStore.cpp ProcessGroupRoundRobin.cpp)
-endif()
-
-set(C10D_LIBS torch)
-
-if(USE_C10D_NCCL)
-  list(APPEND C10D_SRCS ProcessGroupNCCL.cpp NCCLUtils.cpp)
-  list(APPEND C10D_LIBS __caffe2_nccl)
-endif()
-
-if(USE_C10D_MPI)
-  list(APPEND C10D_SRCS ProcessGroupMPI.cpp)
-  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    set_source_files_properties(ProcessGroupMPI.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
-  endif()
-  list(APPEND C10D_LIBS ${MPI_LIBRARIES})
-endif()
-
-if(USE_C10D_GLOO)
-  list(APPEND C10D_SRCS ProcessGroupGloo.cpp GlooDeviceFactory.cpp ProcessGroupWrapper.cpp)
-  list(APPEND C10D_LIBS gloo)
-  if(USE_CUDA)
-    list(APPEND C10D_LIBS gloo_cuda)
-  endif()
-endif()
-
-add_library(c10d STATIC ${C10D_SRCS})
-set_property(TARGET c10d PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET c10d PROPERTY CXX_STANDARD 14)
-
-if(NOT MSVC)
-  target_compile_options(c10d PUBLIC
-    -Wall
-    -Wextra
-    -Wno-unused-parameter
-    -Wno-missing-field-initializers
-    -Wno-write-strings
-    -Wno-unknown-pragmas
-    )
-endif()
-
-add_dependencies(c10d torch)
-
-if(USE_C10D_GLOO)
-  add_dependencies(c10d gloo)
-  if(USE_CUDA)
-    add_dependencies(c10d gloo_cuda)
-  endif()
-endif()
-
-target_include_directories(c10d PUBLIC
-  ${CMAKE_BINARY_DIR}/aten/src # provides "ATen/TypeExtendedInterface.h" to ATen.h
-  ${CMAKE_BINARY_DIR}/caffe2/aten/src # provides <TH/THGeneral.h> to THC.h
-  )
-
-# For <c10d/...>
-target_include_directories(c10d PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..)
-
-if(USE_C10D_NCCL)
-  target_compile_definitions(c10d PUBLIC USE_C10D_NCCL)
-endif()
-
-if(USE_C10D_MPI)
-  target_compile_definitions(c10d PUBLIC USE_C10D_MPI)
-endif()
-
-if(USE_C10D_GLOO)
-  target_compile_definitions(c10d PUBLIC USE_C10D_GLOO)
-endif()
-
-copy_header(FileStore.hpp)
-copy_header(ParamCommsUtils.hpp)
-copy_header(PrefixStore.hpp)
-copy_header(ProcessGroup.hpp)
-copy_header(Store.hpp)
-copy_header(TCPStore.hpp)
-copy_header(Types.hpp)
-copy_header(Utils.hpp)
-copy_header(sequence_num.hpp)
-if(USE_GLOO)
-  copy_header(ProcessGroupGloo.hpp)
-  copy_header(GlooDeviceFactory.hpp)
-  copy_header(ProcessGroupWrapper.hpp)
-endif()
-if(NOT WIN32)
-  copy_header(HashStore.hpp)
-  copy_header(UnixSockUtils.hpp)
-else()
-  copy_header(WinSockUtils.hpp)
-endif()
-
-if(USE_C10D_NCCL)
-  copy_header(ProcessGroupNCCL.hpp)
-  copy_header(NCCLUtils.hpp)
-endif()
-
-if(USE_C10D_MPI)
-  target_include_directories(c10d PUBLIC ${MPI_INCLUDE_PATH})
-  copy_header(ProcessGroupMPI.hpp)
-endif()
-
-target_link_libraries(c10d PUBLIC ${C10D_LIBS})
-
-install(TARGETS c10d DESTINATION lib)

 option(BUILD_EXAMPLES "Build examples" OFF)
 if(BUILD_EXAMPLES)
   add_subdirectory(example)
@@ -171,6 +10,3 @@ if(BUILD_TEST)
   enable_testing()
   add_subdirectory(test)
 endif()
-
-# Install all header files that were prepared in the build directory
-install(DIRECTORY ${CMAKE_BINARY_DIR}/include/ DESTINATION include)
@@ -9,7 +9,7 @@

 namespace c10d {

-class FileStore : public Store {
+class TORCH_API FileStore : public Store {
  public:
   explicit FileStore(const std::string& path, int numWorkers);

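The TORCH_API annotations added throughout these headers are what make the move into libtorch work: the classes previously lived in a standalone static library, but now they must be exported from the torch shared libraries (and imported by consumers on Windows). Roughly, an export macro of this kind behaves like the sketch below; this is an illustration only, the real logic lives in c10/macros/Export.h and covers more cases (clang, mobile builds, per-library macros):

// Simplified model of an export macro such as TORCH_API (illustrative only).
#if defined(_WIN32)
  #if defined(BUILDING_THE_TORCH_LIBRARY)   // hypothetical build-side flag
    #define EXAMPLE_TORCH_API __declspec(dllexport)
  #else                                     // consumer side
    #define EXAMPLE_TORCH_API __declspec(dllimport)
  #endif
#else
  // With -fvisibility=hidden, explicitly mark the symbol as visible.
  #define EXAMPLE_TORCH_API __attribute__((visibility("default")))
#endif

// Without such an annotation, a class compiled into torch_cpu with hidden
// visibility could not be linked against (or its typeinfo used) from outside.
class EXAMPLE_TORCH_API ExampleStore {
 public:
  virtual ~ExampleStore() = default;
};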
@@ -10,7 +10,7 @@

 namespace c10d {

-class GlooDeviceFactory {
+class TORCH_API GlooDeviceFactory {
  public:
   // Create new device instance for specific interface.
   static std::shared_ptr<::gloo::transport::Device> makeDeviceForInterface(
@@ -10,7 +10,7 @@

 namespace c10d {

-class HashStore : public Store {
+class TORCH_API HashStore : public Store {
  public:
   ~HashStore() override {}

@@ -2,14 +2,15 @@

 #include <string>
 #include <vector>
+#include <c10/macros/Macros.h>
 #include <c10/util/ThreadLocalDebugInfo.h>
 #include <ATen/core/ivalue.h>

 namespace torch {

-extern const std::string kParamCommsCallName;
+extern TORCH_API const std::string kParamCommsCallName;

-class ParamCommsDebugInfo
+class TORCH_API ParamCommsDebugInfo
     : public c10::DebugInfoBase {

  public:
@@ -5,7 +5,7 @@

 namespace c10d {

-class PrefixStore : public Store {
+class TORCH_API PrefixStore : public Store {
  public:
   explicit PrefixStore(
       const std::string& prefix,
@@ -8,6 +8,7 @@
 #include <vector>

 #include <ATen/ATen.h>
+#include <c10/macros/Macros.h>

 #include <c10d/Types.hpp>
 #include <c10d/Utils.hpp>
@@ -50,10 +51,10 @@ enum class OpType : std::uint8_t {
 };

 // Converts OpType to human readable string.
-std::string opTypeToString(OpType opType);
+TORCH_API std::string opTypeToString(OpType opType);

 // Whether or not an OP is an p2p op (SEND, RECV, RECVANYSOURCE)
-bool isP2POp(OpType opType);
+TORCH_API bool isP2POp(OpType opType);

 // ProcessGroup is a base class that captures collective and point to
 // point communication in a fixed set of processes.
@@ -75,13 +76,13 @@ bool isP2POp(OpType opType);
 // process group to find each other (referred to as rendezvous from
 // hereon)
 //
-class ProcessGroup : public torch::CustomClassHolder {
+class TORCH_API ProcessGroup : public torch::CustomClassHolder {
  public:
   // Please do not use ProcessGroup::Work API, it is going away, to be
   // replaced by ivalue::Future.
   // Python binding for this class might change, please do not assume
   // this will be bound using pybind.
-  class Work : public torch::CustomClassHolder {
+  class TORCH_API Work : public torch::CustomClassHolder {
    public:
     Work(
         int rank = -1,
@@ -176,7 +177,7 @@ class ProcessGroup : public torch::CustomClassHolder {
   // when constructing a ProcessGroup. Each ProcessGroup subclass should
   // extend this struct and define its options if it wants to provide more
   // config options (beyond basic ones defined here) to end user.
-  struct Options : torch::CustomClassHolder {
+  struct TORCH_API Options : torch::CustomClassHolder {
     explicit Options(
         std::string backend,
         std::chrono::milliseconds timeout = kProcessGroupDefaultTimeout)
@@ -50,7 +50,7 @@ constexpr const char* GLOO_BACKEND_NAME = "gloo";
 // number can be automatically tuned, but only if we let a single
 // process take charge, and have it broadcast the limits.
 //
-class ProcessGroupGloo : public ProcessGroup {
+class TORCH_API ProcessGroupGloo : public ProcessGroup {
  public:
   // AsyncWork is the Gloo specific superclass for asynchronous work items.
   // We can split asynchronous work into 3 phases:
@@ -68,7 +68,7 @@ class ProcessGroupGloo : public ProcessGroup {
   //
   // FIXME: This probably should be called WorkGloo since the work is executed in sync mode
   // by a background thread.
-  class AsyncWork : public ProcessGroup::Work {
+  class TORCH_API AsyncWork : public ProcessGroup::Work {
    public:
     explicit AsyncWork(
         std::vector<std::vector<at::Tensor>> outputTensors,
@@ -97,7 +97,7 @@ class ProcessGroupGloo : public ProcessGroup {
   };

   // Wrap c10d store as Gloo store
-  class GlooStore : public ::gloo::rendezvous::Store {
+  class TORCH_API GlooStore : public ::gloo::rendezvous::Store {
    public:
     GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {}

@@ -140,7 +140,7 @@ class ProcessGroupGloo : public ProcessGroup {
   // recv operation. It keeps a reference to the tensor it is
   // operating on to prevent it from being deallocated while the
   // operation is still in flight.
-  class SendWork : public ProcessGroup::Work {
+  class TORCH_API SendWork : public ProcessGroup::Work {
    public:
     explicit SendWork(
         at::Tensor& tensor,
@@ -155,7 +155,7 @@ class ProcessGroupGloo : public ProcessGroup {
     std::unique_ptr<::gloo::transport::UnboundBuffer> buffer_;
   };

-  class RecvWork : public ProcessGroup::Work {
+  class TORCH_API RecvWork : public ProcessGroup::Work {
    public:
     explicit RecvWork(
         at::Tensor& tensor,
@@ -174,7 +174,7 @@ class ProcessGroupGloo : public ProcessGroup {
     int srcRank_;
   };

-  struct Options : public ProcessGroup::Options {
+  struct TORCH_API Options : public ProcessGroup::Options {
     explicit Options(
         std::chrono::milliseconds timeout = kProcessGroupDefaultTimeout);

@@ -78,7 +78,7 @@ struct WorkEntry {
 //
 // CUDA tensor can be supported if the MPI used is CUDA-aware MPI, and
 // ProcessGroupMPI will automatically detect this support.
-class ProcessGroupMPI : public ProcessGroup {
+class TORCH_API ProcessGroupMPI : public ProcessGroup {
  public:
   class WorkMPI : public ProcessGroup::Work {
    public:
@@ -70,7 +70,7 @@ constexpr const char* NCCL_BACKEND_NAME = "nccl";
 // work->wait()
 //
 // // Now continue on other work in the current stream.
-class ProcessGroupNCCL : public ProcessGroup {
+class TORCH_API ProcessGroupNCCL : public ProcessGroup {
  public:
   class WorkNCCL : public ProcessGroup::Work,
                    public std::enable_shared_from_this<WorkNCCL> {
@@ -18,7 +18,7 @@ constexpr const char* ROUND_ROBIN_BACKEND_NAME = "round_robin";
 // across all processes in the process group. This is the only way that we
 // can guarantee to match up the same calls among all processes.
 //
-class ProcessGroupRoundRobin final : public ProcessGroup {
+class TORCH_API ProcessGroupRoundRobin final : public ProcessGroup {
  public:
   explicit ProcessGroupRoundRobin(
       int rank,
@@ -9,7 +9,7 @@

 namespace c10d {

-class ProcessGroupWrapper : public ProcessGroup {
+class TORCH_API ProcessGroupWrapper : public ProcessGroup {
  public:
   explicit ProcessGroupWrapper(
       c10::intrusive_ptr<ProcessGroup> pg,
@@ -6,6 +6,7 @@
 #include <string>
 #include <vector>

+#include <c10/macros/Macros.h>
 #include <torch/custom_class.h>

 namespace c10d {
@@ -15,7 +16,7 @@ namespace c10d {
 using WatchKeyCallback =
     std::function<void(c10::optional<std::string>, c10::optional<std::string>)>;

-class Store : public torch::CustomClassHolder {
+class TORCH_API Store : public torch::CustomClassHolder {
  public:
   static constexpr std::chrono::milliseconds kDefaultTimeout =
       std::chrono::seconds(300);
@@ -36,7 +36,7 @@ struct TCPStoreOptions {
   bool multiTenant = false;
 };

-class TCPStore : public Store {
+class TORCH_API TCPStore : public Store {
  public:
   explicit TCPStore(std::string host, const TCPStoreOptions& opts = {});

@@ -45,12 +45,12 @@ extern const char* kDistDebugDetailLogLevel;
 extern const char* kDistDebugInfoLogLevel;
 extern const char* kDistDebugOffLogLevel;

-std::string parse_env(const char* env_var_name);
+TORCH_API std::string parse_env(const char* env_var_name);

-DistributedDebugLevel parseDistDebugLevel();
+TORCH_API DistributedDebugLevel parseDistDebugLevel();

 // Retrieve tensor shapes from a given tensor.
-std::vector<at::Tensor> getTensorShapes(const std::vector<at::Tensor>& tensors);
+TORCH_API std::vector<at::Tensor> getTensorShapes(const std::vector<at::Tensor>& tensors);

 // Turns at::IntArrayRef into "(1, 2, 3, 4)".
 inline std::string toString(at::IntArrayRef l) {
@@ -5,7 +5,6 @@
 #include <ATen/core/functional.h>
 #include <c10/util/irange.h>
 #include <c10d/reducer.hpp>
-#include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/utils/tensor_flatten.h>

 namespace c10d {
@@ -7,14 +7,14 @@
 namespace c10d {

 // Broadcast many tensors to all processes in the process group.
-void broadcast_coalesced(
+TORCH_API void broadcast_coalesced(
     c10::intrusive_ptr<c10d::ProcessGroup> process_group,
     at::TensorList tensors,
     size_t buffer_size,
     int rank = 0);

 // This class passes bucket contents tensor to DDP communication hook.
-class GradBucket {
+class TORCH_API GradBucket {
  public:
   explicit GradBucket(
       size_t index,
@@ -1,3 +1,6 @@
 add_executable(allreduce allreduce.cpp)
 target_include_directories(allreduce PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..)
-target_link_libraries(allreduce pthread c10d)
+target_link_libraries(allreduce pthread torch_cpu)
+if(USE_CUDA)
+  target_link_libraries(allreduce torch_cuda)
+endif()
@@ -3,7 +3,7 @@

 namespace c10d {

-class Logger {
+class TORCH_API Logger {
  public:
   explicit Logger(std::shared_ptr<c10d::Reducer> reducer);
   // Set logging data that can be got during DistributedDataParallel
@@ -9,6 +9,7 @@

 #include <ATen/core/ivalue_inl.h>
 #include <ATen/ThreadLocalState.h>
+#include <c10/macros/Macros.h>
 #include <c10/util/intrusive_ptr.h>
 #include <c10d/ProcessGroup.hpp>
 #include <c10d/Utils.hpp>
@@ -30,7 +31,7 @@ constexpr int kDDPRuntimeLoggingSampleRate = 100;
 // Forward declaration
 class Logger;

-class Timer {
+class TORCH_API Timer {
  public:
   enum class Event {
     kForwardStart,
@@ -52,7 +53,7 @@ class Timer {

 C10_DECLARE_TYPED_REGISTRY(TimerRegistry, c10::DeviceType, Timer, std::unique_ptr, c10::Device);

-class Reducer {
+class TORCH_API Reducer {
  public:
   // The constructor takes a list of variables for every model replica.
   // The bucket assignment for this reducer is specified as a list of
@@ -492,7 +493,7 @@ class Reducer {
 // The index of tensors[i] assigned to bucket is tensor_indices[i],
 // when tensor_indices is empty, the index of tensors[i] assigned to
 // bucket is i.
-std::vector<std::vector<size_t>> compute_bucket_assignment_by_size(
+TORCH_API std::vector<std::vector<size_t>> compute_bucket_assignment_by_size(
     const std::vector<at::Tensor>& tensors,
     const std::vector<size_t>& bucket_size,
     const std::vector<bool>& expect_sparse_gradient = {},
@@ -500,7 +501,7 @@ std::vector<std::vector<size_t>> compute_bucket_assignment_by_size(

 // Verify models across all processes are the same as model on rank 0 with
 // respect to no. of params and matching dtype/size/layout.
-void verify_replica0_across_processes(
+TORCH_API void verify_replica0_across_processes(
     c10::intrusive_ptr<c10d::ProcessGroup> process_group,
     std::vector<std::vector<at::Tensor>> model_replicas);
 } // namespace c10d
@@ -1,7 +1,5 @@
 #include <c10d/reducer.hpp>

-#ifdef USE_CUDA
-
 #include <c10/core/DeviceGuard.h>
 #include <ATen/cuda/CUDAEvent.h>

@@ -85,5 +83,3 @@ C10_REGISTER_TYPED_CLASS(TimerRegistry, c10::kCUDA, CudaTimer);

 } // namespace
 } // namespace c10d
-
-#endif
@@ -1,6 +1,7 @@
 #pragma once

 #include <vector>
+#include <c10/macros/Macros.h>
 #include <c10/util/Optional.h>
 #include <c10/util/irange.h>

@@ -36,7 +37,7 @@ inline uint64_t fromVec(const std::vector<T>& values) {
   return num;
 }

-class SequenceNum {
+class TORCH_API SequenceNum {
  public:
   SequenceNum();
   explicit SequenceNum(const uint64_t num);
@@ -1,7 +1,7 @@
 if(USE_CUDA)
   cuda_add_library(c10d_cuda_test CUDATest.cu)
-  target_link_libraries(c10d_cuda_test c10d)
-  add_dependencies(c10d_cuda_test c10d)
+  target_link_libraries(c10d_cuda_test torch_cuda)
+  add_dependencies(c10d_cuda_test torch_cuda)
 endif()

 function(c10d_add_test test_src)
@@ -16,29 +16,40 @@ function(c10d_add_test test_src)
   add_test(NAME ${test_name} COMMAND $<TARGET_FILE:${test_name}>)
 endfunction()

-c10d_add_test(FileStoreTest.cpp c10d gtest_main)
-c10d_add_test(TCPStoreTest.cpp c10d gtest_main)
+c10d_add_test(FileStoreTest.cpp torch_cpu gtest_main)
+c10d_add_test(TCPStoreTest.cpp torch_cpu gtest_main)
 if(NOT WIN32)
-  c10d_add_test(HashStoreTest.cpp c10d gtest_main)
+  c10d_add_test(HashStoreTest.cpp torch_cpu gtest_main)
 endif()

 if(USE_CUDA)
-  if(USE_C10D_GLOO)
-    c10d_add_test(ProcessGroupGlooTest.cpp c10d c10d_cuda_test gtest_main)
-    c10d_add_test(ProcessGroupGlooAsyncTest.cpp c10d c10d_cuda_test gtest_main)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    c10d_add_test(ProcessGroupGlooTest.cpp torch_cpu c10d_cuda_test gtest_main)
+    c10d_add_test(ProcessGroupGlooAsyncTest.cpp torch_cpu c10d_cuda_test gtest_main)
   endif()
-  if(USE_C10D_NCCL)
-    c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test gtest_main)
-    c10d_add_test(ProcessGroupNCCLErrorsTest.cpp c10d c10d_cuda_test
-      gtest_main)
+  if(USE_NCCL AND USE_C10D_NCCL)
+    # NCCL is a private dependency of libtorch, but the tests include some
+    # private headers of libtorch, which in turn include NCCL. As a hacky
+    # alternative to making NCCL a public dependency of libtorch, we make it
+    # a private dependency of the tests as well.
+    c10d_add_test(
+      ProcessGroupNCCLTest.cpp
+      torch_cpu c10d_cuda_test gtest_main __caffe2_nccl)
+    c10d_add_test(
+      ProcessGroupNCCLErrorsTest.cpp
+      torch_cpu c10d_cuda_test gtest_main __caffe2_nccl)
   endif()
 else()
-  if(USE_C10D_GLOO)
-    c10d_add_test(ProcessGroupGlooTest.cpp c10d gtest_main)
+  if(USE_GLOO AND USE_C10D_GLOO)
+    c10d_add_test(ProcessGroupGlooTest.cpp torch_cpu gtest_main)
   endif()
 endif()

-if(USE_C10D_MPI)
+if(USE_MPI AND USE_C10D_MPI)
   add_definitions(-DMPIEXEC=${MPIEXEC})
-  c10d_add_test(ProcessGroupMPITest.cpp c10d)
+  # MPI is a private dependency of libtorch, but the tests include some
+  # private headers of libtorch, which in turn include MPI. As a hacky
+  # alternative to making MPI a public dependency of libtorch, we make it
+  # a private dependency of the tests as well.
+  c10d_add_test(ProcessGroupMPITest.cpp torch_cpu ${MPI_CXX_LIBRARIES})
 endif()