From 08ce5eedf526181bccc64a3b9a3bb869bc06c38d Mon Sep 17 00:00:00 2001 From: Luca Wehrstedt Date: Fri, 18 Jun 2021 05:12:27 -0700 Subject: [PATCH] [reland] Move RPC agents to libtorch (#60170) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/60170 Reland of #59939. Test Plan: CI Reviewed By: mrshenli Differential Revision: D29193234 fbshipit-source-id: ee2a90d6be961c10f91361512bdd4cadca43dd60 --- BUILD.bazel | 2 +- caffe2/CMakeLists.txt | 57 +-------- cmake/Dependencies.cmake | 7 ++ test/cpp/rpc/CMakeLists.txt | 4 +- third_party/tensorpipe | 2 +- third_party/tensorpipe.BUILD | 112 ++++++++++-------- tools/build_variables.bzl | 12 +- torch/CMakeLists.txt | 6 - torch/csrc/distributed/rpc/macros.h | 5 - .../csrc/distributed/rpc/tensorpipe_agent.cpp | 1 - torch/csrc/distributed/rpc/tensorpipe_agent.h | 1 - .../csrc/distributed/rpc/tensorpipe_cuda.cpp | 3 +- .../csrc/distributed/rpc/tensorpipe_utils.cpp | 1 - torch/csrc/distributed/rpc/tensorpipe_utils.h | 1 - 14 files changed, 86 insertions(+), 128 deletions(-) delete mode 100644 torch/csrc/distributed/rpc/macros.h diff --git a/BUILD.bazel b/BUILD.bazel index b7e16ac1c91..45d71a8d462 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1728,7 +1728,7 @@ cc_library( ], [ ":aten", - "@tensorpipe", + "@tensorpipe//:tensorpipe_cpu", ], ), alwayslink = True, diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 174018456ef..88cffd1a75d 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -344,53 +344,6 @@ endif() if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) - if(USE_DISTRIBUTED) - - # Define this target even if we're building without TensorPipe, to make life - # easier to other targets that depend on this. However, in that case, by not - # setting the USE_TENSORPIPE compile definition, this target will just end - # up being empty. Downstream targets should also add a #ifdef guard. - if(NOT WIN32) - add_library(process_group_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/agent_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/agent_utils.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.h" - ) - target_link_libraries(process_group_agent PRIVATE torch fmt::fmt-header-only) - add_dependencies(process_group_agent torch) - - if(USE_TENSORPIPE) - add_library(tensorpipe_agent - "${TORCH_SRC_DIR}/csrc/distributed/rpc/agent_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/agent_utils.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/macros.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_agent.h" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_cuda.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.cpp" - "${TORCH_SRC_DIR}/csrc/distributed/rpc/tensorpipe_utils.h" - ) - target_link_libraries(tensorpipe_agent PRIVATE torch tensorpipe fmt::fmt-header-only) - add_dependencies(tensorpipe_agent torch) - if(USE_CUDA) - target_compile_definitions(tensorpipe_agent PUBLIC USE_CUDA) - endif() - - if(USE_ROCM) - target_compile_definitions(tensorpipe_agent PRIVATE - USE_ROCM - __HIP_PLATFORM_HCC__ - ) - endif() - - target_compile_definitions(tensorpipe_agent PUBLIC USE_TENSORPIPE) - target_link_libraries(tensorpipe_agent PRIVATE tensorpipe) - add_dependencies(tensorpipe_agent tensorpipe) - endif() - endif() - endif() - set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) # Generate files @@ -1236,7 +1189,7 @@ endif() if(USE_DISTRIBUTED) # Needed to support the inclusion of c10d/Foo.hpp headers. target_include_directories(torch_cpu PUBLIC ${TORCH_SRC_DIR}/lib) - target_compile_definitions(torch_cpu PRIVATE USE_DISTRIBUTED) + target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED) if(USE_GLOO AND USE_C10D_GLOO) target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO) endif() @@ -1263,16 +1216,12 @@ if(USE_DISTRIBUTED) # #if defined(USE_DISTRIBUTED) && !defined(_WIN32) # need to be removed when RPC is supported if(NOT WIN32) - target_compile_definitions(torch_cpu PRIVATE - USE_RPC - ) + target_compile_definitions(torch_cpu PUBLIC USE_RPC) endif() # Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp # can only be compiled with USE_TENSORPIPE is set. if(USE_TENSORPIPE) - target_compile_definitions(torch_cpu PRIVATE - USE_TENSORPIPE - ) + target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE) endif() endif() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index ab4cd32c40b..70b6d71face 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1377,6 +1377,13 @@ if(USE_DISTRIBUTED AND USE_TENSORPIPE) add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/tensorpipe) list(APPEND Caffe2_DEPENDENCY_LIBS tensorpipe) + if(USE_CUDA) + list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS tensorpipe_cuda) + elseif(USE_ROCM) + message(WARNING "TensorPipe doesn't yet support ROCm") + # Not yet... + # list(APPEND Caffe2_HIP_DEPENDENCY_LIBS tensorpipe_hip) + endif() endif() endif() diff --git a/test/cpp/rpc/CMakeLists.txt b/test/cpp/rpc/CMakeLists.txt index 0eff382d2b1..c9fb1b0e7f1 100644 --- a/test/cpp/rpc/CMakeLists.txt +++ b/test/cpp/rpc/CMakeLists.txt @@ -5,7 +5,7 @@ set(TORCH_RPC_TEST_SOURCES ${TORCH_RPC_TEST_DIR}/test_wire_serialization.cpp ) set(TORCH_RPC_TEST_DEPENDENCY_LIBS - torch gtest process_group_agent + torch gtest ) if(USE_GLOO) @@ -20,7 +20,7 @@ if(USE_TENSORPIPE) ${TORCH_RPC_TEST_DIR}/test_tensorpipe_serialization.cpp ) list(APPEND TORCH_RPC_TEST_DEPENDENCY_LIBS - tensorpipe_agent tensorpipe + tensorpipe ) endif() diff --git a/third_party/tensorpipe b/third_party/tensorpipe index 42a67277c18..c0e7623adb0 160000 --- a/third_party/tensorpipe +++ b/third_party/tensorpipe @@ -1 +1 @@ -Subproject commit 42a67277c1882c90cec0da6e57afb20247424994 +Subproject commit c0e7623adb05f36311c7cde6dac8fc4c290419d9 diff --git a/third_party/tensorpipe.BUILD b/third_party/tensorpipe.BUILD index d9e4bdb3957..ae210f47393 100644 --- a/third_party/tensorpipe.BUILD +++ b/third_party/tensorpipe.BUILD @@ -71,63 +71,82 @@ cc_library( ) header_template_rule( - name = "tensorpipe_config_header", + name = "tensorpipe_cpu_config_header", src = "tensorpipe/config.h.in", out = "tensorpipe/config.h", substitutions = { - "#cmakedefine01 TENSORPIPE_HAS_SHM_TRANSPORT": "", - "#cmakedefine01 TENSORPIPE_HAS_CMA_CHANNEL": "", - "#cmakedefine01 TENSORPIPE_HAS_CUDA_IPC_CHANNEL": "", - "#cmakedefine01 TENSORPIPE_HAS_CUDA_GDR_CHANNEL": "", - "#cmakedefine01 TENSORPIPE_HAS_IBV_TRANSPORT": "", - "#cmakedefine01 TENSORPIPE_SUPPORTS_CUDA": "", + "#cmakedefine01 TENSORPIPE_HAS_SHM_TRANSPORT": "#define TENSORPIPE_HAS_SHM_TRANSPORT 1", + "#cmakedefine01 TENSORPIPE_HAS_IBV_TRANSPORT": "#define TENSORPIPE_HAS_IBV_TRANSPORT 1", + "#cmakedefine01 TENSORPIPE_HAS_CMA_CHANNEL": "#define TENSORPIPE_HAS_CMA_CHANNEL 1", }, ) -TENSORPIPE_HEADERS = glob([ - "tensorpipe/*.h", - "tensorpipe/channel/*.h", - "tensorpipe/channel/*/*.h", - "tensorpipe/common/*.h", - "tensorpipe/core/*.h", - "tensorpipe/transport/*.h", - "tensorpipe/transport/*/*.h", - "tensorpipe/util/*/*.h", -]) +header_template_rule( + name = "tensorpipe_cuda_config_header", + src = "tensorpipe/config_cuda.h.in", + out = "tensorpipe/config_cuda.h", + substitutions = { + "#cmakedefine01 TENSORPIPE_HAS_CUDA_IPC_CHANNEL": "#define TENSORPIPE_HAS_CUDA_IPC_CHANNEL 1", + "#cmakedefine01 TENSORPIPE_HAS_CUDA_GDR_CHANNEL": "#define TENSORPIPE_HAS_CUDA_GDR_CHANNEL 1", + }, +) -TENSORPIPE_BASE_SRCS = glob([ - "tensorpipe/*.cc", - "tensorpipe/channel/*.cc", - "tensorpipe/common/address.cc", - "tensorpipe/common/epoll_loop.cc", - "tensorpipe/common/error.cc", - "tensorpipe/common/fd.cc", - "tensorpipe/common/ibv.cc", - "tensorpipe/common/socket.cc", - "tensorpipe/common/system.cc", - "tensorpipe/core/*.cc", - "tensorpipe/transport/*.cc", - "tensorpipe/util/*/*.cc", -]) +# We explicitly list the CUDA headers & sources, and we consider everything else +# as CPU (using a catch-all glob). This is both because there's fewer CUDA files +# (thus making it easier to list them exhaustively) and because it will make it +# more likely to catch a misclassified file: if we forget to mark a file as CUDA +# we'll try to build it on CPU and that's likely to fail. -TENSORPIPE_SRCS = TENSORPIPE_BASE_SRCS + glob([ - "tensorpipe/channel/basic/*.cc", - "tensorpipe/channel/mpt/*.cc", - "tensorpipe/channel/xth/*.cc", - "tensorpipe/transport/uv/*.cc", -]) +TENSORPIPE_CUDA_HEADERS = [ + "tensorpipe/tensorpipe_cuda.h", + "tensorpipe/channel/cuda_basic/*.h", + "tensorpipe/channel/cuda_gdr/*.h", + "tensorpipe/channel/cuda_ipc/*.h", + "tensorpipe/channel/cuda_xth/*.h", + "tensorpipe/common/cuda.h", + "tensorpipe/common/cuda_buffer.h", + "tensorpipe/common/cuda_lib.h", + "tensorpipe/common/cuda_loop.h", + "tensorpipe/common/nvml_lib.h", +] -TENSORPIPE_SRCS_CUDA = TENSORPIPE_SRCS + glob([ - "tensorpipe/common/cuda_loop.cc", +TENSORPIPE_CUDA_SOURCES = [ "tensorpipe/channel/cuda_basic/*.cc", + "tensorpipe/channel/cuda_gdr/*.cc", "tensorpipe/channel/cuda_ipc/*.cc", "tensorpipe/channel/cuda_xth/*.cc", -]) + "tensorpipe/common/cuda_buffer.cc", + "tensorpipe/common/cuda_loop.cc", +] + +TENSORPIPE_CPU_HEADERS = glob( + [ + "tensorpipe/*.h", + "tensorpipe/channel/*.h", + "tensorpipe/channel/*/*.h", + "tensorpipe/common/*.h", + "tensorpipe/core/*.h", + "tensorpipe/transport/*.h", + "tensorpipe/transport/*/*.h", + ], + exclude=TENSORPIPE_CUDA_HEADERS) + +TENSORPIPE_CPU_SOURCES = glob( + [ + "tensorpipe/*.cc", + "tensorpipe/channel/*.cc", + "tensorpipe/channel/*/*.cc", + "tensorpipe/common/*.cc", + "tensorpipe/core/*.cc", + "tensorpipe/transport/*.cc", + "tensorpipe/transport/*/*.cc", + ], + exclude=TENSORPIPE_CUDA_SOURCES) cc_library( - name = "tensorpipe", - srcs = TENSORPIPE_SRCS + [":tensorpipe_config_header"], - hdrs = TENSORPIPE_HEADERS, + name = "tensorpipe_cpu", + srcs = TENSORPIPE_CPU_SOURCES, + hdrs = TENSORPIPE_CPU_HEADERS + [":tensorpipe_cpu_config_header"], includes = [ ".", ], @@ -143,8 +162,8 @@ cc_library( cc_library( name = "tensorpipe_cuda", - srcs = TENSORPIPE_SRCS_CUDA + [":tensorpipe_config_header"], - hdrs = TENSORPIPE_HEADERS, + srcs = TENSORPIPE_CUDA_SOURCES, + hdrs = TENSORPIPE_CUDA_HEADERS + [":tensorpipe_cuda_config_header"], includes = [ ".", ], @@ -153,8 +172,7 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - ":libnop", - ":libuv", + ":tensorpipe_cpu", "@cuda", ], ) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index c0b206fa48e..06acafd645e 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -356,12 +356,14 @@ libtorch_distributed_extra_sources = [ "torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_resp.cpp", "torch/csrc/distributed/autograd/rpc_messages/rref_backward_req.cpp", "torch/csrc/distributed/autograd/rpc_messages/rref_backward_resp.cpp", + "torch/csrc/distributed/rpc/agent_utils.cpp", "torch/csrc/distributed/rpc/message.cpp", "torch/csrc/distributed/rpc/profiler/remote_profiler_manager.cpp", "torch/csrc/distributed/rpc/profiler/server_process_global_profiler.cpp", "torch/csrc/distributed/rpc/python_call.cpp", "torch/csrc/distributed/rpc/python_remote_call.cpp", "torch/csrc/distributed/rpc/python_resp.cpp", + "torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/request_callback.cpp", "torch/csrc/distributed/rpc/request_callback_no_python.cpp", "torch/csrc/distributed/rpc/rpc_agent.cpp", @@ -371,6 +373,9 @@ libtorch_distributed_extra_sources = [ "torch/csrc/distributed/rpc/script_call.cpp", "torch/csrc/distributed/rpc/script_remote_call.cpp", "torch/csrc/distributed/rpc/script_resp.cpp", + "torch/csrc/distributed/rpc/tensorpipe_agent.cpp", + "torch/csrc/distributed/rpc/tensorpipe_utils.cpp", + "torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp", "torch/csrc/distributed/rpc/torchscript_functions.cpp", "torch/csrc/distributed/rpc/types.cpp", "torch/csrc/distributed/rpc/utils.cpp", @@ -526,6 +531,7 @@ libtorch_cuda_distributed_base_sources = [ # These files are only supported on Linux (and others) but not on Windows. libtorch_cuda_distributed_extra_sources = [ + "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", "torch/lib/c10d/NCCLUtils.cpp", "torch/lib/c10d/ProcessGroupNCCL.cpp", ] @@ -714,17 +720,11 @@ libtorch_python_distributed_core_sources = [ libtorch_python_distributed_sources = libtorch_python_distributed_core_sources + [ "torch/csrc/distributed/autograd/init.cpp", - "torch/csrc/distributed/rpc/agent_utils.cpp", "torch/csrc/distributed/rpc/init.cpp", - "torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/py_rref.cpp", "torch/csrc/distributed/rpc/python_functions.cpp", "torch/csrc/distributed/rpc/python_rpc_handler.cpp", "torch/csrc/distributed/rpc/request_callback_impl.cpp", - "torch/csrc/distributed/rpc/tensorpipe_agent.cpp", - "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", - "torch/csrc/distributed/rpc/tensorpipe_utils.cpp", - "torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp", "torch/csrc/distributed/rpc/testing/init.cpp", "torch/csrc/distributed/rpc/unpickled_python_call.cpp", "torch/csrc/distributed/rpc/unpickled_python_remote_call.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 197926f3098..ce0f16bf5ab 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -261,11 +261,9 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") endif() if(USE_DISTRIBUTED) - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_DISTRIBUTED) if(WIN32) append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS) else() - list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_RPC) append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS) endif() # Disable certain warnings for GCC-9.X @@ -274,10 +272,6 @@ if(USE_DISTRIBUTED) set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type") endif() - if(USE_TENSORPIPE) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES tensorpipe) - list(APPEND TORCH_PYTHON_PUBLIC_COMPILE_DEFINITIONS USE_TENSORPIPE) - endif() # NCCL is a private dependency of libtorch, but libtorch_python includes # some private headers of libtorch, which in turn include NCCL. As a hacky # alternative to making NCCL a public dependency of libtorch, we make it diff --git a/torch/csrc/distributed/rpc/macros.h b/torch/csrc/distributed/rpc/macros.h deleted file mode 100644 index 2763dd0207b..00000000000 --- a/torch/csrc/distributed/rpc/macros.h +++ /dev/null @@ -1,5 +0,0 @@ -#pragma once - -#if defined(USE_CUDA) && !defined(__HIP_PLATFORM_HCC__) -#define USE_CUDA_NOT_ROCM -#endif diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 0f6645cdcd5..74c27942565 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -10,7 +10,6 @@ #include #include -#include #include #include diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h index 9462c396b0f..4450792a0f0 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.h +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h @@ -9,7 +9,6 @@ #include #include #include -#include #include // Forward-declare the TensorPipe classes we need, to avoid including its diff --git a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp index 9489fcd222b..03ec63d8ddc 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_cuda.cpp @@ -1,8 +1,7 @@ -#include #include #include -#if defined(USE_TENSORPIPE) && defined(USE_CUDA_NOT_ROCM) +#if defined(USE_TENSORPIPE) && !defined(__HIP_PLATFORM_HCC__) #include #include diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp index 55b8554f66d..32f3a132f8f 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.cpp @@ -1,4 +1,3 @@ -#include #include #ifdef USE_TENSORPIPE diff --git a/torch/csrc/distributed/rpc/tensorpipe_utils.h b/torch/csrc/distributed/rpc/tensorpipe_utils.h index ab328b9dca1..bf5d87cacc4 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_utils.h +++ b/torch/csrc/distributed/rpc/tensorpipe_utils.h @@ -2,7 +2,6 @@ #ifdef USE_TENSORPIPE -#include #include namespace tensorpipe {