From ffc6cbfaf78ca219092ce64dcf113377ae698300 Mon Sep 17 00:00:00 2001 From: fduwjj Date: Tue, 10 Jun 2025 12:39:34 -0700 Subject: [PATCH] [symm_mem] Move all symm mem code into a dedicated folder (#155573) We arrive at a point when so many files are related to symmetric memory and files are scattered around in the cpp side. Let's first put all related code (symmetric memory related) into a separate folder. We can do further refactoring later if needed. Pull Request resolved: https://github.com/pytorch/pytorch/pull/155573 Approved by: https://github.com/fegin, https://github.com/d4l3k --- BUILD.bazel | 14 ++++---- build_variables.bzl | 36 +++++++++---------- caffe2/CMakeLists.txt | 16 ++++----- .../distributed/c10d/ProcessGroupNCCL.hpp | 2 +- torch/csrc/distributed/c10d/init.cpp | 6 ++-- .../{ => symm_mem}/CUDASymmetricMemory-inl.h | 0 .../{ => symm_mem}/CUDASymmetricMemory.cu | 6 ++-- .../{ => symm_mem}/CUDASymmetricMemory.hpp | 4 +-- .../{ => symm_mem}/CUDASymmetricMemoryOps.cu | 5 ++- .../CUDASymmetricMemoryTypes.hpp | 0 .../CUDASymmetricMemoryUtils.cpp | 2 +- .../CUDASymmetricMemoryUtils.hpp | 4 +-- .../{ => symm_mem}/CudaDMAConnectivity.cpp | 2 +- .../c10d/{ => symm_mem}/DMAConnectivity.cpp | 2 +- .../c10d/{ => symm_mem}/DMAConnectivity.hpp | 0 .../{ => symm_mem}/NVSHMEMSymmetricMemory.cu | 8 ++--- .../c10d/{ => symm_mem}/SymmetricMemory.cpp | 2 +- .../c10d/{ => symm_mem}/SymmetricMemory.hpp | 0 .../c10d/{ => symm_mem}/intra_node_comm.cpp | 5 ++- .../c10d/{ => symm_mem}/intra_node_comm.cu | 4 +-- .../c10d/{ => symm_mem}/intra_node_comm.hpp | 2 +- .../c10d/{ => symm_mem}/nvshmem_extension.cu | 9 +++-- .../c10d/{ => symm_mem}/nvshmem_extension.cuh | 0 23 files changed, 63 insertions(+), 66 deletions(-) rename torch/csrc/distributed/c10d/{ => symm_mem}/CUDASymmetricMemory-inl.h (100%) rename torch/csrc/distributed/c10d/{ => symm_mem}/CUDASymmetricMemory.cu (99%) rename torch/csrc/distributed/c10d/{ => symm_mem}/CUDASymmetricMemory.hpp (96%) rename torch/csrc/distributed/c10d/{ => symm_mem}/CUDASymmetricMemoryOps.cu (99%) rename torch/csrc/distributed/c10d/{ => symm_mem}/CUDASymmetricMemoryTypes.hpp (100%) rename torch/csrc/distributed/c10d/{ => symm_mem}/CUDASymmetricMemoryUtils.cpp (99%) rename torch/csrc/distributed/c10d/{ => symm_mem}/CUDASymmetricMemoryUtils.hpp (95%) rename torch/csrc/distributed/c10d/{ => symm_mem}/CudaDMAConnectivity.cpp (98%) rename torch/csrc/distributed/c10d/{ => symm_mem}/DMAConnectivity.cpp (97%) rename torch/csrc/distributed/c10d/{ => symm_mem}/DMAConnectivity.hpp (100%) rename torch/csrc/distributed/c10d/{ => symm_mem}/NVSHMEMSymmetricMemory.cu (97%) rename torch/csrc/distributed/c10d/{ => symm_mem}/SymmetricMemory.cpp (99%) rename torch/csrc/distributed/c10d/{ => symm_mem}/SymmetricMemory.hpp (100%) rename torch/csrc/distributed/c10d/{ => symm_mem}/intra_node_comm.cpp (97%) rename torch/csrc/distributed/c10d/{ => symm_mem}/intra_node_comm.cu (96%) rename torch/csrc/distributed/c10d/{ => symm_mem}/intra_node_comm.hpp (97%) rename torch/csrc/distributed/c10d/{ => symm_mem}/nvshmem_extension.cu (98%) rename torch/csrc/distributed/c10d/{ => symm_mem}/nvshmem_extension.cuh (100%) diff --git a/BUILD.bazel b/BUILD.bazel index 6de2683ecd6..70589e5af76 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -582,9 +582,9 @@ cc_library( cu_library( name = "torch_cuda", srcs = [ - "torch/csrc/distributed/c10d/intra_node_comm.cu", "torch/csrc/distributed/c10d/NanCheck.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", + "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", ], copts = torch_cuda_half_options, visibility = ["//visibility:public"], @@ -745,15 +745,15 @@ cc_library( srcs = if_cuda(glob( libtorch_cuda_sources, exclude = [ - "torch/csrc/cuda/python_nccl.cpp", "torch/csrc/cuda/nccl.cpp", - "torch/csrc/distributed/c10d/intra_node_comm.cu", - "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu", - "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu", - "torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp", - "torch/csrc/distributed/c10d/cuda/AsyncMM.cu", + "torch/csrc/cuda/python_nccl.cpp", "torch/csrc/distributed/c10d/NanCheck.cu", + "torch/csrc/distributed/c10d/cuda/AsyncMM.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", ], )) + torch_sources, copts = TORCH_COPTS, diff --git a/build_variables.bzl b/build_variables.bzl index a761e09112b..c557d032394 100644 --- a/build_variables.bzl +++ b/build_variables.bzl @@ -493,12 +493,10 @@ libtorch_core_sources = sorted( # These files are the only ones that are supported on Windows. libtorch_distributed_base_sources = [ - "torch/csrc/distributed/c10d/Backend.cpp", "torch/csrc/distributed/c10d/Backoff.cpp", - "torch/csrc/distributed/c10d/DMAConnectivity.cpp", - "torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp", - "torch/csrc/distributed/c10d/FlightRecorder.cpp", + "torch/csrc/distributed/c10d/Backend.cpp", "torch/csrc/distributed/c10d/FileStore.cpp", + "torch/csrc/distributed/c10d/FlightRecorder.cpp", "torch/csrc/distributed/c10d/Functional.cpp", "torch/csrc/distributed/c10d/GlooDeviceFactory.cpp", "torch/csrc/distributed/c10d/GroupRegistry.cpp", @@ -510,12 +508,15 @@ libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/ProcessGroupMPI.cpp", "torch/csrc/distributed/c10d/ProcessGroupWrapper.cpp", "torch/csrc/distributed/c10d/Store.cpp", - "torch/csrc/distributed/c10d/SymmetricMemory.cpp", "torch/csrc/distributed/c10d/TCPStore.cpp", "torch/csrc/distributed/c10d/TCPStoreBackend.cpp", "torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp", "torch/csrc/distributed/c10d/Utils.cpp", + "torch/csrc/distributed/c10d/Work.cpp", "torch/csrc/distributed/c10d/comm.cpp", + "torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp", + "torch/csrc/distributed/c10d/control_plane/Handlers.cpp", + "torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp", "torch/csrc/distributed/c10d/debug.cpp", "torch/csrc/distributed/c10d/default_comm_hooks.cpp", "torch/csrc/distributed/c10d/logger.cpp", @@ -524,9 +525,8 @@ libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/reducer.cpp", "torch/csrc/distributed/c10d/sequence_num.cpp", "torch/csrc/distributed/c10d/socket.cpp", - "torch/csrc/distributed/c10d/Work.cpp", - "torch/csrc/distributed/c10d/control_plane/Handlers.cpp", - "torch/csrc/distributed/c10d/control_plane/WorkerServer.cpp", + "torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp", + "torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp", ] # These files are only supported on Linux (and others) but not on Windows. @@ -699,24 +699,24 @@ libtorch_cuda_distributed_base_sources = [ # These files are only supported on Linux (and others) but not on Windows. libtorch_cuda_distributed_extra_sources = [ - "torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp", - "torch/csrc/distributed/c10d/NCCLUtils.cpp", "torch/csrc/distributed/c10d/FlightRecorderCuda.cpp", - "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp", + "torch/csrc/distributed/c10d/NCCLUtils.cpp", + "torch/csrc/distributed/c10d/NanCheck.cu", "torch/csrc/distributed/c10d/ProcessGroupGlooCuda.cpp", + "torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp", "torch/csrc/distributed/c10d/ProcessGroupUCC.cpp", "torch/csrc/distributed/c10d/UCCTracing.cpp", "torch/csrc/distributed/c10d/UCCUtils.cpp", - "torch/csrc/distributed/c10d/intra_node_comm.cpp", - "torch/csrc/distributed/c10d/intra_node_comm.cu", - "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu", - "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu", - "torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp", "torch/csrc/distributed/c10d/cuda/AsyncMM.cu", "torch/csrc/distributed/c10d/cuda/utils.cpp", - "torch/csrc/distributed/c10d/NanCheck.cu", - "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu", + "torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp", + "torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp", + "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp", + "torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu", + "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", ] libtorch_cuda_distributed_sources = libtorch_cuda_distributed_base_sources + libtorch_cuda_distributed_extra_sources diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 6e9127215b2..bf31b4c16bc 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -572,13 +572,13 @@ if(USE_CUDA) if(NOT WIN32) append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS) set_source_files_properties( - ${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp - ${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp ${TORCH_SRC_DIR}/csrc/distributed/c10d/CudaDMAConnectivity.cpp - ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemory.cu - ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu - ${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp ${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp + ${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp + ${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp + ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu + ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu + ${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1" ) endif() @@ -1004,10 +1004,10 @@ elseif(USE_CUDA) # which is not viable for libtorch_cuda. So we isolate the linking of # nvshmem in nvshmem_extension. add_library(nvshmem_extension SHARED - "${TORCH_SRC_DIR}/csrc/distributed/c10d/nvshmem_extension.cu" - "${TORCH_SRC_DIR}/csrc/distributed/c10d/NVSHMEMSymmetricMemory.cu" - "${TORCH_SRC_DIR}/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp" "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp" + "${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu" + "${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu" + "${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp" ) set_target_properties(nvshmem_extension PROPERTIES CUDA_SEPARABLE_COMPILATION ON) target_compile_options(nvshmem_extension PRIVATE $<$:-rdc=true>) diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 91ab0e1e17e..c44613065c3 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -23,8 +23,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index b016464dea1..f2226047397 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -32,7 +32,7 @@ #ifdef USE_C10D_NCCL #include #include -#include +#include #endif #ifdef USE_C10D_MPI @@ -45,9 +45,9 @@ #include #include -#include #include -#include +#include +#include #include #include diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h similarity index 100% rename from torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h rename to torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu similarity index 99% rename from torch/csrc/distributed/c10d/CUDASymmetricMemory.cu rename to torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu index 87a14a5f26d..4cc29b0c347 100644 --- a/torch/csrc/distributed/c10d/CUDASymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu @@ -1,6 +1,6 @@ -#include -#include -#include +#include +#include +#include #include #include diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp similarity index 96% rename from torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp rename to torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp index d1a85e3a236..c65c9677d8d 100644 --- a/torch/csrc/distributed/c10d/CUDASymmetricMemory.hpp +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.hpp @@ -1,9 +1,9 @@ #pragma once #include -#include #include -#include +#include +#include namespace c10d::symmetric_memory { diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu similarity index 99% rename from torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu rename to torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu index 744276772bf..e9944b679d7 100644 --- a/torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu @@ -15,10 +15,9 @@ #include #endif - -#include -#include #include +#include +#include #if defined(USE_ROCM) || (defined(CUDART_VERSION) && CUDART_VERSION >= 12030) diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp similarity index 100% rename from torch/csrc/distributed/c10d/CUDASymmetricMemoryTypes.hpp rename to torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryTypes.hpp diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp similarity index 99% rename from torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp rename to torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp index ef95e47a27c..dee189d58aa 100644 --- a/torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp @@ -12,9 +12,9 @@ #include #endif -#include #include #include +#include namespace c10d::symmetric_memory { diff --git a/torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp similarity index 95% rename from torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp rename to torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp index 9fac3c9f698..77dd36b778a 100644 --- a/torch/csrc/distributed/c10d/CUDASymmetricMemoryUtils.hpp +++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.hpp @@ -1,8 +1,8 @@ #pragma once -#include #include -#include +#include +#include namespace c10d { namespace symmetric_memory { diff --git a/torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp b/torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp similarity index 98% rename from torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp rename to torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp index 9cff2faeee3..b5efcfeb300 100644 --- a/torch/csrc/distributed/c10d/CudaDMAConnectivity.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp @@ -1,5 +1,5 @@ #if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED) -#include +#include #include #include diff --git a/torch/csrc/distributed/c10d/DMAConnectivity.cpp b/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp similarity index 97% rename from torch/csrc/distributed/c10d/DMAConnectivity.cpp rename to torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp index a2bab1247a5..0d54c389dde 100644 --- a/torch/csrc/distributed/c10d/DMAConnectivity.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace { diff --git a/torch/csrc/distributed/c10d/DMAConnectivity.hpp b/torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp similarity index 100% rename from torch/csrc/distributed/c10d/DMAConnectivity.hpp rename to torch/csrc/distributed/c10d/symm_mem/DMAConnectivity.hpp diff --git a/torch/csrc/distributed/c10d/NVSHMEMSymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu similarity index 97% rename from torch/csrc/distributed/c10d/NVSHMEMSymmetricMemory.cu rename to torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu index e49edb9a7bc..be12b4197e2 100644 --- a/torch/csrc/distributed/c10d/NVSHMEMSymmetricMemory.cu +++ b/torch/csrc/distributed/c10d/symm_mem/NVSHMEMSymmetricMemory.cu @@ -1,8 +1,8 @@ -#include -#include -#include #include -#include +#include +#include +#include +#include #include #include diff --git a/torch/csrc/distributed/c10d/SymmetricMemory.cpp b/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp similarity index 99% rename from torch/csrc/distributed/c10d/SymmetricMemory.cpp rename to torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp index 58943caefbd..67e74e296a4 100644 --- a/torch/csrc/distributed/c10d/SymmetricMemory.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.cpp @@ -1,4 +1,4 @@ -#include +#include namespace { diff --git a/torch/csrc/distributed/c10d/SymmetricMemory.hpp b/torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp similarity index 100% rename from torch/csrc/distributed/c10d/SymmetricMemory.hpp rename to torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp similarity index 97% rename from torch/csrc/distributed/c10d/intra_node_comm.cpp rename to torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp index 2694dabbac2..b11dfa07de3 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.cpp +++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp @@ -1,7 +1,6 @@ -#include - -#include #include +#include +#include #if defined(USE_ROCM) #include diff --git a/torch/csrc/distributed/c10d/intra_node_comm.cu b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu similarity index 96% rename from torch/csrc/distributed/c10d/intra_node_comm.cu rename to torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu index c490cba2021..6a6a6520e36 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.cu +++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cu @@ -1,6 +1,6 @@ -#include +#include -#include +#include namespace c10d { namespace intra_node_comm { diff --git a/torch/csrc/distributed/c10d/intra_node_comm.hpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp similarity index 97% rename from torch/csrc/distributed/c10d/intra_node_comm.hpp rename to torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp index d21ee398f1a..7b5e8ff999c 100644 --- a/torch/csrc/distributed/c10d/intra_node_comm.hpp +++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.hpp @@ -3,8 +3,8 @@ #include #include #include -#include #include +#include namespace c10d::intra_node_comm { diff --git a/torch/csrc/distributed/c10d/nvshmem_extension.cu b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu similarity index 98% rename from torch/csrc/distributed/c10d/nvshmem_extension.cu rename to torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu index ccd452a5256..62ea0e21ced 100644 --- a/torch/csrc/distributed/c10d/nvshmem_extension.cu +++ b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu @@ -1,10 +1,9 @@ -#include - #include -#include -#include -#include +#include +#include +#include +#include #include // Use torch's cub wrapper instead of CUDA's , see #55292 diff --git a/torch/csrc/distributed/c10d/nvshmem_extension.cuh b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh similarity index 100% rename from torch/csrc/distributed/c10d/nvshmem_extension.cuh rename to torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh