Revert "[1/N] Move NaN check onto NCCL stream (#134300)"

This reverts commit 94caba4899.

Reverted https://github.com/pytorch/pytorch/pull/134300 on behalf of https://github.com/kwen2501 due to This is breaking builds of MTIA ([comment](https://github.com/pytorch/pytorch/pull/134300#issuecomment-2316559704))
PyTorch MergeBot 2024-08-29 01:50:22 +00:00
parent 33d0c11b26
commit cbf5ba1e97
6 changed files with 14 additions and 20 deletions
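For context, the substance of the change being reverted is small: a NaN-scan kernel over the input tensor, and the question of which CUDA stream that kernel is launched on. The sketch below is illustrative only and not the PyTorch sources; nanCheckKernel and nanCheckSketch are made-up names and the launch geometry is an assumption. It shows the stream-less launch this revert restores next to the explicit-stream launch that the reverted PR had introduced.

```cuda
// Illustrative sketch only (not the PyTorch sources); nanCheckKernel and
// nanCheckSketch are made-up names, and the block/thread sizes are assumptions.
#include <algorithm>
#include <cassert>

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>

template <typename T>
__global__ void nanCheckKernel(const T* data, size_t size) {
  // Grid-stride loop; a device-side assert trips if any element is NaN.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += blockDim.x * gridDim.x) {
    assert(!isnan(static_cast<float>(data[i])));
  }
}

// Stream-less form (the shape this revert restores): the kernel is enqueued
// on the current CUDA stream because no stream is named in the launch.
void nanCheckSketch(const at::Tensor& tensor) {
  if (!at::isFloatingType(tensor.scalar_type()) || tensor.numel() == 0) {
    return; // only floating-point tensors can hold NaN
  }
  c10::cuda::CUDAGuard guard(tensor.device());
  const int64_t threads = 512;
  const int64_t blocks =
      std::min<int64_t>((tensor.numel() + threads - 1) / threads, 65535);
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, tensor.scalar_type(),
      "nanCheckSketch", [&] {
        nanCheckKernel<scalar_t><<<blocks, threads>>>(
            tensor.data_ptr<scalar_t>(), tensor.numel());
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });
}

// Explicit-stream form (the shape the reverted PR used): identical kernel,
// but enqueued on a caller-provided stream such as the NCCL stream.
void nanCheckSketch(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
  if (!at::isFloatingType(tensor.scalar_type()) || tensor.numel() == 0) {
    return;
  }
  c10::cuda::CUDAGuard guard(tensor.device());
  const int64_t threads = 512;
  const int64_t blocks =
      std::min<int64_t>((tensor.numel() + threads - 1) / threads, 65535);
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, tensor.scalar_type(),
      "nanCheckSketch", [&] {
        nanCheckKernel<scalar_t><<<blocks, threads, 0, stream>>>(
            tensor.data_ptr<scalar_t>(), tensor.numel());
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });
}
```

Under that reading, the restored call sites in ProcessGroupNCCL::collective() and pointToPoint() pass only the tensor (checkForNan(input), checkForNan(tensor)), so the check runs on the current stream before the collective is set up, rather than on the ncclStream alongside it, as the diffs below show.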

BUILD.bazel (View File)

@@ -574,7 +574,7 @@ cu_library(
name = "torch_cuda",
srcs = [
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/NCCLUtils.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
copts = torch_cuda_half_options,
@@ -722,7 +722,7 @@ cc_library(
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/NCCLUtils.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
)) + torch_sources,

build_variables.bzl (View File)

@@ -691,7 +691,7 @@ libtorch_cuda_distributed_extra_sources = [
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/NCCLUtils.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
]

torch/csrc/distributed/c10d/NCCLUtils.hpp (View File)

@@ -10,7 +10,6 @@
#include <thread>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDAEvent.h>
#include <c10/util/Exception.h>
#include <nccl.h>
@@ -715,11 +714,6 @@ struct NCCLTraceBuffer {
bool includeStackTraces,
bool onlyActive);
};
-// Check for NaNs in a tensor on a given stream. If any are found, throw a
-// device-side error.
-void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream);
} // namespace c10d
#endif // USE_C10D_NCCL

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (View File)

@@ -2638,6 +2638,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
OpType opType,
const char* profilingTitle,
bool avoidRecordStreams) {
+if (enableNanCheck_) {
+  checkForNan(input);
+}
// Environment setting by the user may add onto collective call's option
avoidRecordStreams |= avoidRecordStreams_;
c10::cuda::CaptureStatus capture_status =
@@ -2693,10 +2696,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
at::cuda::OptionalCUDAGuard gpuGuard;
-if (enableNanCheck_) {
-  checkForNan(input, ncclStream);
-}
// Start event should only be recorded before the ncclGroupStart()
if (work->timingEnabled_) {
work->ncclStartEvent_->record(ncclStream);
@@ -2998,6 +2997,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
PreProcess pre,
PostProcess post,
const char* profilingTitle) {
+if (enableNanCheck_) {
+  checkForNan(tensor);
+}
// avoidRecordStreams_ note:
// send, recv, and irecv should be ok with avoidRecordStreams,
// However, for isend, I don't think the API requires the user
@@ -3126,10 +3128,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
// is gpuGuard needed for the if block below, or can i swap them
at::cuda::OptionalCUDAGuard gpuGuard;
-if (enableNanCheck_) {
-  checkForNan(tensor, ncclStream);
-}
if (!coalescing_state_) {
// Start event should only be recorded before the ncclGroupStart()
if (work->timingEnabled_) {

torch/csrc/distributed/c10d/Utils.cu (View File)

@@ -1,7 +1,7 @@
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
-#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
+#include <torch/csrc/distributed/c10d/Utils.hpp>
#include <torch/torch.h>
#include <algorithm>
@@ -20,7 +20,7 @@ __global__ void checkForNaN(T* data, size_t size) {
}
// CHECK if a Tensor contains NAN in any of its element
-void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
+void checkForNan(const at::Tensor& tensor) {
// skip check for non float types
if (!torch::is_floating_point(tensor)) {
return;
@@ -40,7 +40,7 @@ void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
tensor.scalar_type(),
"checkForNaN",
[&] {
-checkForNaN<scalar_t><<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+checkForNaN<scalar_t><<<numBlocks, numThreadsPerBlock>>>(
tensor.data_ptr<scalar_t>(), tensor.numel());
C10_CUDA_KERNEL_LAUNCH_CHECK();
});

torch/csrc/distributed/c10d/Utils.hpp (View File)

@@ -611,6 +611,8 @@ using SizeType = uint64_t;
// Since SOCKET_ERROR = -1 in MSVC, so also leverage SYSCHECK_ERR_RETURN_NEG1
#define SYSCHECK_ERR_RETURN_NEG1(expr) SYSCHECK(expr, __output != -1)
+void checkForNan(const at::Tensor& tensor);
namespace tcputil {
// Send and receive