Revert "[1/N] Move NaN check onto NCCL stream (#134300)"

This reverts commit 94caba4899.

Reverted https://github.com/pytorch/pytorch/pull/134300 on behalf of https://github.com/kwen2501 due to This is breaking builds of MTIA ([comment](https://github.com/pytorch/pytorch/pull/134300#issuecomment-2316559704))
PyTorch MergeBot 2024-08-29 01:50:22 +00:00
parent 33d0c11b26
commit cbf5ba1e97
6 changed files with 14 additions and 20 deletions
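For context, the substance of the change being reverted is small: a NaN-scan kernel over the input tensor, and the question of which CUDA stream that kernel is launched on. The sketch below is illustrative only and not the PyTorch sources; nanCheckKernel and nanCheckSketch are made-up names and the launch geometry is an assumption. It shows the stream-less launch this revert restores next to the explicit-stream launch that the reverted PR had introduced.

```cuda
// Illustrative sketch only (not the PyTorch sources); nanCheckKernel and
// nanCheckSketch are made-up names, and the block/thread sizes are assumptions.
#include <algorithm>
#include <cassert>

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAGuard.h>

template <typename T>
__global__ void nanCheckKernel(const T* data, size_t size) {
  // Grid-stride loop; a device-side assert trips if any element is NaN.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += blockDim.x * gridDim.x) {
    assert(!isnan(static_cast<float>(data[i])));
  }
}

// Stream-less form (the shape this revert restores): the kernel is enqueued
// on the current CUDA stream because no stream is named in the launch.
void nanCheckSketch(const at::Tensor& tensor) {
  if (!at::isFloatingType(tensor.scalar_type()) || tensor.numel() == 0) {
    return; // only floating-point tensors can hold NaN
  }
  c10::cuda::CUDAGuard guard(tensor.device());
  const int64_t threads = 512;
  const int64_t blocks =
      std::min<int64_t>((tensor.numel() + threads - 1) / threads, 65535);
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, tensor.scalar_type(),
      "nanCheckSketch", [&] {
        nanCheckKernel<scalar_t><<<blocks, threads>>>(
            tensor.data_ptr<scalar_t>(), tensor.numel());
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });
}

// Explicit-stream form (the shape the reverted PR used): identical kernel,
// but enqueued on a caller-provided stream such as the NCCL stream.
void nanCheckSketch(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
  if (!at::isFloatingType(tensor.scalar_type()) || tensor.numel() == 0) {
    return;
  }
  c10::cuda::CUDAGuard guard(tensor.device());
  const int64_t threads = 512;
  const int64_t blocks =
      std::min<int64_t>((tensor.numel() + threads - 1) / threads, 65535);
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::Half, at::ScalarType::BFloat16, tensor.scalar_type(),
      "nanCheckSketch", [&] {
        nanCheckKernel<scalar_t><<<blocks, threads, 0, stream>>>(
            tensor.data_ptr<scalar_t>(), tensor.numel());
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });
}
```

Under that reading, the restored call sites in ProcessGroupNCCL::collective() and pointToPoint() pass only the tensor (checkForNan(input), checkForNan(tensor)), so the check runs on the current stream before the collective is set up, rather than on the ncclStream alongside it, as the diffs below show.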

BUILD.bazel (View File)

@@ -574,7 +574,7 @@ cu_library(
name = "torch_cuda",
srcs = [
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/NCCLUtils.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
copts = torch_cuda_half_options,
@@ -722,7 +722,7 @@ cc_library(
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/NCCLUtils.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
)) + torch_sources,

build_variables.bzl (View File)

@@ -691,7 +691,7 @@ libtorch_cuda_distributed_extra_sources = [
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
"torch/csrc/distributed/c10d/NCCLUtils.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
]

torch/csrc/distributed/c10d/NCCLUtils.hpp (View File)

@@ -10,7 +10,6 @@
#include <thread>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDAEvent.h>
#include <c10/util/Exception.h>
#include <nccl.h>
@@ -715,11 +714,6 @@ struct NCCLTraceBuffer {
bool includeStackTraces,
bool onlyActive);
};
-// Check for NaNs in a tensor on a given stream. If any are found, throw a
-// device-side error.
-void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream);
} // namespace c10d
#endif // USE_C10D_NCCL

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (View File)

@@ -2638,6 +2638,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
OpType opType,
const char* profilingTitle,
bool avoidRecordStreams) {
+if (enableNanCheck_) {
+  checkForNan(input);
+}
// Environment setting by the user may add onto collective call's option
avoidRecordStreams |= avoidRecordStreams_;
c10::cuda::CaptureStatus capture_status =
@@ -2693,10 +2696,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
at::cuda::OptionalCUDAGuard gpuGuard;
-if (enableNanCheck_) {
-  checkForNan(input, ncclStream);
-}
// Start event should only be recorded before the ncclGroupStart()
if (work->timingEnabled_) {
work->ncclStartEvent_->record(ncclStream);
@@ -2998,6 +2997,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
PreProcess pre,
PostProcess post,
const char* profilingTitle) {
+if (enableNanCheck_) {
+  checkForNan(tensor);
+}
// avoidRecordStreams_ note:
// send, recv, and irecv should be ok with avoidRecordStreams,
// However, for isend, I don't think the API requires the user
@@ -3126,10 +3128,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
// is gpuGuard needed for the if block below, or can i swap them
at::cuda::OptionalCUDAGuard gpuGuard;
-if (enableNanCheck_) {
-  checkForNan(tensor, ncclStream);
-}
if (!coalescing_state_) {
// Start event should only be recorded before the ncclGroupStart()
if (work->timingEnabled_) {

torch/csrc/distributed/c10d/Utils.cu (View File)

@@ -1,7 +1,7 @@
#include <ATen/Dispatch.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
-#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
+#include <torch/csrc/distributed/c10d/Utils.hpp>
#include <torch/torch.h>
#include <algorithm>
@@ -20,7 +20,7 @@ __global__ void checkForNaN(T* data, size_t size) {
}
// CHECK if a Tensor contains NAN in any of its element
-void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
+void checkForNan(const at::Tensor& tensor) {
// skip check for non float types
if (!torch::is_floating_point(tensor)) {
return;
@@ -40,7 +40,7 @@ void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
tensor.scalar_type(),
"checkForNaN",
[&] {
-checkForNaN<scalar_t><<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+checkForNaN<scalar_t><<<numBlocks, numThreadsPerBlock>>>(
tensor.data_ptr<scalar_t>(), tensor.numel());
C10_CUDA_KERNEL_LAUNCH_CHECK();
});

torch/csrc/distributed/c10d/Utils.hpp (View File)

@@ -611,6 +611,8 @@ using SizeType = uint64_t;
// Since SOCKET_ERROR = -1 in MSVC, so also leverage SYSCHECK_ERR_RETURN_NEG1
#define SYSCHECK_ERR_RETURN_NEG1(expr) SYSCHECK(expr, __output != -1)
+void checkForNan(const at::Tensor& tensor);
namespace tcputil {
// Send and receive