Revert "[1/N] Move NaN check onto NCCL stream (#134300)"
This reverts commit 94caba4899.
Reverted https://github.com/pytorch/pytorch/pull/134300 on behalf of https://github.com/kwen2501 due to This is breaking builds of MTIA ([comment](https://github.com/pytorch/pytorch/pull/134300#issuecomment-2316559704))
parent 33d0c11b26
commit cbf5ba1e97

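For context, the change being reverted had threaded an explicit `at::cuda::CUDAStream` through `checkForNan` so that the NaN-check kernel launched on the NCCL communication stream rather than on the default/current stream. The sketch below only illustrates that launch-site difference in plain CUDA; the kernel body and every name in it are illustrative assumptions, not code from this commit.

```cuda
// Illustrative CUDA sketch (not from this commit): the same grid-stride
// NaN-check kernel launched on the default stream vs. an explicit stream.
#include <cuda_runtime.h>
#include <cassert>
#include <cmath>
#include <cstdio>

__global__ void check_for_nan(const float* data, size_t size) {
  size_t idx = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t i = idx; i < size; i += stride) {
    assert(!isnan(data[i]));  // device-side assert traps the kernel on NaN
  }
}

int main() {
  const size_t n = 1 << 20;
  float* d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  cudaMemset(d, 0, n * sizeof(float));  // all zeros, so no NaNs

  const int threads = 256;
  const int blocks = static_cast<int>((n + threads - 1) / threads);

  // Default-stream launch: the form the code goes back to after this revert.
  check_for_nan<<<blocks, threads>>>(d, n);

  // Explicit-stream launch: the form the reverted change used, with the NCCL
  // stream taking the place of `stream` inside ProcessGroupNCCL.
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  check_for_nan<<<blocks, threads, /*sharedMemBytes=*/0, stream>>>(d, n);

  cudaStreamSynchronize(stream);
  cudaDeviceSynchronize();
  printf("last CUDA error: %s\n", cudaGetErrorString(cudaGetLastError()));

  cudaStreamDestroy(stream);
  cudaFree(d);
  return 0;
}
```
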
BUILD.bazel
@@ -574,7 +574,7 @@ cu_library(
     name = "torch_cuda",
     srcs = [
         "torch/csrc/distributed/c10d/intra_node_comm.cu",
-        "torch/csrc/distributed/c10d/NCCLUtils.cu",
+        "torch/csrc/distributed/c10d/Utils.cu",
         "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
     ],
     copts = torch_cuda_half_options,

@@ -722,7 +722,7 @@ cc_library(
         "torch/csrc/distributed/c10d/intra_node_comm.cu",
         "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
         "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
-        "torch/csrc/distributed/c10d/NCCLUtils.cu",
+        "torch/csrc/distributed/c10d/Utils.cu",
         "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
     ],
 )) + torch_sources,

build_variables.bzl
@@ -691,7 +691,7 @@ libtorch_cuda_distributed_extra_sources = [
     "torch/csrc/distributed/c10d/intra_node_comm.cu",
     "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
     "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
-    "torch/csrc/distributed/c10d/NCCLUtils.cu",
+    "torch/csrc/distributed/c10d/Utils.cu",
     "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
     "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
 ]

torch/csrc/distributed/c10d/NCCLUtils.hpp
@@ -10,7 +10,6 @@
 #include <thread>
 
 #include <ATen/ATen.h>
-#include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAEvent.h>
 #include <c10/util/Exception.h>
 #include <nccl.h>

@@ -715,11 +714,6 @@ struct NCCLTraceBuffer {
       bool includeStackTraces,
       bool onlyActive);
 };
-
-// Check for NaNs in a tensor on a given stream. If any are found, throw a
-// device-side error.
-void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream);
-
 } // namespace c10d
 
 #endif // USE_C10D_NCCL

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -2638,6 +2638,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
     OpType opType,
     const char* profilingTitle,
     bool avoidRecordStreams) {
+  if (enableNanCheck_) {
+    checkForNan(input);
+  }
   // Environment setting by the user may add onto collective call's option
   avoidRecordStreams |= avoidRecordStreams_;
   c10::cuda::CaptureStatus capture_status =

@@ -2693,10 +2696,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
 
   at::cuda::OptionalCUDAGuard gpuGuard;
 
-  if (enableNanCheck_) {
-    checkForNan(input, ncclStream);
-  }
-
   // Start event should only be recorded before the ncclGroupStart()
   if (work->timingEnabled_) {
     work->ncclStartEvent_->record(ncclStream);

@@ -2998,6 +2997,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
     PreProcess pre,
     PostProcess post,
     const char* profilingTitle) {
+  if (enableNanCheck_) {
+    checkForNan(tensor);
+  }
   // avoidRecordStreams_ note:
   // send, recv, and irecv should be ok with avoidRecordStreams,
   // However, for isend, I don't think the API requires the user

@@ -3126,10 +3128,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
   // is gpuGuard needed for the if block below, or can i swap them
   at::cuda::OptionalCUDAGuard gpuGuard;
 
-  if (enableNanCheck_) {
-    checkForNan(tensor, ncclStream);
-  }
-
   if (!coalescing_state_) {
     // Start event should only be recorded before the ncclGroupStart()
     if (work->timingEnabled_) {

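The two hunks above are the heart of the revert: the NaN check moves back to the top of `collective()` / `pointToPoint()` (before the NCCL stream is selected) instead of running on `ncclStream` under the `gpuGuard`. As a general illustration of the ATen/c10 stream plumbing involved, not code from this commit and with generic example names, launching work on a non-default stream and ordering it after the caller's current stream looks roughly like this:

```cpp
// Generic illustration of ATen/c10 CUDA stream handling (not from this commit).
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAEvent.h>
#include <c10/cuda/CUDAGuard.h>

void run_on_side_stream_example() {
  // The stream the caller's kernels are currently enqueued on.
  at::cuda::CUDAStream current = at::cuda::getCurrentCUDAStream();

  // A separate stream, analogous to the per-device ncclStream that
  // ProcessGroupNCCL manages for communication kernels.
  at::cuda::CUDAStream side = at::cuda::getStreamFromPool();

  // Make `side` wait for everything already enqueued on `current` --
  // the same event record/block pattern used to synchronize streams.
  at::cuda::CUDAEvent ready;
  ready.record(current);
  ready.block(side);

  // While the guard is alive, ATen kernels (and any launch that queries
  // getCurrentCUDAStream()) target `side` instead of `current`.
  c10::cuda::CUDAStreamGuard guard(side);
  // ... enqueue work here; it runs after the captured `current` work ...
}
```
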
torch/csrc/distributed/c10d/NCCLUtils.cu → torch/csrc/distributed/c10d/Utils.cu
@@ -1,7 +1,7 @@
 #include <ATen/Dispatch.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
+#include <torch/csrc/distributed/c10d/Utils.hpp>
 #include <torch/torch.h>
 #include <algorithm>
 

@@ -20,7 +20,7 @@ __global__ void checkForNaN(T* data, size_t size) {
 }
 
 // CHECK if a Tensor contains NAN in any of its element
-void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
+void checkForNan(const at::Tensor& tensor) {
   // skip check for non float types
   if (!torch::is_floating_point(tensor)) {
     return;

@@ -40,7 +40,7 @@ void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
       tensor.scalar_type(),
       "checkForNaN",
       [&] {
-        checkForNaN<scalar_t><<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+        checkForNaN<scalar_t><<<numBlocks, numThreadsPerBlock>>>(
             tensor.data_ptr<scalar_t>(), tensor.numel());
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       });

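Pieced together from the fragments in the two hunks above, the stream-less `checkForNan` that this revert restores plausibly looks like the sketch below. Treat it as a reconstruction under assumptions: the dispatch macro, grid sizing, and the `isnan` cast are guesses based on the visible lines, not the verbatim contents of the file.

```cpp
// Reconstruction sketch (assumptions noted above), in the style of
// torch/csrc/distributed/c10d/Utils.cu after this revert.
#include <ATen/Dispatch.h>
#include <c10/cuda/CUDAException.h>
#include <c10/macros/Macros.h>
#include <torch/torch.h>

#include <algorithm>

namespace c10d {

template <typename T>
__global__ void checkForNaN(T* data, size_t size) {
  size_t idx = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t i = idx; i < size; i += stride) {
    // Cast to float so the same check covers half-precision types;
    // NaN-ness survives the conversion.
    CUDA_KERNEL_ASSERT(!isnan(static_cast<float>(data[i])));
  }
}

// CHECK if a Tensor contains NAN in any of its elements.
void checkForNan(const at::Tensor& tensor) {
  // Skip the check for non-floating-point types and empty tensors.
  if (!torch::is_floating_point(tensor) || tensor.numel() == 0) {
    return;
  }

  const int64_t maxNumThreadsPerBlock = 256;  // assumed sizing, see lead-in
  const int64_t maxNumBlocks = 24;
  const int64_t numThreadsPerBlock =
      std::min(maxNumThreadsPerBlock, tensor.numel());
  const int64_t numBlocks = std::min(
      maxNumBlocks,
      (tensor.numel() + numThreadsPerBlock - 1) / numThreadsPerBlock);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      tensor.scalar_type(),
      "checkForNaN",
      [&] {
        // Launch on the current stream; the explicit stream argument is
        // exactly what this revert removes.
        checkForNaN<scalar_t><<<numBlocks, numThreadsPerBlock>>>(
            tensor.data_ptr<scalar_t>(), tensor.numel());
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      });
}

} // namespace c10d
```
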
torch/csrc/distributed/c10d/Utils.hpp
@@ -611,6 +611,8 @@ using SizeType = uint64_t;
 // Since SOCKET_ERROR = -1 in MSVC, so also leverage SYSCHECK_ERR_RETURN_NEG1
 #define SYSCHECK_ERR_RETURN_NEG1(expr) SYSCHECK(expr, __output != -1)
 
+void checkForNan(const at::Tensor& tensor);
+
 namespace tcputil {
 
 // Send and receive