Revert "[1/N] Move NaN check onto NCCL stream (#134300)"
This reverts commit 94caba4899.
Reverted https://github.com/pytorch/pytorch/pull/134300 on behalf of https://github.com/kwen2501 because it was breaking MTIA builds ([comment](https://github.com/pytorch/pytorch/pull/134300#issuecomment-2316559704))
Parent: 33d0c11b26
Commit: cbf5ba1e97
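In short, PR #134300 moved the NaN check from the current CUDA stream onto the dedicated NCCL stream inside collective() and pointToPoint(), and relocated the helper from Utils.{hpp,cu} to NCCLUtils.{hpp,cu}; this revert restores the original placement. A minimal sketch of the two helper shapes, using only signatures visible in the hunks below (illustrative, not the full implementation):

// Sketch: the two signatures this revert toggles between.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

namespace c10d {

// Post-revert: declared in Utils.hpp; the kernel is launched with the
// default config, i.e. on the current CUDA stream.
void checkForNan(const at::Tensor& tensor);

// Pre-revert (PR #134300): declared in NCCLUtils.hpp; the caller passes
// ncclStream so the check is enqueued on the communication stream.
void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream);

} // namespace c10d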
BUILD.bazel

@@ -574,7 +574,7 @@ cu_library(
     name = "torch_cuda",
     srcs = [
         "torch/csrc/distributed/c10d/intra_node_comm.cu",
-        "torch/csrc/distributed/c10d/NCCLUtils.cu",
+        "torch/csrc/distributed/c10d/Utils.cu",
         "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
     ],
     copts = torch_cuda_half_options,
@@ -722,7 +722,7 @@ cc_library(
         "torch/csrc/distributed/c10d/intra_node_comm.cu",
         "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
         "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
-        "torch/csrc/distributed/c10d/NCCLUtils.cu",
+        "torch/csrc/distributed/c10d/Utils.cu",
         "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
     ],
 )) + torch_sources,
build_variables.bzl

@@ -691,7 +691,7 @@ libtorch_cuda_distributed_extra_sources = [
     "torch/csrc/distributed/c10d/intra_node_comm.cu",
     "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
     "torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu",
-    "torch/csrc/distributed/c10d/NCCLUtils.cu",
+    "torch/csrc/distributed/c10d/Utils.cu",
     "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp",
     "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
 ]
torch/csrc/distributed/c10d/NCCLUtils.hpp

@@ -10,7 +10,6 @@
 #include <thread>
 
 #include <ATen/ATen.h>
-#include <ATen/cuda/CUDAContext.h>
 #include <ATen/cuda/CUDAEvent.h>
 #include <c10/util/Exception.h>
 #include <nccl.h>
@@ -715,11 +714,6 @@ struct NCCLTraceBuffer {
       bool includeStackTraces,
       bool onlyActive);
 };
 
-// Check for NaNs in a tensor on a given stream. If any are found, throw a
-// device-side error.
-void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream);
-
 } // namespace c10d
 
 #endif // USE_C10D_NCCL
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

@@ -2638,6 +2638,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
     OpType opType,
     const char* profilingTitle,
     bool avoidRecordStreams) {
+  if (enableNanCheck_) {
+    checkForNan(input);
+  }
   // Environment setting by the user may add onto collective call's option
   avoidRecordStreams |= avoidRecordStreams_;
   c10::cuda::CaptureStatus capture_status =
@@ -2693,10 +2696,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
 
   at::cuda::OptionalCUDAGuard gpuGuard;
 
-  if (enableNanCheck_) {
-    checkForNan(input, ncclStream);
-  }
-
   // Start event should only be recorded before the ncclGroupStart()
   if (work->timingEnabled_) {
     work->ncclStartEvent_->record(ncclStream);
@@ -2998,6 +2997,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
     PreProcess pre,
     PostProcess post,
     const char* profilingTitle) {
+  if (enableNanCheck_) {
+    checkForNan(tensor);
+  }
   // avoidRecordStreams_ note:
   // send, recv, and irecv should be ok with avoidRecordStreams,
   // However, for isend, I don't think the API requires the user
@@ -3126,10 +3128,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
   // is gpuGuard needed for the if block below, or can i swap them
   at::cuda::OptionalCUDAGuard gpuGuard;
 
-  if (enableNanCheck_) {
-    checkForNan(tensor, ncclStream);
-  }
-
   if (!coalescing_state_) {
     // Start event should only be recorded before the ncclGroupStart()
     if (work->timingEnabled_) {
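The placement difference above matters for stream ordering: post-revert, checkForNan(input) runs at the top of collective()/pointToPoint() on the current stream, so it is trivially ordered after whatever produced the tensor; pre-revert, the check was enqueued on ncclStream after the OptionalCUDAGuard, relying on the dependency these functions establish between the current stream and the NCCL stream. A minimal sketch of that dependency pattern in raw CUDA (illustrative names, not the PyTorch code):

#include <cuda_runtime.h>

// Make `side` (think: ncclStream) wait until all work already enqueued on
// `current` has finished, without blocking the host. Work enqueued on
// `side` afterwards (pre-revert: the NaN check, then the NCCL kernel)
// therefore sees a fully written input tensor.
void orderSideStreamAfterCurrent(cudaStream_t current, cudaStream_t side) {
  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  cudaEventRecord(ev, current);      // point after the producer kernels
  cudaStreamWaitEvent(side, ev, 0);  // side stream waits for that point
  cudaEventDestroy(ev);              // legal: destruction is deferred until done
}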
torch/csrc/distributed/c10d/NCCLUtils.cu → torch/csrc/distributed/c10d/Utils.cu (renamed back)

@@ -1,7 +1,7 @@
 #include <ATen/Dispatch.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
+#include <torch/csrc/distributed/c10d/Utils.hpp>
 #include <torch/torch.h>
 #include <algorithm>
 
@@ -20,7 +20,7 @@ __global__ void checkForNaN(T* data, size_t size) {
 }
 
 // CHECK if a Tensor contains NAN in any of its element
-void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
+void checkForNan(const at::Tensor& tensor) {
   // skip check for non float types
   if (!torch::is_floating_point(tensor)) {
     return;
@@ -40,7 +40,7 @@ void checkForNan(const at::Tensor& tensor, at::cuda::CUDAStream& stream) {
       tensor.scalar_type(),
       "checkForNaN",
       [&] {
-        checkForNaN<scalar_t><<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+        checkForNaN<scalar_t><<<numBlocks, numThreadsPerBlock>>>(
             tensor.data_ptr<scalar_t>(), tensor.numel());
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       });
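The only change in this hunk is the fourth launch-config argument: <<<grid, block, sharedMem, stream>>> selects the stream the kernel is enqueued on, and omitting it targets the default stream. A standalone sketch of that mechanism (simplified kernel, not the PyTorch implementation; a real NaN would trip the device-side assert):

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

// Grid-stride NaN check; assert() traps on the device if a NaN is found.
__global__ void checkForNaN(const float* data, size_t size) {
  size_t idx = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  size_t stride = (size_t)blockDim.x * gridDim.x;
  for (size_t i = idx; i < size; i += stride) {
    assert(!isnan(data[i]));
  }
}

int main() {
  const size_t n = 1 << 20;
  float* d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  cudaMemset(d, 0, n * sizeof(float));  // all zeros: no NaNs

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  const int threads = 256;
  const int blocks = static_cast<int>((n + threads - 1) / threads);

  checkForNaN<<<blocks, threads>>>(d, n);             // default stream
  checkForNaN<<<blocks, threads, 0, stream>>>(d, n);  // explicit stream

  cudaStreamSynchronize(stream);
  cudaDeviceSynchronize();
  printf("NaN check passed: %s\n",
         cudaGetLastError() == cudaSuccess ? "yes" : "no");

  cudaStreamDestroy(stream);
  cudaFree(d);
  return 0;
}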
torch/csrc/distributed/c10d/Utils.hpp

@@ -611,6 +611,8 @@ using SizeType = uint64_t;
 // Since SOCKET_ERROR = -1 in MSVC, so also leverage SYSCHECK_ERR_RETURN_NEG1
 #define SYSCHECK_ERR_RETURN_NEG1(expr) SYSCHECK(expr, __output != -1)
 
+void checkForNan(const at::Tensor& tensor);
+
 namespace tcputil {
 
 // Send and receive
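With the declaration restored to Utils.hpp, callers once again invoke the helper without a stream argument. A hypothetical call-site sketch (the wrapper function is invented for illustration; the include path and c10d namespace come from the hunks above):

#include <torch/csrc/distributed/c10d/Utils.hpp>
#include <torch/torch.h>

// Hypothetical wrapper, not part of PyTorch.
void validateBeforeCollective(const at::Tensor& t) {
  c10d::checkForNan(t);  // post-revert: launches on the current stream
}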