pytorch/torch/csrc/distributed/c10d/reducer_cuda.cpp
cyy f7c0c230b0 Fix compile errors (#148758)
Fix
```
  /usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/bits/unique_ptr.h:91:16: error: invalid application of 'sizeof' to an incomplete type 'torch::jit::AliasDb::WriteRegistry'
     91 |         static_assert(sizeof(_Tp)>0,
        |                       ^~~~~~~~~~~
  /usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/bits/unique_ptr.h:399:4: note: in instantiation of member function 'std::default_delete<torch::jit::AliasDb::WriteRegistry>::operator()' requested here
    399 |           get_deleter()(std::move(__ptr));
        |           ^
  ../torch/csrc/jit/ir/alias_analysis.cpp:200:10: note: in instantiation of member function 'std::unique_ptr<torch::jit::AliasDb::WriteRegistry>::~unique_ptr' requested here
    200 | AliasDb::~AliasDb() = default;
        |          ^
  ../torch/csrc/jit/ir/alias_analysis.cpp:200:23: note: in defaulted destructor for 'torch::jit::AliasDb' first required here
    200 | AliasDb::~AliasDb() = default;
        |                       ^
  ../torch/csrc/jit/ir/alias_analysis.h:298:10: note: forward declaration of 'torch::jit::AliasDb::WriteRegistry'
    298 |   struct WriteRegistry;
        |          ^
  1 error generated.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148758
Approved by: https://github.com/Skylion007
2025-03-08 04:56:42 +00:00

87 lines
3.0 KiB
C++

#include <torch/csrc/distributed/c10d/reducer_timer.hpp>
#include <ATen/cuda/CUDAEvent.h>
#include <c10/core/DeviceGuard.h>
namespace c10d {
namespace {
// Number of nanoseconds in one millisecond; used to scale elapsed times
// reported in milliseconds to nanoseconds.
constexpr int kMilliSecondToNanosSecond = 1'000'000;
// Timer that, on top of the bookkeeping done by the base Timer, records one
// CUDA event per timed phase and reports the elapsed time between two of
// those events in nanoseconds.
class CudaTimer : public Timer {
 private:
  // Device on which the events are recorded; record() and
  // measureDifference() switch to it through a DeviceGuard.
  c10::Device device;

  // One CUDA event per Event value handled by getEvent().
  at::cuda::CUDAEvent forward_start = at::cuda::CUDAEvent(cudaEventDefault);
  at::cuda::CUDAEvent backward_compute_start =
      at::cuda::CUDAEvent(cudaEventDefault);
  at::cuda::CUDAEvent backward_compute_end =
      at::cuda::CUDAEvent(cudaEventDefault);
  at::cuda::CUDAEvent backward_comm_start =
      at::cuda::CUDAEvent(cudaEventDefault);
  at::cuda::CUDAEvent backward_comm_end = at::cuda::CUDAEvent(cudaEventDefault);

  // Returns the CUDA event that backs the given Event identifier.
  // Any identifier without a dedicated event trips TORCH_INTERNAL_ASSERT.
  at::cuda::CUDAEvent& getEvent(Event event) {
    switch (event) {
      case Event::kForwardStart:
        return forward_start;
      case Event::kBackwardComputeStart:
        return backward_compute_start;
      case Event::kBackwardComputeEnd:
        return backward_compute_end;
      case Event::kBackwardCommStart:
        return backward_comm_start;
      case Event::kBackwardCommEnd:
        return backward_comm_end;
      default:
        TORCH_INTERNAL_ASSERT(false);
    }
  }

 public:
  // Creates a timer whose CUDA events are recorded on `dev`.
  explicit CudaTimer(c10::Device dev) : device(dev) {}

  // Records `event` both on the host (via the base class) and as a CUDA
  // event on this timer's device.
  void record(Event event) override {
    // Parent class sets the host-side time.
    Timer::record(event);
    c10::DeviceGuard g(device);
    getEvent(event).record();
  }

  // Returns the time between the CUDA events for `start` and `end` in
  // nanoseconds, or std::nullopt when either event has never been recorded
  // or the measured interval is negative.
  std::optional<int64_t> measureDifference(Event start, Event end) override {
    c10::DeviceGuard g(device);
    at::cuda::CUDAEvent& start_event = getEvent(start);
    at::cuda::CUDAEvent& end_event = getEvent(end);
    // Users may not have called backward, or may be running in no-sync mode.
    // In that case some events such as "backward_compute_end",
    // "backward_comm_start" or "backward_comm_end" are never recorded.
    // A cudaEvent is only created the first time it is recorded, so if it was
    // never recorded/created we skip synchronization and the calculation;
    // otherwise CUDA errors would be thrown.
    if (!start_event.isCreated() || !end_event.isCreated()) {
      return std::nullopt;
    }
    // set_runtime_stats_and_log is called at the beginning of the forward
    // call, when it is cheap to synchronize the CUDA events of the previous
    // iteration because nearly all of its CUDA operations have finished.
    start_event.synchronize();
    end_event.synchronize();
    const float milliseconds = start_event.elapsed_time(end_event);
    // If the end event was not recorded in this iteration (e.g. DDP running
    // in no-sync mode), the elapsed time is invalid and shows up as a
    // negative value. Skip the calculation and report "no measurement".
    if (milliseconds < 0) {
      return std::nullopt;
    }
    // Scale in double precision: a float product only keeps 24 significant
    // bits, which would round the nanosecond count (to ~64 ns steps at 1 s)
    // before it is converted to an integer.
    return static_cast<int64_t>(
        static_cast<double>(milliseconds) * kMilliSecondToNanosSecond);
  }
};
// Register CudaTimer with TimerRegistry under the c10::kCUDA key.
// NOTE(review): presumably this is how a timer is looked up for CUDA devices;
// confirm against the registry declaration in reducer_timer.hpp.
C10_REGISTER_TYPED_CLASS(TimerRegistry, c10::kCUDA, CudaTimer)
} // namespace
} // namespace c10d