mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Fix
```
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/bits/unique_ptr.h:91:16: error: invalid application of 'sizeof' to an incomplete type 'torch::jit::AliasDb::WriteRegistry'
91 | static_assert(sizeof(_Tp)>0,
| ^~~~~~~~~~~
/usr/bin/../lib64/gcc/x86_64-pc-linux-gnu/14.2.1/../../../../include/c++/14.2.1/bits/unique_ptr.h:399:4: note: in instantiation of member function 'std::default_delete<torch::jit::AliasDb::WriteRegistry>::operator()' requested here
399 | get_deleter()(std::move(__ptr));
| ^
../torch/csrc/jit/ir/alias_analysis.cpp:200:10: note: in instantiation of member function 'std::unique_ptr<torch::jit::AliasDb::WriteRegistry>::~unique_ptr' requested here
200 | AliasDb::~AliasDb() = default;
| ^
../torch/csrc/jit/ir/alias_analysis.cpp:200:23: note: in defaulted destructor for 'torch::jit::AliasDb' first required here
200 | AliasDb::~AliasDb() = default;
| ^
../torch/csrc/jit/ir/alias_analysis.h:298:10: note: forward declaration of 'torch::jit::AliasDb::WriteRegistry'
298 | struct WriteRegistry;
| ^
1 error generated.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/148758
Approved by: https://github.com/Skylion007
87 lines
3.0 KiB
C++
87 lines
3.0 KiB
C++
#include <torch/csrc/distributed/c10d/reducer_timer.hpp>
|
|
|
|
#include <ATen/cuda/CUDAEvent.h>
|
|
#include <c10/core/DeviceGuard.h>
|
|
|
|
namespace c10d {
|
|
namespace {
|
|
|
|
// Number of nanoseconds in one millisecond; used to scale a millisecond
// reading into nanoseconds.
constexpr int kMilliSecondToNanosSecond = 1'000'000;
|
|
|
|
class CudaTimer : public Timer {
|
|
private:
|
|
c10::Device device;
|
|
|
|
at::cuda::CUDAEvent forward_start = at::cuda::CUDAEvent(cudaEventDefault);
|
|
at::cuda::CUDAEvent backward_compute_start =
|
|
at::cuda::CUDAEvent(cudaEventDefault);
|
|
at::cuda::CUDAEvent backward_compute_end =
|
|
at::cuda::CUDAEvent(cudaEventDefault);
|
|
at::cuda::CUDAEvent backward_comm_start =
|
|
at::cuda::CUDAEvent(cudaEventDefault);
|
|
at::cuda::CUDAEvent backward_comm_end = at::cuda::CUDAEvent(cudaEventDefault);
|
|
|
|
at::cuda::CUDAEvent& getEvent(Event event) {
|
|
switch (event) {
|
|
case Event::kForwardStart:
|
|
return forward_start;
|
|
case Event::kBackwardComputeStart:
|
|
return backward_compute_start;
|
|
case Event::kBackwardComputeEnd:
|
|
return backward_compute_end;
|
|
case Event::kBackwardCommStart:
|
|
return backward_comm_start;
|
|
case Event::kBackwardCommEnd:
|
|
return backward_comm_end;
|
|
default:
|
|
TORCH_INTERNAL_ASSERT(false);
|
|
}
|
|
}
|
|
|
|
public:
|
|
explicit CudaTimer(c10::Device dev) : device(dev) {}
|
|
|
|
void record(Event event) override {
|
|
// Parent class sets the host-side time
|
|
Timer::record(event);
|
|
c10::DeviceGuard g(device);
|
|
getEvent(event).record();
|
|
}
|
|
|
|
std::optional<int64_t> measureDifference(Event start, Event end) override {
|
|
c10::DeviceGuard g(device);
|
|
at::cuda::CUDAEvent& start_event = getEvent(start);
|
|
at::cuda::CUDAEvent& end_event = getEvent(end);
|
|
// It is possible users did not call backward or run codes in
|
|
// no-sync mode, in this case, some cudaEvents like "backward_compute_end"
|
|
// or "backward_comm_start" or "backward_comm_end" will not be recorded.
|
|
// cudaEvent is created when it is first time to be recorded.
|
|
// If it is never recorded/created, skip synchronize and calculation.
|
|
// Otherwise it will throw cuda errors.
|
|
if (!start_event.isCreated() || !end_event.isCreated()) {
|
|
return std::nullopt;
|
|
}
|
|
// set_runtime_stats_and_log is called at the beginning of forward call,
|
|
// when it is cheap to synchronize the cuda events of previous iteration,
|
|
// as mostly all cuda operations are finished in previous iteration.
|
|
start_event.synchronize();
|
|
end_event.synchronize();
|
|
float milliseconds = start_event.elapsed_time(end_event);
|
|
// If gpu_end is not recorded in this iteration,
|
|
// milliseconds will have invalid value.
|
|
// For some cases like DDP runs on non-sync mode,
|
|
// gpu_end can not be recorded in this iteration and thus can not
|
|
// calculate the valid avg_time.
|
|
// In this case, skip calculating the avg_time and return.
|
|
if (milliseconds < 0) {
|
|
return std::nullopt;
|
|
}
|
|
return int64_t(milliseconds * kMilliSecondToNanosSecond);
|
|
}
|
|
};
|
|
|
|
// Registers CudaTimer with TimerRegistry under the c10::kCUDA key.
// NOTE(review): presumably this is how a Timer for CUDA devices gets created
// through the registry -- confirm against TimerRegistry's users.
C10_REGISTER_TYPED_CLASS(TimerRegistry, c10::kCUDA, CudaTimer)
|
|
|
|
} // namespace
|
|
} // namespace c10d
|