pytorch/torch/csrc/profiler/stubs/cuda.cpp
cyy c3d02fa390 [Reland2] Update NVTX to NVTX3 (#109843)
Another attempt to update NVTX to NVTX3. This time we avoid changing the NVTX header inclusion of existing code. The advantage of NVTX3 over NVTX is that it is a header-only library, so linking against NVTX3 greatly simplifies our CMake and other build scripts that locate libraries in user environments. NVTX is in fact still shipped with the latest CUDA versions, but it is no longer a compiled library: it is now header-only, which is why there is no longer a .lib file.
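As a rough sketch of what header-only consumption looks like (annotate_step() is a hypothetical call site made up for illustration; nvtx3/nvtx3.hpp, nvtx3::scoped_range, and the legacy C entry points are real NVTX3/NVTX APIs), including the header is all that is required, with no NVTX library to find or link:

    // Illustrative sketch only: annotate_step() is not part of this change.
    #include <nvtx3/nvtx3.hpp>

    void annotate_step() {
      nvtx3::scoped_range range{"annotate_step"}; // NVTX3 C++ RAII range
      ::nvtxRangePushA("inner work");             // legacy C API still compiles
      ::nvtxRangePop();
    }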

Pull Request resolved: https://github.com/pytorch/pytorch/pull/109843
Approved by: https://github.com/peterbell10, https://github.com/eqy

Co-authored-by: Ivan Zaitsev <108101595+izaitsevfb@users.noreply.github.com>
2024-08-20 16:33:26 +00:00

#include <sstream>
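// TORCH_CUDA_USE_NVTX3 selects the header-only NVTX3 headers; otherwise the
// legacy nvToolsExt.h is used, so existing NVTX call sites build unchanged.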
#ifdef TORCH_CUDA_USE_NVTX3
#include <nvtx3/nvtx3.hpp>
#else
#include <nvToolsExt.h>
#endif
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/irange.h>
#include <torch/csrc/profiler/stubs/base.h>
#include <torch/csrc/profiler/util.h>
namespace torch {
namespace profiler {
namespace impl {
namespace {
static inline void cudaCheck(cudaError_t result, const char* file, int line) {
  if (result != cudaSuccess) {
    std::stringstream ss;
    ss << file << ":" << line << ": ";
    if (result == cudaErrorInitializationError) {
      // It is common for users to use DataLoader with multiple workers
      // and the autograd profiler. Throw a nice error message here.
      ss << "CUDA initialization error. "
         << "This can occur if one runs the profiler in CUDA mode on code "
         << "that creates a DataLoader with num_workers > 0. This operation "
         << "is currently unsupported; potential workarounds are: "
         << "(1) don't use the profiler in CUDA mode or (2) use num_workers=0 "
         << "in the DataLoader or (3) Don't profile the data loading portion "
         << "of your code. https://github.com/pytorch/pytorch/issues/6313 "
         << "tracks profiler support for multi-worker DataLoader.";
    } else {
      ss << cudaGetErrorString(result);
    }
    throw std::runtime_error(ss.str());
  }
}
#define TORCH_CUDA_CHECK(result) cudaCheck(result, __FILE__, __LINE__);
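// Example: TORCH_CUDA_CHECK(cudaEventCreate(&event)) throws with the file and
// line of the call site if the CUDA runtime call fails.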
struct CUDAMethods : public ProfilerStubs {
  void record(
      c10::DeviceIndex* device,
      ProfilerVoidEventStub* event,
      int64_t* cpu_ns) const override {
    if (device) {
      TORCH_CUDA_CHECK(c10::cuda::GetDevice(device));
    }
    CUevent_st* cuda_event_ptr{nullptr};
    TORCH_CUDA_CHECK(cudaEventCreate(&cuda_event_ptr));
    *event = std::shared_ptr<CUevent_st>(cuda_event_ptr, [](CUevent_st* ptr) {
      TORCH_CUDA_CHECK(cudaEventDestroy(ptr));
    });
    auto stream = at::cuda::getCurrentCUDAStream();
    if (cpu_ns) {
      *cpu_ns = c10::getTime();
    }
    TORCH_CUDA_CHECK(cudaEventRecord(cuda_event_ptr, stream));
  }

  float elapsed(
      const ProfilerVoidEventStub* event_,
      const ProfilerVoidEventStub* event2_) const override {
    auto event = (const ProfilerEventStub*)(event_);
    auto event2 = (const ProfilerEventStub*)(event2_);
    TORCH_CUDA_CHECK(cudaEventSynchronize(event->get()));
    TORCH_CUDA_CHECK(cudaEventSynchronize(event2->get()));
    float ms = 0;
    TORCH_CUDA_CHECK(cudaEventElapsedTime(&ms, event->get(), event2->get()));
    // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-narrowing-conversions)
    return ms * 1000.0;
  }

  void mark(const char* name) const override {
    ::nvtxMark(name);
  }

  void rangePush(const char* name) const override {
    ::nvtxRangePushA(name);
  }

  void rangePop() const override {
    ::nvtxRangePop();
  }

  void onEachDevice(std::function<void(int)> op) const override {
    at::cuda::OptionalCUDAGuard device_guard;
    for (const auto i : c10::irange(at::cuda::device_count())) {
      device_guard.set_index(i);
      op(i);
    }
  }

  void synchronize() const override {
    TORCH_CUDA_CHECK(cudaDeviceSynchronize());
  }

  bool enabled() const override {
    return true;
  }
};
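// A file-local static registers these stubs with the profiler during static
// initialization, so they are picked up whenever this translation unit is
// linked into the build.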
struct RegisterCUDAMethods {
  RegisterCUDAMethods() {
    static CUDAMethods methods;
    registerCUDAMethods(&methods);
  }
};
RegisterCUDAMethods reg;
} // namespace
} // namespace impl
} // namespace profiler
} // namespace torch