[Profiler] Optimize reportMemoryUsage (#71538)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/71538

`reportMemoryUsage` is kind of awful. It does a bunch of string writes and the like that make it VERY expensive. Just moving that work off the hot path reduces the overhead for `profile_memory` from ~6.5 us to ~1.2 us (an 85% reduction in the Kineto contribution to profiling overhead).

Test Plan: Ran ubenchmark with `--op empty --stressTestKineto --kinetoProfileMemory`

Reviewed By: swolchok

Differential Revision: D32730167

fbshipit-source-id: fe18e8fa3881967cad8fa1c26c71c805e9b034e5
This commit is contained in:
parent a1d18e3e6a
commit 0d394cb252
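Before the diff, a minimal sketch of the idea described in the summary. The names AllocRecord, g_alloc_records, and reportAlloc are illustrative stand-ins rather than the PR's actual types: the allocator-side hot path only appends a small POD record, and every string write or Kineto call is deferred until the trace is materialized.

// Sketch only (hypothetical names; assumes taking a lock on this path is acceptable).
#include <cstdint>
#include <deque>
#include <mutex>

struct AllocRecord {                     // stand-in for MemoryEventData: scalars and raw pointers only
  int64_t start_time_us;
  void* ptr;
  int64_t alloc_size;
  int64_t total_allocated;
  int64_t total_reserved;
};

std::deque<AllocRecord> g_alloc_records; // stand-in for memory_events_
std::mutex g_records_mutex;

// Hot path: called for every allocation while profile_memory is enabled.
// No string formatting, no Kineto activity construction, just a cheap append.
void reportAlloc(void* ptr, int64_t alloc_size, int64_t total_allocated,
                 int64_t total_reserved, int64_t now_us) {
  std::lock_guard<std::mutex> guard(g_records_mutex);
  g_alloc_records.push_back(
      {now_us, ptr, alloc_size, total_allocated, total_reserved});
}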
@@ -155,6 +155,19 @@ struct OpEventData {
   torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr;
 };
 
+struct MemoryEventData {
+  int64_t start_time;
+  void* ptr;
+  int64_t alloc_size;
+  int64_t total_allocated;
+  int64_t total_reserved;
+  uint64_t threadID;
+  torch::profiler::impl::kineto::DeviceAndResource kineto_info;
+  c10::DeviceType device_type;
+  c10::DeviceIndex device_index;
+};
+static_assert(std::is_pod<MemoryEventData>::value, "Non-POD member of MemoryEventData.");
+
 // Assumption: Total threads number will not exceed 2^16-1, and total ops will
 // not exceed 2^48 -1.
 static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) {
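An aside on the static_assert above: keeping MemoryEventData a POD means recording an event is a handful of scalar copies with no hidden allocations, and the assert turns an accidental regression (say, later adding a std::string member) into a compile error. A small illustration with hypothetical types:

#include <cstdint>
#include <string>
#include <type_traits>

struct PodEvent {       // hypothetical: trivially copyable, allocation-free
  int64_t start_time;
  void* ptr;
  int64_t alloc_size;
};

struct NonPodEvent {    // hypothetical: a std::string member breaks POD-ness
  int64_t start_time;
  std::string name;
};

static_assert(std::is_pod<PodEvent>::value, "PodEvent must remain POD.");
static_assert(!std::is_pod<NonPodEvent>::value, "A std::string member disqualifies the type.");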
@@ -204,29 +217,16 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
       int64_t total_reserved,
       c10::Device device) override {
     if (config_.profile_memory && config_.state != ProfilerState::Disabled) {
-      std::lock_guard<std::mutex> guard(state_mutex_);
-      auto start_time = getTimeUs();
-      if (cpu_trace_) {
-        torch::profiler::impl::kineto::recordThreadInfo();
-        cpu_trace_.addMemoryUsageActivity(
-            kMemoryEventName,
-            torch::profiler::impl::kineto::kineto_ids(),
-            start_time,
-            device,
-            ptr,
-            alloc_size,
-            total_allocated,
-            total_reserved);
-      }
-
-      kineto_events_.emplace_back();
-      auto& evt = kineto_events_.back();
-      evt.name(kMemoryEventName)
-          .startUs(start_time)
-          .deviceIndex(device.index())
-          .deviceType(device.type())
-          .nBytes(alloc_size)
-          .startThreadId(at::RecordFunction::currentThreadId());
+      memory_events_.push_back(
+          {getTimeUs(),
+           ptr,
+           alloc_size,
+           total_allocated,
+           total_reserved,
+           at::RecordFunction::currentThreadId(),
+           torch::profiler::impl::kineto::kineto_ids(),
+           device.type(),
+           device.index()});
     }
   }
 
@@ -264,6 +264,28 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
 
   void materializeOpEvents() {
     std::lock_guard<std::mutex> guard(state_mutex_);
 
+    for (const auto& e : memory_events_) {
+      cpu_trace_.addMemoryUsageActivity(
+          kMemoryEventName,
+          e.kineto_info,
+          e.start_time,
+          c10::Device(e.device_type, e.device_index),
+          e.ptr,
+          e.alloc_size,
+          e.total_allocated,
+          e.total_reserved);
+
+      kineto_events_.emplace_back();
+      auto& evt = kineto_events_.back();
+      evt.name(kMemoryEventName)
+          .startUs(e.start_time)
+          .deviceIndex(e.device_index)
+          .deviceType(e.device_type)
+          .nBytes(e.alloc_size)
+          .startThreadId(e.threadID);
+    }
+
     for (const auto& e : op_events_) {
       if (e.end_us_ < e.start_us_) {
         // We initialize end_us_ to the smallest int64_t, so this means that
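For completeness, a sketch of the consumer side of this pattern (hypothetical names, same AllocRecord shape as the earlier sketch, not the PR's API): at finalization each recorded POD is expanded once into its expensive, string-bearing representation, so that cost is paid off the allocator hot path and under a single lock acquisition.

#include <cstdint>
#include <deque>
#include <sstream>
#include <string>
#include <vector>

struct AllocRecord {            // the cheap POD captured on the hot path
  int64_t start_time_us;
  void* ptr;
  int64_t alloc_size;
};

// Deferred, string-producing step: runs once per recorded event when the
// trace is materialized, not on every allocation.
std::vector<std::string> materializeAllocEvents(const std::deque<AllocRecord>& records) {
  std::vector<std::string> out;
  out.reserve(records.size());
  for (const auto& r : records) {
    std::ostringstream ss;
    ss << "[memory] t=" << r.start_time_us << "us ptr=" << r.ptr
       << " bytes=" << r.alloc_size;
    out.push_back(ss.str());
  }
  return out;
}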
@@ -585,6 +607,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase {
   uint64_t start_time_;
   std::set<torch::profiler::impl::ActivityType> activities_;
   std::deque<OpEventData> op_events_;
+  std::deque<MemoryEventData> memory_events_;
   torch::profiler::impl::kineto::TraceWrapper cpu_trace_;
   std::vector<KinetoEvent> kineto_events_;
   // Optional, if event post-processing is enabled.