pytorch/c10/core/CachingDeviceAllocator.h
Yu, Guangye 6c1da66407 [Reland] Refactor caching device allocator utils (#130923)
# Motivation
Following [[RFC] Intel GPU Runtime Upstreaming for Allocator](https://github.com/pytorch/pytorch/issues/116322), this PR refactors the caching device allocator utilities to improve code reuse.
This is the first PR in the series; follow-up PRs can continue refactoring the device caching allocator.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130923
Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/albanD, https://github.com/eqy
2024-09-07 11:14:17 +00:00

132 lines
3.6 KiB
C++

#pragma once
#include <c10/core/Allocator.h>
#include <c10/util/irange.h>
#include <array>
namespace c10::CachingDeviceAllocator {
// Bookkeeping for a single tracked quantity: its live value, its high-water
// mark, and the running totals added to / removed from it.
struct Stat {
  // Live value: everything added by increase() minus everything removed by
  // decrease().
  int64_t current = 0;
  // Largest value `current` has reached since construction or since the last
  // reset_peak().
  int64_t peak = 0;
  // Sum of all amounts passed to increase() since construction or since the
  // last reset_accumulated().
  int64_t allocated = 0;
  // Sum of all amounts passed to decrease() since construction or since the
  // last reset_accumulated().
  int64_t freed = 0;

  // Adds `amount` to the live value and the accumulated total, raising the
  // peak when the new live value exceeds it.
  void increase(size_t amount) {
    const auto delta = static_cast<int64_t>(amount);
    current += delta;
    if (current > peak) {
      peak = current;
    }
    allocated += delta;
  }

  // Removes `amount` from the live value and records it as freed. The live
  // value is checked before `freed` is updated; the check is only active in
  // builds where TORCH_INTERNAL_ASSERT_DEBUG_ONLY is enabled.
  void decrease(size_t amount) {
    const auto delta = static_cast<int64_t>(amount);
    current -= delta;
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
        current >= 0,
        "Negative tracked stat in device allocator (likely logic error).");
    freed += delta;
  }

  // Clears the accumulated totals; `current` and `peak` are left untouched.
  void reset_accumulated() {
    freed = 0;
    allocated = 0;
  }

  // Restarts peak tracking from the present live value.
  void reset_peak() {
    peak = current;
  }
};
// Categories a statistic can be recorded under. The enumerators take the
// consecutive values 0, 1, 2, ... in declaration order.
enum class StatType : uint64_t {
  AGGREGATE,
  SMALL_POOL,
  LARGE_POOL,
  // Count of the stat types listed above. Keep this as the last enumerator so
  // that its value is updated automatically whenever a new stat type is added.
  NUM_TYPES
};
using StatArray = std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)>;
using StatTypes = std::array<bool, static_cast<size_t>(StatType::NUM_TYPES)>;
// Invokes `f(index)` for each index of `stat_types` whose entry is true.
// Indices are visited in the order produced by
// c10::irange(stat_types.size()); entries that are false are skipped.
template <typename Func>
void for_each_selected_stat_type(const StatTypes& stat_types, Func f) {
  for (const auto type_index : c10::irange(stat_types.size())) {
    if (!stat_types[type_index]) {
      continue;
    }
    f(type_index);
  }
}
// Summary statistics of the memory allocator for one device.
//
// Each field is tagged with what it measures:
//   COUNT - a number of objects or events,
//   SUM   - a number of bytes,
//   SIZE  - a size setting.
struct DeviceStats {
  // COUNT: allocations requested by client code.
  StatArray allocation;
  // COUNT: segments obtained from device memory allocation.
  StatArray segment;
  // COUNT: active memory blocks (allocated or used by stream).
  StatArray active;
  // COUNT: inactive, split memory blocks (unallocated, but not releasable via
  // device memory deallocation).
  StatArray inactive_split;

  // SUM: bytes allocated by this memory allocator.
  StatArray allocated_bytes;
  // SUM: bytes reserved by this memory allocator (both free and used).
  StatArray reserved_bytes;
  // SUM: bytes within active memory blocks.
  StatArray active_bytes;
  // SUM: bytes within inactive, split memory blocks.
  StatArray inactive_split_bytes;
  // SUM: bytes requested by client code.
  StatArray requested_bytes;

  // COUNT: total failed calls to device malloc that necessitated cache
  // flushes.
  int64_t num_alloc_retries{0};
  // COUNT: total OOMs (i.e. failed calls to device memory allocation after a
  // cache flush).
  int64_t num_ooms{0};

  // COUNT: total oversize blocks allocated from pool.
  Stat oversize_allocations;
  // COUNT: total oversize blocks requiring malloc.
  Stat oversize_segments;

  // COUNT: total synchronize_and_free_events() calls.
  int64_t num_sync_all_streams{0};
  // COUNT: total device memory allocation calls, covering both mapped and
  // malloced memory.
  int64_t num_device_alloc{0};
  // COUNT: total device memory deallocation calls, covering both un-mapped
  // and freed memory.
  int64_t num_device_free{0};

  // SIZE: maximum block size that is allowed to be split.
  int64_t max_split_size{0};
};
// Renders a byte count as human-readable text.
//
// Sizes up to and including 1024 are printed as a plain integer followed by
// " bytes". Larger sizes are scaled to KiB, MiB or GiB and printed with two
// digits after the decimal point. Each unit's upper bound is inclusive, so
// exactly 1048576 is reported as "1024.00 KiB" and exactly 1073741824 as
// "1024.00 MiB".
inline std::string format_size(uint64_t size) {
  constexpr uint64_t kKiB = 1024;
  constexpr uint64_t kMiB = 1024 * kKiB;
  constexpr uint64_t kGiB = 1024 * kMiB;

  std::ostringstream out;
  out << std::fixed;
  out.precision(2);

  // Small sizes are emitted verbatim as an integer; no scaling is applied.
  if (size <= kKiB) {
    out << size << " bytes";
    return out.str();
  }

  // Pick the unit whose (inclusive) upper bound the size does not exceed,
  // falling back to GiB for everything larger.
  uint64_t divisor = kGiB;
  const char* suffix = " GiB";
  if (size <= kMiB) {
    divisor = kKiB;
    suffix = " KiB";
  } else if (size <= kGiB) {
    divisor = kMiB;
    suffix = " MiB";
  }

  out << static_cast<double>(size) / static_cast<double>(divisor) << suffix;
  return out.str();
}
} // namespace c10::CachingDeviceAllocator