#include <algorithm>
#include <atomic>
#include <cstdlib>
#include <string>
#include <unordered_map>

#include <c10/cuda/CUDACachingAllocator.h>
#include "cub/util_allocator.cuh"

// Needed to be included first to check the CAFFE2_USE_CUDNN macros.
#include "caffe2/core/macros.h"

#include "caffe2/core/asan.h"
#include "caffe2/core/blob_stats.h"
#ifdef CAFFE2_USE_CUDNN
#include "caffe2/core/common_cudnn.h"
#endif // CAFFE2_USE_CUDNN
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/string_utils.h"

C10_DEFINE_string(
    caffe2_cuda_memory_pool,
    "",
    "Sets the memory pool used by caffe2. Possible values are "
    "none, cnmem, thc and cub.");

// For a description of the CUB caching allocator configuration, see
// https://nvlabs.github.io/cub/structcub_1_1_caching_device_allocator.html
C10_DEFINE_int(
    caffe2_cub_bin_growth,
    8,
    "If using cub as the memory allocator, sets the growth of bins "
    "used by the cub pool.");
C10_DEFINE_int(
    caffe2_cub_min_bin,
    3,
    "If using cub as the memory allocator, sets the min number of "
    "bins.");
C10_DEFINE_int(
    caffe2_cub_max_bin,
    10,
    "If using cub as the memory allocator, sets the max number of "
    "bins.");
C10_DEFINE_int(
    caffe2_cub_max_managed_mb,
    10 * 1024,
    "If using cub as the memory allocator, sets the maximum amount "
    "of memory managed in megabytes.");

C10_DEFINE_bool(
    caffe2_cub_print_allocation_events,
    false,
    "If true, CachingDeviceAllocator will print allocation and deallocation "
    "events to stdout.");

C10_DEFINE_bool(
    caffe2_gpu_memory_tracking,
    false,
    "If set, logs changes in GPU memory allocations.");
C10_DEFINE_int(
    caffe2_gpu_memory_report_interval_mb,
    128,
    "The threshold in MB on how frequently to report memory changes");

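// These flags are parsed during caffe2's GlobalInit. As a minimal sketch
// (the binary name is hypothetical), the cub pool and its bin range can be
// selected on the command line:
//
//   ./my_caffe2_binary --caffe2_cuda_memory_pool=cub \
//       --caffe2_cub_bin_growth=8 --caffe2_cub_min_bin=3 \
//       --caffe2_cub_max_bin=10
//
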
namespace at {

REGISTER_CONTEXT(DeviceType::CUDA, caffe2::CUDAContext);
} // namespace at

namespace caffe2 {

// Generic implementation - with cudaMemcpyDefault, CUDA will figure out the
// right kind of copy for us.
void CUDAContext::CopyBytesAsync(
    size_t nbytes,
    const void* src,
    Device src_device,
    void* dst,
    Device dst_device) {
  // TODO: verify that CUDA handles device-to-device copies correctly
  // even without SetDevice()
  // TODO: verify whether the source or the destination device should take
  // priority in picking the stream
  // NB: right now the cross-device copy logic is invoked only in contexts
  // where the surrounding code explicitly manages data dependencies and sets
  // up events, so it's fine. To make this a proper standalone function,
  // synchronization between streams would be required.
  int gpu_id = 0;
  if (dst_device.type() == DeviceType::CUDA) {
    gpu_id = dst_device.index();
  } else if (src_device.type() == DeviceType::CUDA) {
    gpu_id = src_device.index();
  } else {
    LOG(FATAL) << "shouldn't be called with non-cuda device";
  }
  CUDA_ENFORCE(cudaMemcpyAsync(
      dst,
      src,
      nbytes,
      cudaMemcpyDefault,
      CUDAContext::getCudaObjects().GetStream(gpu_id)));
}

void CUDAContext::CopyBytesSync(
    size_t nbytes,
    const void* src,
    Device src_device,
    void* dst,
    Device dst_device) {
  // This emulates the original Caffe2 behavior, where a sync copy doesn't
  // change the device. It's probably better for clarity to switch to the
  // target device explicitly here, but in the worst case CUDA will sync
  // for us.
  // TODO: change it to CUDAGuard
  CUDAContext context(-1); // take current device
  CUDA_ENFORCE(cudaMemcpyAsync(
      dst, src, nbytes, cudaMemcpyDefault, context.cuda_stream()));
  // The destructor of the context synchronizes.
}

// For the CPU context, we also allow a (probably expensive) function
// to copy the data from a cuda context. Inside the function, we create
// a temporary CUDAContext object to carry out the copy. From the caller's
// side, these functions are synchronous with respect to the host, similar
// to a normal CPUContext::CopyBytes<CPUContext, CPUContext> call.
template <>
inline void CPUContext::CopyBytes<CUDAContext, CPUContext>(
    size_t nbytes,
    const void* src,
    void* dst) {
  CUDAContext context(GetGPUIDForPointer(src));
  context.CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
}
template <>
inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
    size_t nbytes,
    const void* src,
    void* dst) {
  CUDAContext context(GetGPUIDForPointer(dst));
  context.CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
}

} // namespace caffe2

namespace caffe2 {

ThreadLocalCUDAObjects& CUDAContext::getCudaObjects() {
  static thread_local ThreadLocalCUDAObjects cuda_objects_;
  return cuda_objects_;
}

// TODO(jiayq): these variables shouldn't currently be accessed during static
// initialization. We should consider moving them to a Meyers singleton to
// be totally safe against SIOF (the static initialization order fiasco).

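// As a minimal sketch of that refactor (the accessor name below is
// hypothetical), each global would move behind a function-local static,
// which is initialized on first use and is therefore immune to SIOF:
//
//   static CudaMemoryPoolType& cudaMemoryPoolType() {
//     static CudaMemoryPoolType pool_type = CudaMemoryPoolType::NONE;
//     return pool_type;
//   }
//
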
// Static global variables for setting up the memory pool.
CudaMemoryPoolType g_cuda_memory_pool_type;

std::unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;

// An unordered map that holds the map from the cuda memory pointer to the
// device id that it is allocated from. This is used in the cuda memory pool
// cases, where we need the device id to carry out the deletion.
// Note(jiayq): an alternative approach is to use cudaGetPointerAttributes,
// but that is usually quite slow. We might want to benchmark the speed
// difference, though.
// Note(jiayq): another alternative approach is to augment the Tensor class
// to record the device id. However, that would not cover non-tensor
// allocations and deallocations.
// Ideally, a memory pool should already have the device id information, as
// long as we are using UVA (available as of CUDA 5 and later), so that the
// addresses are unique.
static std::unordered_map<void*, uint8_t> g_cuda_device_affiliation;

// Data structures for optional memory tracking. Access to these structures
// is guarded by CUDAContext::mutex.
static std::unordered_map<void*, long> g_size_map;
static std::vector<long> g_total_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);
static std::vector<long> g_max_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);

static long g_total_mem = 0;
static long g_last_rep = 0;

CudaMemoryPoolType GetCudaMemoryPoolType() {
  return g_cuda_memory_pool_type;
}

///////////////////////////////////////////////////////////////////////////////
// A wrapper that allows us to lazily initialize all CUDA environments that
// Caffe2 uses. This gets done the first time caffe2::CUDAContext::New() is
// called, which is probably the decisive indication that this caffe2 run is
// going to use GPUs. We deliberately do not tie CUDA initialization to the
// core/init.h machinery, so that we have minimal resource impact in case we
// need to run multiple caffe2 instances on a GPU machine.
///////////////////////////////////////////////////////////////////////////////

static void Caffe2InitializeCuda() {
  // If the current run does not have any cuda devices, do nothing.
  if (!HasCudaGPU()) {
    VLOG(1) << "No cuda gpu present. Skipping.";
    return;
  }
  C10_LOG_API_USAGE_ONCE("caffe2.init.cuda");
  // Check if the number of GPUs matches the expected compile-time max number
  // of GPUs.
  CAFFE_ENFORCE_LE(
      NumCudaDevices(),
      C10_COMPILE_TIME_MAX_GPUS,
      "Number of CUDA devices on the machine is larger than the compiled "
      "max number of gpus expected (",
      C10_COMPILE_TIME_MAX_GPUS,
      "). Increase that and recompile.");

  for (DeviceIndex i = 0; i < NumCudaDevices(); ++i) {
    CUDAGuard g(i);
    // Enable peer access.
    const int peer_group = i / CAFFE2_CUDA_MAX_PEER_SIZE;
    const int peer_start = peer_group * CAFFE2_CUDA_MAX_PEER_SIZE;
    const int peer_end = std::min(
        NumCudaDevices(), (peer_group + 1) * CAFFE2_CUDA_MAX_PEER_SIZE);
    VLOG(1) << "Enabling peer access within group #" << peer_group
            << ", from gpuid " << peer_start << " to " << peer_end - 1
            << ", for gpuid " << i << ".";

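    // As a worked example (assuming CAFFE2_CUDA_MAX_PEER_SIZE is 8): on a
    // 16-GPU machine, gpu 5 lands in peer_group 0 and enables access to
    // gpus 0..7, while gpu 12 lands in peer_group 1 and enables access to
    // gpus 8..15. Grouping caps the number of peer mappings per device.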
    for (int j = peer_start; j < peer_end; ++j) {
      if (i == j) continue;
      int can_access;
      CUDA_ENFORCE(cudaDeviceCanAccessPeer(&can_access, i, j));
      if (can_access) {
        VLOG(1) << "Enabling peer access from " << i << " to " << j;
        // Note: just for future reference, the 0 here is not a gpu id, it is
        // a reserved flag for cudaDeviceEnablePeerAccess that should always
        // be zero currently.
        // It is ok if peer access is already enabled...
        cudaError_t err = cudaDeviceEnablePeerAccess(j, 0);
        if ((err != cudaErrorPeerAccessAlreadyEnabled) &&
            (err != cudaSuccess)) {
          CAFFE_THROW(cudaGetErrorString(err));
        }
        cudaGetLastError(); // reset cuda error code
      }
    }
  }

#ifdef CAFFE2_USE_CUDNN
  // Check that the cuDNN version we compiled against is compatible with the
  // one we linked against.
  CheckCuDNNVersions();
#endif // CAFFE2_USE_CUDNN
}

static void SetUpCub() {
  VLOG(1) << "Setting up cub memory pool.";
  // Sets up the cub memory pool.
  try {
    g_cub_allocator.reset(new cub::CachingDeviceAllocator(
        FLAGS_caffe2_cub_bin_growth,
        FLAGS_caffe2_cub_min_bin,
        FLAGS_caffe2_cub_max_bin,
        size_t(FLAGS_caffe2_cub_max_managed_mb) * 1024L * 1024L,
        false,
        FLAGS_caffe2_cub_print_allocation_events));
  } catch (...) {
    CAFFE_THROW("Some error happened at cub initialization.");
  }
  VLOG(1) << "Done setting up cub memory pool.";
}

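// As a worked example of the defaults above (see the CUB documentation link
// near the flag definitions): bin sizes are powers of bin_growth, so with
// growth 8 the smallest cached bin is 8^3 = 512 bytes and the largest is
// 8^10 = 1 GiB; larger requests bypass the cache. The 10 * 1024 MB default
// caps the pool at 10 GiB of managed memory.
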
static void Caffe2SetCUDAMemoryPool() {
  if (FLAGS_caffe2_cuda_memory_pool == "" ||
      FLAGS_caffe2_cuda_memory_pool == "none") {
    g_cuda_memory_pool_type = CudaMemoryPoolType::NONE;
  } else if (FLAGS_caffe2_cuda_memory_pool == "cnmem") {
    CAFFE_THROW(
        "CNMEM is no longer used by Caffe2. Use cub instead. "
        "This error message may go away in the future.");
  } else if (FLAGS_caffe2_cuda_memory_pool == "cub") {
    // Sets up cub.
    g_cuda_memory_pool_type = CudaMemoryPoolType::CUB;
    SetUpCub();
  } else if (FLAGS_caffe2_cuda_memory_pool == "thc") {
    g_cuda_memory_pool_type = CudaMemoryPoolType::THC;
  } else {
    CAFFE_THROW(
        "Unrecognized cuda memory pool type: ", FLAGS_caffe2_cuda_memory_pool);
  }
}

/**
 * An allocator that does the CPU memory allocation with pinned memory.
 *
 * This is needed because if we want to do any asynchronous cuda memcpy,
 * the underlying CPU memory also needs to be allocated in pinned memory
 * space. As a result, whenever Caffe2 is built with GPU support and there is
 * a GPU present at runtime, at global initialization time we will set the
 * CPU memory allocator to allocate pinned memory.
 *
 * NB: This behavior is probably too aggressive. We should consider asking
 * users to do on-demand memory pinning (as exposed in the PyTorch APIs)
 * instead.
 */
struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
  PinnedCPUAllocator() {
    baseAllocator_ = GetDefaultCPUAllocator();
  }
  ~PinnedCPUAllocator() override {}
  at::DataPtr allocate(size_t nbytes) const override {
    if (nbytes == 0) {
      // Replicate c10::alloc_cpu behavior - return nullptr.
      return {nullptr, nullptr, &Delete, at::Device(CPU)};
    }
    void* data;
    at::DataPtr data_ptr;
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    if (IsNUMAEnabled()) {
      at::DeleterFnPtr expected_deleter = baseAllocator_->raw_deleter();
      data_ptr = baseAllocator_->allocate(nbytes);
      data = data_ptr.get();
      CAFFE_ENFORCE(data);
      CUDA_ENFORCE(cudaHostRegister(data, nbytes, cudaHostRegisterDefault));
      CAFFE_ENFORCE(
          data_ptr.compare_exchange_deleter(expected_deleter, &Delete),
          "Failed to swap deleter (already swapped?)");
    } else {
      CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
      data_ptr = {data, data, &Delete, at::Device(CPU)};
    }
    memset(data, 0, nbytes);
    return data_ptr;
  }

  at::DeleterFnPtr raw_deleter() const override {
    return &Delete;
  }

 private:
  static void Delete(void* data) {
    if (!data) {
      return;
    }
    // Caffe2 uses a lazy way to figure out if one is actually going to use
    // GPUs or not. If a CUDAContext::New() call is made, inside the
    // CUDAContext function we will switch the cpu side allocator to a
    // PinnedCPUAllocator. But if one calls CPUContext::New() before any cuda
    // allocations, PinnedCPUAllocator can still delete the corresponding
    // memory.
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    if (IsNUMAEnabled()) {
      CUDA_ENFORCE(cudaHostUnregister(data));
      GetDefaultCPUAllocator()->raw_deleter()(data);
    } else {
      cudaError_t err = cudaFreeHost(data);
      if (err == cudaErrorInvalidValue) {
        free(data);
        // Calling cudaGetLastError resets the cuda error; we deliberately
        // ignore the returned code here.
        (void)cudaGetLastError();
      } else {
        // For all other errors, still do a cuda check.
        CUDA_ENFORCE(err);
      }
    }
  }

  at::Allocator* baseAllocator_;
};

static PinnedCPUAllocator g_pinned_cpu_alloc;

// An initialization function that sets the CPU side to use the pinned cpu
// allocator.
void Caffe2UsePinnedCPUAllocator() {
#if CAFFE2_ASAN_ENABLED
  // Note(jiayq): for more details, see
  // https://github.com/google/sanitizers/issues/629
  LOG(WARNING) << "There are known issues between address sanitizer and "
                  "cudaMallocHost. As a result, caffe2 will not enable pinned "
                  "memory allocation in asan mode. If you are expecting any "
                  "behavior that depends on asan, be advised that it is not "
                  "turned on.";
#else
  if (!HasCudaGPU()) {
    VLOG(1) << "No GPU present. I won't use pinned allocator then.";
    return;
  }
  VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
  SetCPUAllocator(&g_pinned_cpu_alloc);
#endif
}

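// Once the pinned allocator is installed, ordinary CPU allocations are
// page-locked. A minimal caller-side sketch (standard caffe2 tensor API;
// not part of this file):
//
//   caffe2::Tensor cpu_tensor(caffe2::CPU);
//   cpu_tensor.Resize(1024);
//   float* host = cpu_tensor.mutable_data<float>();
//   // `host` is pinned, so cudaMemcpyAsync to the GPU can overlap compute.
//
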
// Caffe2CudaInitializerHelper is a minimal struct whose sole purpose is to
// detect the first hint that this Caffe2 run is going to use GPUs: either
// CUDAContext is initialized or CUDAContext::New is called. It then runs
// all the related cuda initialization functions.
namespace {
struct Caffe2CudaInitializerHelper {
  Caffe2CudaInitializerHelper() {
    // We cannot use bool because nvcc changes bool to __nv_bool, which does
    // not have a std::atomic instantiation.
    static std::atomic<char> first_call(1);
    if (first_call.fetch_and((char)0)) {
      Caffe2InitializeCuda();
      Caffe2SetCUDAMemoryPool();
      Caffe2UsePinnedCPUAllocator();
    }
  }
};
} // namespace

/**
 * A utility function to rectify the gpu id. If the context specifies the
 * gpu id to be -1, it means that we will just use the current gpu id when
 * the function is called.
 */
static inline DeviceIndex RectifyGPUID(DeviceIndex gpu_id) {
  return gpu_id == -1 ? CaffeCudaGetDevice() : gpu_id;
}

CUDAContext::CUDAContext(DeviceIndex gpu_id)
    : gpu_id_(RectifyGPUID(gpu_id)), random_seed_(RandomNumberSeed()) {
  static Caffe2CudaInitializerHelper g_cuda_initializer_;
}

CUDAContext::CUDAContext(const DeviceOption& option)
    : gpu_id_(
          option.has_device_id() ? RectifyGPUID(option.device_id())
                                 : CaffeCudaGetDevice()),
      random_seed_(
          option.has_random_seed() ? option.random_seed()
                                   : RandomNumberSeed()) {
  static Caffe2CudaInitializerHelper g_cuda_initializer_;
  DCHECK_EQ(option.device_type(), PROTO_CUDA);
}

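// A minimal sketch of the DeviceOption path (field values illustrative):
//
//   caffe2::DeviceOption option;
//   option.set_device_type(caffe2::PROTO_CUDA);
//   option.set_device_id(0);
//   caffe2::CUDAContext context(option); // also runs the one-time CUDA init
//
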
// Shared mutex to lock out alloc / free during NCCL launches.
std::mutex& CUDAContext::mutex() {
  static std::mutex m;
  return m;
}

std::vector<long> CUDAContext::TotalMemoryByGpu() {
  std::lock_guard<std::mutex> lock(CUDAContext::mutex());
  CAFFE_ENFORCE(
      FLAGS_caffe2_gpu_memory_tracking,
      "Pass --caffe2_gpu_memory_tracking to enable memory stats");
  return g_total_by_gpu_map;
}

std::vector<long> CUDAContext::MaxMemoryByGpu() {
  std::lock_guard<std::mutex> lock(CUDAContext::mutex());
  CAFFE_ENFORCE(
      FLAGS_caffe2_gpu_memory_tracking,
      "Pass --caffe2_gpu_memory_tracking to enable memory stats");
  return g_max_by_gpu_map;
}

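// A minimal caller-side sketch (the loop is an illustration, not part of
// this file): with --caffe2_gpu_memory_tracking passed, the counters below
// can be polled like so:
//
//   std::vector<long> total = caffe2::CUDAContext::TotalMemoryByGpu();
//   std::vector<long> peak = caffe2::CUDAContext::MaxMemoryByGpu();
//   for (size_t i = 0; i < total.size(); ++i) {
//     LOG(INFO) << "GPU " << i << ": " << total[i] / 1024 / 1024 << " MB"
//               << " (peak: " << peak[i] / 1024 / 1024 << " MB)";
//   }
//
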
namespace {
void TrackMemoryAlloc(size_t nbytes) {
  int this_gpu = CaffeCudaGetDevice();
  g_total_by_gpu_map[this_gpu] += nbytes;
  g_max_by_gpu_map[this_gpu] =
      std::max(g_max_by_gpu_map[this_gpu], g_total_by_gpu_map[this_gpu]);
  g_total_mem += nbytes;
  if (g_total_mem - g_last_rep >
      FLAGS_caffe2_gpu_memory_report_interval_mb * 1024 * 1024) {
    for (size_t gpu = 0; gpu < g_total_by_gpu_map.size(); gpu++) {
      long t = g_total_by_gpu_map[gpu];
      long max_t = g_max_by_gpu_map[gpu];
      if (max_t > 0) {
        if (max_t != t) {
          VLOG(1) << "GPU " << gpu << ": " << t / 1024 / 1024 << " MB"
                  << " (max: " << max_t / 1024 / 1024 << " MB)";
        } else {
          VLOG(1) << "GPU " << gpu << ": " << t / 1024 / 1024 << " MB";
        }
      }
    }
    VLOG(1) << "Total: " << g_total_mem / 1024 / 1024 << " MB";
    g_last_rep = g_total_mem;
  }
}
} // namespace

struct DefaultCUDAAllocator final : public at::Allocator {
  DefaultCUDAAllocator() {}
  ~DefaultCUDAAllocator() override {}
  at::DataPtr allocate(size_t nbytes) const override {
    // Lock the mutex.
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    // A one-time caffe2 cuda initializer.
    static Caffe2CudaInitializerHelper g_cuda_initializer_;
    void* ptr = nullptr;

    if (FLAGS_caffe2_gpu_memory_tracking) {
      TrackMemoryAlloc(nbytes);
    }
    switch (g_cuda_memory_pool_type) {
      case CudaMemoryPoolType::NONE:
        if (nbytes != 0) {
          CUDA_ENFORCE(cudaMalloc(&ptr, nbytes));
        }
        if (FLAGS_caffe2_gpu_memory_tracking) {
          g_size_map[ptr] = nbytes;
          g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
        }
        return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
      case CudaMemoryPoolType::CUB:
        if (nbytes != 0) {
          CUDA_ENFORCE(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
        }
        g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
        VLOG(2) << "CUB allocating pointer " << ptr << " on device "
                << CaffeCudaGetDevice();
        if (FLAGS_caffe2_gpu_memory_tracking) {
          g_size_map[ptr] = nbytes;
        }
        return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
      case CudaMemoryPoolType::THC:
        {
          // The reason we have this stream guard here is to preserve
          // the historical behavior of the 'thc' allocator in Caffe2,
          // which is to put all allocations on the same (default)
          // stream. This behavior is morally wrong (since passing
          // allocations between streams allows for the possibility
          // of handing out some memory that an old stream
          // is still working on), but it doesn't seem to cause issues
          // in Caffe2 today. Our hypothesis for why this is the case
          // is that Caffe2 doesn't really do very many allocations
          // on the fly; instead it allocates once and then reuses
          // the allocations for the whole program. In this case,
          // the hazard is avoided.
          //
          // We intend to remove this stream guard, but the benefit
          // of putting all allocations on the same stream is that it
          // reduces per-stream fragmentation, and this helps
          // some models that are currently running with the thc
          // allocator fit in memory. We will need to find some other
          // way of resolving this problem.
          cuda::CUDAStreamGuard g(
              Stream(Stream::DEFAULT, Device(kCUDA, CaffeCudaGetDevice())));
          ptr = cuda::CUDACachingAllocator::raw_alloc(nbytes);
        }
        if (FLAGS_caffe2_gpu_memory_tracking) {
          g_size_map[ptr] = nbytes;
          g_cuda_device_affiliation[ptr] = CaffeCudaGetDevice();
        }
        return {ptr, ptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
    }
    return {nullptr, nullptr, &Delete, at::Device(CUDA, CaffeCudaGetDevice())};
  }

  at::DeleterFnPtr raw_deleter() const override {
    return &Delete;
  }

 private:
  static void Delete(void* ptr) {
    // Lock the mutex.
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    if (FLAGS_caffe2_gpu_memory_tracking) {
      auto sz_it = g_size_map.find(ptr);
      DCHECK(sz_it != g_size_map.end());
      auto aff_it = g_cuda_device_affiliation.find(ptr);
      DCHECK(aff_it != g_cuda_device_affiliation.end());
      g_total_mem -= sz_it->second;
      g_total_by_gpu_map[aff_it->second] -= sz_it->second;
      g_size_map.erase(sz_it);
    }

    switch (g_cuda_memory_pool_type) {
      case CudaMemoryPoolType::NONE: {
        // If no memory pool is set up, use plain cudaFree.
        cudaError_t error = cudaFree(ptr);
        // For some reason, in the Python runtime we sometimes delete a data
        // pointer after the cuda runtime exits - this is odd but is probably
        // caused by a static workspace that pycaffe2 uses, whose destruction
        // got entangled in some race condition. Anyway, since the cuda
        // runtime is exiting anyway, we do not need to worry about memory
        // leaks, so we basically ignore it. This is definitely not ideal but
        // works for now.
        if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
          LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
                     << cudaGetErrorString(error);
        }

        if (FLAGS_caffe2_gpu_memory_tracking) {
          g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
        }

        break;
      }
      case CudaMemoryPoolType::CUB: {
        auto it = g_cuda_device_affiliation.find(ptr);
        DCHECK(it != g_cuda_device_affiliation.end());
        VLOG(2) << "CUB freeing pointer " << ptr << " on device " << it->second;
        CUDA_ENFORCE(g_cub_allocator->DeviceFree(it->second, ptr));
        g_cuda_device_affiliation.erase(it);
        break;
      }
      case CudaMemoryPoolType::THC: {
        cuda::CUDACachingAllocator::raw_delete(ptr);
        if (FLAGS_caffe2_gpu_memory_tracking) {
          g_cuda_device_affiliation.erase(g_cuda_device_affiliation.find(ptr));
        }
        break;
      }
    }
  }
};

static DefaultCUDAAllocator g_cuda_alloc;
REGISTER_ALLOCATOR(CUDA, &g_cuda_alloc);

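// A minimal sketch of how the registration is consumed (caller-side code,
// not part of this file): the c10 allocator registry returns this allocator
// for the CUDA device type, so device memory can be obtained without naming
// DefaultCUDAAllocator directly:
//
//   at::Allocator* alloc = c10::GetAllocator(at::DeviceType::CUDA);
//   at::DataPtr buf = alloc->allocate(1024); // freed via Delete() when
//                                            // `buf` goes out of scope
//
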
} // namespace caffe2

namespace at {
REGISTER_COPY_BYTES_FUNCTION(
    DeviceType::CUDA,
    DeviceType::CUDA,
    caffe2::CUDAContext::CopyBytesSync,
    caffe2::CUDAContext::CopyBytesAsync);

REGISTER_COPY_BYTES_FUNCTION(
    DeviceType::CUDA,
    DeviceType::CPU,
    caffe2::CUDAContext::CopyBytesSync,
    caffe2::CUDAContext::CopyBytesAsync);

REGISTER_COPY_BYTES_FUNCTION(
    DeviceType::CPU,
    DeviceType::CUDA,
    caffe2::CUDAContext::CopyBytesSync,
    caffe2::CUDAContext::CopyBytesAsync);
} // namespace at