diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 25058f87264..48413e7a6f3 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -1260,9 +1260,6 @@ class DeviceCachingAllocator {
   // thread local compile context for each device
   static thread_local std::stack<std::string> compile_context;
 
-  // thread local user metadata for annotating allocations
-  static thread_local std::string user_metadata;
-
  public:
   // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
   explicit DeviceCachingAllocator(c10::DeviceIndex id)
@@ -1305,14 +1302,6 @@ class DeviceCachingAllocator {
     }
   }
 
-  void setUserMetadata(const std::string& metadata) {
-    user_metadata = metadata;
-  }
-
-  std::string getUserMetadata() {
-    return user_metadata;
-  }
-
   bool checkPoolLiveAllocations(
       MempoolId_t mempool_id,
       const std::unordered_set<void*>& expected_live_allocations) const {
@@ -3693,8 +3682,7 @@ class DeviceCachingAllocator {
         mempool_id,
         getApproximateTime(),
         record_context_ >= RecordContext::ALLOC ? std::move(context) : nullptr,
-        compile_string,
-        user_metadata);
+        compile_string);
 
     // Callbacks should not include any Pytorch call
     for (const auto& cb : trace_trackers_) {
@@ -3749,7 +3737,6 @@ static void uncached_delete(void* ptr) {
 static void local_raw_delete(void* ptr);
 
 thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
-thread_local std::string DeviceCachingAllocator::user_metadata;
 #ifdef __cpp_lib_hardware_interference_size
 using std::hardware_destructive_interference_size;
 #else
@@ -3947,18 +3934,6 @@ class NativeCachingAllocator : public CUDAAllocator {
     device_allocator[device]->popCompileContext();
   }
 
-  void setUserMetadata(const std::string& metadata) override {
-    c10::DeviceIndex device = 0;
-    C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
-    device_allocator[device]->setUserMetadata(metadata);
-  }
-
-  std::string getUserMetadata() override {
-    c10::DeviceIndex device = 0;
-    C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
-    return device_allocator[device]->getUserMetadata();
-  }
-
   bool isHistoryEnabled() override {
     c10::DeviceIndex device = 0;
     C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
diff --git a/c10/cuda/CUDACachingAllocator.h b/c10/cuda/CUDACachingAllocator.h
index fbe5dab18e0..89274c9f994 100644
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@@ -118,8 +118,7 @@ struct TraceEntry {
       MempoolId_t mempool,
       approx_time_t time,
       std::shared_ptr<GatheredContext> context = nullptr,
-      std::string compile_context = "",
-      std::string user_metadata = "")
+      std::string compile_context = "")
       : action_(action),
         device_(device),
         addr_(addr),
@@ -127,8 +126,7 @@ struct TraceEntry {
         stream_(stream),
         size_(size),
         mempool_(std::move(mempool)),
-        compile_context_(std::move(compile_context)),
-        user_metadata_(std::move(user_metadata)) {
+        compile_context_(std::move(compile_context)) {
     time_.approx_t_ = time;
   }
   Action action_;
@@ -140,7 +138,6 @@ struct TraceEntry {
   MempoolId_t mempool_;
   trace_time_ time_{};
   std::string compile_context_;
-  std::string user_metadata_;
 };
 
 // Calls made by record_function will save annotations
@@ -300,10 +297,6 @@ class CUDAAllocator : public DeviceAllocator {
       const std::vector<std::pair<std::string, std::string>>& /*md*/) {}
   virtual void pushCompileContext(std::string& md) {}
   virtual void popCompileContext() {}
-  virtual void setUserMetadata(const std::string& metadata) {}
-  virtual std::string getUserMetadata() {
-    return "";
-  }
   virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
 
   // Attached AllocatorTraceTracker callbacks will be called while the
@@ -543,14 +536,6 @@ inline void enablePeerAccess(
   get()->enablePeerAccess(dev, dev_to_access);
 }
 
-inline void setUserMetadata(const std::string& metadata) {
-  get()->setUserMetadata(metadata);
-}
-
-inline std::string getUserMetadata() {
-  return get()->getUserMetadata();
-}
-
 } // namespace c10::cuda::CUDACachingAllocator
 
 namespace c10::cuda {
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 05302ad9766..667bccd82c2 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -4378,28 +4378,6 @@ class TestCudaMallocAsync(TestCase):
         finally:
             torch.cuda.memory._record_memory_history(None)
 
-    @unittest.skipIf(
-        TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync"
-    )
-    @requiresCppContext
-    def test_memory_plots_metadata(self):
-        for context in ["alloc", "all", "state"]:
-            try:
-                torch._C._cuda_clearCublasWorkspaces()
-                torch.cuda.memory.empty_cache()
-                torch.cuda.memory._set_memory_metadata("metadata test")
-                torch.cuda.memory._record_memory_history(context="all")
-                x = torch.rand(3, 4, device="cuda")
-                del x
-                torch.cuda.memory.empty_cache()
-                torch.cuda.memory._set_memory_metadata("")
-
-                ss = torch.cuda.memory._snapshot()
-                for event in ss["device_traces"][0]:
-                    self.assertTrue(event["user_metadata"] == "metadata test")
-            finally:
-                torch.cuda.memory._record_memory_history(None)
-
     @unittest.skipIf(
         TEST_CUDAMALLOCASYNC, "setContextRecorder not supported by CUDAMallocAsync"
     )
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index b99fd3f2b80..244200216ec 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -2081,8 +2081,6 @@ def _cuda_hostMemoryStats() -> dict[str, Any]: ...
 def _cuda_resetAccumulatedHostMemoryStats() -> None: ...
 def _cuda_resetPeakHostMemoryStats() -> None: ...
 def _cuda_memorySnapshot(mempool_id: tuple[_int, _int] | None) -> dict[str, Any]: ...
-def _cuda_setMemoryMetadata(metadata: str) -> None: ...
-def _cuda_getMemoryMetadata() -> str: ...
 def _cuda_record_memory_history_legacy(
     enabled: _bool,
     record_context: _bool,
diff --git a/torch/csrc/cuda/Module.cpp b/torch/csrc/cuda/Module.cpp
index 32ade368098..0950192457d 100644
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@@ -765,7 +765,6 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) {
   py::str frames_s = "frames";
   py::str time_us_s = "time_us";
   py::str compile_context_s = "compile_context";
-  py::str user_metadata_s = "user_metadata";
 
   py::list empty_frames;
   std::vector<CapturedTraceback*> to_gather_frames;
@@ -883,7 +882,6 @@ PyObject* THCPModule_memorySnapshot(PyObject* _unused, PyObject* arg) {
       trace_entry[stream_s] = int64_t(te.stream_);
       trace_entry[time_us_s] = te.time_.t_;
       trace_entry[compile_context_s] = te.compile_context_;
-      trace_entry[user_metadata_s] = te.user_metadata_;
       trace.append(trace_entry);
     }
     traces.append(trace);
@@ -1139,14 +1137,6 @@ static void registerCudaDeviceProperties(PyObject* module) {
     return c10::cuda::CUDACachingAllocator::isHistoryEnabled();
   });
 
-  m.def("_cuda_setMemoryMetadata", [](const std::string& metadata) {
-    c10::cuda::CUDACachingAllocator::setUserMetadata(metadata);
-  });
-
-  m.def("_cuda_getMemoryMetadata", []() {
-    return c10::cuda::CUDACachingAllocator::getUserMetadata();
-  });
-
   m.def("_cuda_get_conv_benchmark_empty_cache", []() {
     return at::native::_cudnn_get_conv_benchmark_empty_cache();
   });
diff --git a/torch/csrc/cuda/memory_snapshot.cpp b/torch/csrc/cuda/memory_snapshot.cpp
index 830159d0a91..d4382aa8cb3 100644
--- a/torch/csrc/cuda/memory_snapshot.cpp
+++ b/torch/csrc/cuda/memory_snapshot.cpp
@@ -311,7 +311,6 @@ std::string _memory_snapshot_pickled() {
   IValue is_expandable_s = "is_expandable";
   IValue time_us_s = "time_us";
   IValue compile_contexts_s = "compile_context";
-  IValue user_metadata_s = "user_metadata";
 
   auto empty_frames = new_list();
 
@@ -429,7 +428,6 @@ std::string _memory_snapshot_pickled() {
       trace_entry.insert(size_s, (int64_t)te.size_);
      trace_entry.insert(stream_s, int64_t(te.stream_));
       trace_entry.insert(compile_contexts_s, te.compile_context_);
-      trace_entry.insert(user_metadata_s, te.user_metadata_);
       if (te.context_) {
         auto sc = getFromContext(te.context_);
         frame_tracebacks.push_back(sc);
diff --git a/torch/cuda/memory.py b/torch/cuda/memory.py
index e4b125eb425..5eeaf3a8253 100644
--- a/torch/cuda/memory.py
+++ b/torch/cuda/memory.py
@@ -1063,36 +1063,6 @@ def _dump_snapshot(filename="dump_snapshot.pickle"):
         pickle.dump(s, f)
 
 
-def _set_memory_metadata(metadata: str):
-    """
-    Set custom metadata that will be attached to all subsequent CUDA memory allocations.
-
-    This metadata will be recorded in the memory snapshot for all allocations made
-    after this call until the metadata is cleared or changed.
-
-    Args:
-        metadata (str): Custom metadata string to attach to allocations.
-            Pass an empty string to clear the metadata.
-
-    Example:
-        >>> torch.cuda.memory._set_memory_metadata("training_phase")
-        >>> # All allocations here will have "training_phase" metadata
-        >>> x = torch.randn(100, 100, device="cuda")
-        >>> torch.cuda.memory._set_memory_metadata("")  # Clear metadata
-    """
-    torch._C._cuda_setMemoryMetadata(metadata)
-
-
-def _get_memory_metadata() -> str:
-    """
-    Get the current custom metadata that is being attached to CUDA memory allocations.
-
-    Returns:
-        str: The current metadata string, or empty string if no metadata is set.
-    """
-    return torch._C._cuda_getMemoryMetadata()
-
-
 def _save_segment_usage(filename="output.svg", snapshot=None):
     if snapshot is None:
         snapshot = _snapshot()