[PyTorch] Fix write-after-free (TSAN) in GraphTask::set_error() (#33156)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33156

When dist_autograd_spawn_thrift's 'test_backward_node_failure_python_udf' test is
run, it hits a TSAN error caused by holding the GraphTask's mutex while the
underlying data structure is being deallocated.

In this change, we simply take a shared_ptr<> reference to the future and call
setError() on it without holding the lock, so the GraphTask (and its mutex) is
not deallocated underneath the lock.
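
For illustration, here is a minimal standalone sketch of the hazard and of the fix pattern, using made-up stand-in types (a toy Task for GraphTask, a toy Future whose completion callback frees the task, standing in for DistAutogradContext::resetGraphTask()); it is not PyTorch code:

// Minimal sketch, assuming stand-in types; not the real PyTorch API.
#include <functional>
#include <memory>
#include <mutex>
#include <string>

struct Future {
  void setError(std::string msg) {
    error = std::move(msg);
    completed = true;
    if (callback) callback();  // may delete the Task that owns this future
  }
  bool completed = false;
  std::string error;
  std::function<void()> callback;
};

struct Task {
  std::mutex mutex_;
  std::shared_ptr<Future> future_result_ = std::make_shared<Future>();

  void set_exception(const std::string& what) {
    std::unique_lock<std::mutex> lock(mutex_);
    // Copy the shared_ptr so the Future outlives this Task, then release the
    // lock *before* completing the future: the completion callback may free
    // this Task, and unlocking a destroyed mutex_ afterwards would be the
    // use-after-free that TSAN flags. Don't touch *this after setError().
    std::shared_ptr<Future> future_result = future_result_;
    lock.unlock();
    if (!future_result->completed) {
      future_result->setError(what);
    }
  }
};

int main() {
  auto* task = new Task();
  // Completing the future tears the task down, as resetGraphTask() can.
  task->future_result_->callback = [&] { delete task; };
  task->set_exception("boom");  // safe: mutex_ is no longer held here
}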
ghstack-source-id: 98303434

Test Plan: buck test mode/opt-tsan //caffe2/test/distributed/rpc:dist_autograd_spawn_thrift -- 'test_backward_node_failure_python_udf \(test_dist_autograd_spawn\.DistAutogradTestWithSpawn\)'

Differential Revision: D19821362

fbshipit-source-id: 82f735e33f8e608552418ae71592400fa3621e40

@@ -386,16 +386,22 @@ void Engine::thread_on_exception(
 void GraphTask::set_exception(
     std::exception& e,
     const std::shared_ptr<Node>& fn) {
-  std::lock_guard<std::mutex> lock(mutex_);
+  std::unique_lock<std::mutex> lock(mutex_);
   if (!has_error_.load()) {
     if (AnomalyMode::is_enabled() && fn) {
       fn->metadata()->print_stack();
     }
     has_error_ = true;
-    if (!future_result_->completed()) {
-      future_result_->setError(e.what());
+    // Careful: setting the future_result_ can trigger DistAutogradContext to
+    // resetGraphTask(), sometimes deleting this underlying GraphTask.
+    // Don't touch *this after setError() below, and release the lock early, to
+    // avoid unlocking this->mutex_ after setting the future.
+    std::shared_ptr<FutureVariableList> future_result = future_result_;
+    lock.unlock();
+    if (!future_result->completed()) {
+      future_result->setError(e.what());
     } else {
-      TORCH_INTERNAL_ASSERT(future_result_->hasError());
+      TORCH_INTERNAL_ASSERT(future_result->hasError());
     }
   }
 }