mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
[PyTorch] Fix write-after-free (TSAN) in GraphTask::set_error() (#33156)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/33156 When dist_autograd_spawn_thrift's 'test_backward_node_failure_python_udf' test is run, it was encountering a TSAN error related to holding the mutex while the underlying datastructure was being dealloced. In this change, we simply get a shared_ptr<> reference to the future, and set_exception() without having the lock held, to avoid deallocing underneath the lock. ghstack-source-id: 98303434 Test Plan: buck test mode/opt-tsan //caffe2/test/distributed/rpc:dist_autograd_spawn_thrift -- 'test_backward_node_failure_python_udf \(test_dist_autograd_spawn\.DistAutogradTestWithSpawn\)' Differential Revision: D19821362 fbshipit-source-id: 82f735e33f8e608552418ae71592400fa3621e40
This commit is contained in:
parent
0c98939b7b
commit
1b2d2ba504
|
|
@ -386,16 +386,22 @@ void Engine::thread_on_exception(
|
|||
void GraphTask::set_exception(
|
||||
std::exception& e,
|
||||
const std::shared_ptr<Node>& fn) {
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
if (!has_error_.load()) {
|
||||
if (AnomalyMode::is_enabled() && fn) {
|
||||
fn->metadata()->print_stack();
|
||||
}
|
||||
has_error_ = true;
|
||||
if (!future_result_->completed()) {
|
||||
future_result_->setError(e.what());
|
||||
// Careful: setting the future_result_ can trigger DistAutogradContext to
|
||||
// resetGraphTask(), sometimes deleting this underlying GraphTask.
|
||||
// Don't touch *this after setError() below, and release the lock early, to
|
||||
// avoid unlocking this->mutex_ after setting the future.
|
||||
std::shared_ptr<FutureVariableList> future_result = future_result_;
|
||||
lock.unlock();
|
||||
if (!future_result->completed()) {
|
||||
future_result->setError(e.what());
|
||||
} else {
|
||||
TORCH_INTERNAL_ASSERT(future_result_->hasError());
|
||||
TORCH_INTERNAL_ASSERT(future_result->hasError());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user