[RELAND] [CUDA graphs] Avoid sync errors when graph capturing cudnn rnn calls that use cudnn dropout (#57373)
Summary: https://github.com/pytorch/pytorch/pull/56433 was reverted because the memory leak check perceived the test's one-time initialization of DropoutState's long-lived internal buffer as a leak. This PR resubmits with the leak check skipped for the new test.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/57373
Reviewed By: anjali411
Differential Revision: D28152186
Pulled By: ezyang
fbshipit-source-id: 9a593fcdbbabbb09dc4e4221191663e94b697503
commit e841f335aa (parent 1b745efbe8)
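For context, the failure mode this change addresses can be condensed as follows. This is a sketch of the new test further down, using the private, experimental torch.cuda._Graph capture API that the test itself exercises; it is illustrative, not an official usage recipe:

import torch

# A cudnn LSTM with dropout > 0 lazily creates a shared DropoutState
# (noise buffer + usage event) the first time it runs.
model = torch.nn.LSTM(512, 512, 2, dropout=0.5).cuda()
x = torch.ones(100, 192, 512, device="cuda")

y = model(x)  # eager call: DropoutState's event is recorded on a non-capturing stream

g = torch.cuda._Graph()  # private capture API, as used in the test below
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    g.capture_begin()
    y = model(x)  # before this fix: the capturing stream would wait on that
                  # uncaptured event, a capture-time error
    g.capture_end()
torch.cuda.current_stream().wait_stream(s)

y = model(x)  # before this fix: a non-capturing stream would wait on an event
              # recorded during capture, also an error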
ATen/native/cudnn/RNN.cpp
@@ -2,6 +2,7 @@
 #include <ATen/Config.h>
 #include <ATen/cuda/CUDAConfig.h>
 #include <ATen/cuda/CUDAEvent.h>
+#include <ATen/cuda/CUDAGraphsUtils.cuh>
 #include <ATen/cuda/Exceptions.h>
 #include <ATen/InitialTensorOptions.h>
 #include <ATen/MatrixRef.h>
@@ -1373,6 +1374,30 @@ std::tuple<Tensor, Tensor> pack_hidden<std::tuple<Tensor, Tensor>>(const Tensor&
   return std::make_tuple(hx, cx);
 }
 
+/**
+ * Note [DropoutState and CUDA graph capture]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * (1) Telling a capturing stream to wait on an event recorded in a non-capturing stream is an error.
+ * (2) Telling a non-capturing stream to wait on an event recorded during capture is also an error.
+ *
+ * So DropoutState's usage syncs could error if an RNN with dropout is called in an uncaptured region
+ * then called in a captured region (triggering 1), or called in a captured region then called
+ * in an uncaptured region (triggering 2).
+ *
+ * To prevent 1 and 2, lock() only syncs on the last usage event if it was recorded in the same
+ * capture state as the current state (which also means the same graph, if capture is in progress).
+ *
+ * The solution should be safe as long as capture obeys the following restrictions:
+ *  - Only one capture may be underway at a time in a given process.
+ *  - While a capture is underway, no calls to eager ops on noncapturing streams (on any thread)
+ *    may interleave with the captured ops.
+ *
+ * TODO: As people experiment with capture, keep an eye out for use cases that might need to
+ * relax those restrictions.
+ *
+ * See https://github.com/pytorch/pytorch/pull/56433 for more discussion.
+ */
+
 struct DropoutState {
   // Both buffer and event are lazily instantiated when a dropout state is needed
   // for the first time. Note that in this case needed != used, as we don't need
@@ -1380,6 +1405,12 @@ struct DropoutState {
   at::Tensor buffer;
   c10::optional<cuda::CUDAEvent> event;
   std::mutex mutex;
+#if CUDA_VERSION >= 11000
+  // cudaStreamGetCaptureInfo will never give back a capture id of 0, so 0 can serve
+  // as a sentinel value that capture was not underway.
+  cuda::CaptureId_t capture_id_last_lock = 0;
+  cuda::CaptureId_t capture_id_last_unlock = 0;
+#endif
 
   // Every time we use a dropout state, we need to synchronize with its event,
   // to make sure all previous uses finish running before this one starts. Once
@@ -1392,13 +1423,38 @@
     // could then define it before we get to unlock().
     mutex.lock();
     if (event) {
+#if CUDA_VERSION >= 11000
+      // See Note [DropoutState and CUDA graph capture]
+      cudaStreamCaptureStatus status;
+      AT_CUDA_CHECK(cudaStreamGetCaptureInfo(cuda::getCurrentCUDAStream(),
+                                             &status,
+                                             &capture_id_last_lock));
+      if (status == cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) {
+        capture_id_last_lock = 0;
+      }
+      if (capture_id_last_lock == capture_id_last_unlock) {
+        event->block(cuda::getCurrentCUDAStream());
+      }
+#else
       event->block(cuda::getCurrentCUDAStream());
+#endif
     }
   }
 
   void unlock() {
     if (event) {
       event->record();
+#if CUDA_VERSION >= 11000
+      // See Note [DropoutState and CUDA graph capture]
+      cudaStreamCaptureStatus status;
+      AT_CUDA_CHECK(cudaStreamGetCaptureInfo(cuda::getCurrentCUDAStream(),
+                                             &status,
+                                             &capture_id_last_unlock));
+      if (status == cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) {
+        capture_id_last_unlock = 0;
+      }
+      TORCH_INTERNAL_ASSERT(capture_id_last_unlock == capture_id_last_lock);
+#endif
     }
     mutex.unlock();
  }
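To make the branching in lock()/unlock() above concrete: lock() compares the capture id it observes now against the capture id recorded at the previous unlock(), and blocks on the usage event only when they match. A pure-Python mirror of that decision (a schematic of mine, not PyTorch code; 0 is the "not capturing" sentinel from the struct above):

def sync_on_last_usage_event(capture_id_at_lock, capture_id_at_prev_unlock):
    # Sync only if this lock() and the previous unlock() happened in the same
    # capture state (and hence, if capturing, in the same graph).
    return capture_id_at_lock == capture_id_at_prev_unlock

assert sync_on_last_usage_event(0, 0)       # eager -> eager: sync as before
assert not sync_on_last_usage_event(42, 0)  # captured after eager: skip sync, avoids error (1)
assert not sync_on_last_usage_event(0, 42)  # eager after captured: skip sync, avoids error (2)
assert sync_on_last_usage_event(42, 42)     # same capture: sync is legal and required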
test/test_cuda.py
@@ -25,7 +25,7 @@ from torch.testing._internal.common_methods_invocations import tri_tests_args, t
     _compare_trilu_indices, _compare_large_trilu_indices
 from torch.testing._internal.common_utils import TestCase, freeze_rng_state, run_tests, \
     NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_REMOTE_GPU, IS_SANDCASTLE, IS_WINDOWS, \
-    slowTest, skipCUDANonDefaultStreamIf, TEST_WITH_ROCM, TEST_NUMPY
+    slowTest, skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, TEST_WITH_ROCM, TEST_NUMPY
 from torch.testing._internal.autocast_test_lists import AutocastTestLists
 
 # load_tests from common_utils is used to automatically filter tests for
@@ -3457,6 +3457,33 @@ torch.cuda.synchronize()
         # dummy allocation triggers process_events, Hopefully successfully processes b's end-of-life event.
         c = torch.zeros((3,), device="cuda")
 
+    @unittest.skipIf((not TEST_CUDA) or
+                     TEST_WITH_ROCM or
+                     int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")
+    # If this test is the first in the process to try cudnn rnns with dropout, it'll initialize
+    # DropoutState's long-lived internal buffer. Calling code perceives this (correct) behavior
+    # as a memory leak unless we skip the leak check.
+    @skipCUDAMemoryLeakCheckIf(True)
+    def test_graph_cudnn_dropout(self):
+        # Tests the interaction of cuda graph capture with DropoutState's syncs in ATen/native/cudnn/RNN.cpp.
+        # In particular, if user runs a sequence of captured and noncaptured cudnn rnns, DropoutState should
+        # avoid syncing noncapturing streams with captured events or vice versa.
+        model = torch.nn.LSTM(512, 512, 2, dropout=0.5).cuda()
+        x = torch.ones(100, 192, 512, device="cuda")
+
+        y = model(x)
+
+        g = torch.cuda._Graph()
+        s = torch.cuda.Stream()
+        s.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(s):
+            g.capture_begin()
+            y = model(x)
+            g.capture_end()
+        torch.cuda.current_stream().wait_stream(s)
+
+        y = model(x)
+
     @unittest.skipIf((not TEST_CUDA) or
                      TEST_WITH_ROCM or
                      int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs")