Turn on static cuda launcher in OSS (#151691)

After a few small test bugfixes (so that we throw and catch the same kinds of exceptions as Triton's own launcher), I think we're ready to flip the switch and turn StaticCudaLauncher on by default in OSS.
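
For anyone who wants to opt back out (or force the old Triton launcher while debugging), here is a minimal sketch of the knobs this PR touches. The env var and config names are taken from the diff below; treat the snippet as illustrative rather than official documentation:

```python
# Illustrative only: names come from this PR's diff.
# Option 1: set the env var before the inductor config module is imported, e.g.
#   TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER=0 python train.py   # train.py is a placeholder
# Option 2: flip the inductor config at runtime.
from torch._inductor import config as inductor_config

# Fall back to Triton's own launcher instead of StaticCudaLauncher.
inductor_config.use_static_cuda_launcher = False

# Or keep it on and raise instead of silently bypassing the static launcher
# (existing strict flag, also visible in the diff below).
inductor_config.strict_static_cuda_launcher = True
```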

The initial round of benchmarks looks good, with average compilation time going down by a few percent:
<img width="828" alt="image" src="https://github.com/user-attachments/assets/cad03e09-b4d6-49a7-a9e5-6068d1c0bd5c" />

With no changes to runtime perf:
<img width="823" alt="image" src="https://github.com/user-attachments/assets/3fcd435e-1057-43f4-878b-8d66a3812a10" />

There are a few noisy models I want to double-check, though, so I'll run some more tests before accepting review.

Full benchmark results, showing a ~5% compile time improvement across the board:
https://hud.pytorch.org/benchmark/huggingface/inductor_with_cudagraphs?dashboard=torchinductor&startTime=Wed%2C%2016%20Apr%202025%2002%3A31%3A12%20GMT&stopTime=Wed%2C%2023%20Apr%202025%2002%3A31%3A12%20GMT&granularity=hour&mode=training&dtype=amp&deviceName=cuda%20(a100)&lBranch=gh/jamesjwu/139/orig&lCommit=cc45c8667fa23dec16ca50002d9504a34688ca5c&rBranch=main&rCommit=2a9afdae81d0dde98e96d7e3c9ca840e241e5405
<img width="1482" alt="image" src="https://github.com/user-attachments/assets/6e6a7f39-7f44-459f-9845-9a37f084ea82" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151691
Approved by: https://github.com/oulgen, https://github.com/jansel, https://github.com/EikanWang
Author: James Wu (2025-04-24 13:19:03 -07:00), committed by PyTorch MergeBot
commit 0dae27d75b (parent c03359de2d)
6 changed files with 21 additions and 8 deletions

@@ -179,6 +179,9 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
         self.assertTrue(event_found)
 
     @unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
+    @config.patch(
+        "compile_threads", 1
+    )  # This test monkey patches global variables, which workers don't see
     def test_inductor_profiling_triton_hooks(self):
         from triton.compiler import CompiledKernel  # @manual

@@ -314,6 +314,8 @@ class AsyncCompile:
             counters["inductor"]["async_compile_cache_hit"] += 1
             # Set reload_kernel_from_src properly based on source_code
             if isinstance(future, StaticAutotunerFuture):
+                # Remove the future now that we've cache hit
+                CompiledTritonKernels.remove_future(source_code)
                 future.reload_kernel_from_src = reload_kernel_in_parent
             if is_parallel:
                 return future

@@ -39,6 +39,14 @@ def bundle_triton_into_fx_graph_cache_default() -> Optional[bool]:
     )
 
 
+def static_cuda_launcher_default() -> bool:
+    result = get_tristate_env(
+        "TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER", True if not is_fbcode() else False
+    )
+    assert result is not None
+    return result
+
+
 def prologue_fusion_enabled() -> bool:
     ENABLE_PROLOGUE_FUSION_VERSION = 0

@@ -734,9 +742,7 @@ compile_threads: Optional[int] = None if is_fbcode() else decide_compile_threads
 # Whether or not to enable statically launching CUDA kernels
 # compiled by triton (instead of using triton's own launcher)
-use_static_cuda_launcher: bool = (
-    os.environ.get("TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER", "0") == "1"
-)
+use_static_cuda_launcher: bool = static_cuda_launcher_default()
 
 # Raise error if we bypass the launcher
 strict_static_cuda_launcher: bool = (

@@ -90,7 +90,6 @@ class StaticallyLaunchedCudaKernel:
         assert hasattr(self, "cubin_path")
         assert self.cubin_path is not None
         (self.function, self.n_regs, self.n_spills) = _StaticCudaLauncher._load_kernel(
             self.cubin_path, self.name, self.shared, device
         )

@@ -204,7 +203,6 @@ class StaticallyLaunchedCudaKernel:
         # TODO: can handle grid functions here or in C++, so
         # that we don't need the grid handler above.
         _StaticCudaLauncher._launch_kernel(
             self.function,
             grid_x,

@@ -478,7 +478,7 @@ class CachingAutotuner(KernelInterface):
             try:
                 launchers.append(result.make_launcher())
-            except (OutOfResources, PTXASError) as e:
+            except (OutOfResources, PTXASError, torch.cuda.OutOfMemoryError) as e:
                 exc = e
         if len(launchers) == 0:
             raise RuntimeError(f"No valid triton configs. {type(exc).__name__}: {exc}")

@@ -1274,7 +1274,10 @@ class StaticTritonCompileResult(CompileResult[StaticallyLaunchedCudaKernel]):
         # Load the binary on the parent
         if not self.kernel.cubin_path:
             self.reload_cubin_path()
-        self.kernel.load_kernel(self.compile_meta.get("device", 0))
+        device = self.compile_meta.get("device", 0)
+        if device is None:
+            device = 0
+        self.kernel.load_kernel(device)
         scope = {
             "runner": self.kernel.run,
         }

@@ -112,7 +112,8 @@ CUfunction loadKernel(
    // we set maximum dynamic shared memory to the difference between
    // the static shared memory and total max shared memory allowed on the device.
    // This prevents us from setting shared memory above the maximum
-   TORCH_CHECK(
+   TORCH_CHECK_WITH(
+       OutOfMemoryError,
        sharedMemBytes < static_cast<uint32_t>(shared_optin),
        "out of resource: ",
        funcName,