Turn on static cuda launcher in OSS (#151691)

After a few small test bugfixes (so that we throw and catch the same kinds of exceptions as Triton's own launcher), I think we're ready to flip the switch and turn StaticCudaLauncher on by default in OSS.
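
For anyone who wants to opt back out (or force the old Triton launcher while debugging), here is a minimal sketch of the knobs this PR touches. The env var and config names are taken from the diff below; treat the snippet as illustrative rather than official documentation:

```python
# Illustrative only: names come from this PR's diff.
# Option 1: set the env var before the inductor config module is imported, e.g.
#   TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER=0 python train.py   # train.py is a placeholder
# Option 2: flip the inductor config at runtime.
from torch._inductor import config as inductor_config

# Fall back to Triton's own launcher instead of StaticCudaLauncher.
inductor_config.use_static_cuda_launcher = False

# Or keep it on and raise instead of silently bypassing the static launcher
# (existing strict flag, also visible in the diff below).
inductor_config.strict_static_cuda_launcher = True
```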

The initial round of benchmarks looks good, with average compilation time going down by a few percent:
<img width="828" alt="image" src="https://github.com/user-attachments/assets/cad03e09-b4d6-49a7-a9e5-6068d1c0bd5c" />

With no changes to runtime perf:
<img width="823" alt="image" src="https://github.com/user-attachments/assets/3fcd435e-1057-43f4-878b-8d66a3812a10" />

There are a few noisy models I want to double-check, though, so I'll run some more tests before accepting review.

Full benchmark results, showing a ~5% compile time improvement across the board:
https://hud.pytorch.org/benchmark/huggingface/inductor_with_cudagraphs?dashboard=torchinductor&startTime=Wed%2C%2016%20Apr%202025%2002%3A31%3A12%20GMT&stopTime=Wed%2C%2023%20Apr%202025%2002%3A31%3A12%20GMT&granularity=hour&mode=training&dtype=amp&deviceName=cuda%20(a100)&lBranch=gh/jamesjwu/139/orig&lCommit=cc45c8667fa23dec16ca50002d9504a34688ca5c&rBranch=main&rCommit=2a9afdae81d0dde98e96d7e3c9ca840e241e5405
<img width="1482" alt="image" src="https://github.com/user-attachments/assets/6e6a7f39-7f44-459f-9845-9a37f084ea82" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151691
Approved by: https://github.com/oulgen, https://github.com/jansel, https://github.com/EikanWang
Author: James Wu (2025-04-24 13:19:03 -07:00), committed by PyTorch MergeBot
commit 0dae27d75b (parent c03359de2d)
6 changed files with 21 additions and 8 deletions

@@ -179,6 +179,9 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
         self.assertTrue(event_found)
 
     @unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
+    @config.patch(
+        "compile_threads", 1
+    )  # This test monkey patches global variables, which workers don't see
     def test_inductor_profiling_triton_hooks(self):
         from triton.compiler import CompiledKernel  # @manual

@@ -314,6 +314,8 @@ class AsyncCompile:
             counters["inductor"]["async_compile_cache_hit"] += 1
             # Set reload_kernel_from_src properly based on source_code
             if isinstance(future, StaticAutotunerFuture):
+                # Remove the future now that we've cache hit
+                CompiledTritonKernels.remove_future(source_code)
                 future.reload_kernel_from_src = reload_kernel_in_parent
             if is_parallel:
                 return future

@@ -39,6 +39,14 @@ def bundle_triton_into_fx_graph_cache_default() -> Optional[bool]:
     )
 
 
+def static_cuda_launcher_default() -> bool:
+    result = get_tristate_env(
+        "TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER", True if not is_fbcode() else False
+    )
+    assert result is not None
+    return result
+
+
 def prologue_fusion_enabled() -> bool:
     ENABLE_PROLOGUE_FUSION_VERSION = 0

@@ -734,9 +742,7 @@ compile_threads: Optional[int] = None if is_fbcode() else decide_compile_threads
 # Whether or not to enable statically launching CUDA kernels
 # compiled by triton (instead of using triton's own launcher)
-use_static_cuda_launcher: bool = (
-    os.environ.get("TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER", "0") == "1"
-)
+use_static_cuda_launcher: bool = static_cuda_launcher_default()
 
 # Raise error if we bypass the launcher
 strict_static_cuda_launcher: bool = (

@@ -90,7 +90,6 @@ class StaticallyLaunchedCudaKernel:
         assert hasattr(self, "cubin_path")
         assert self.cubin_path is not None
         (self.function, self.n_regs, self.n_spills) = _StaticCudaLauncher._load_kernel(
             self.cubin_path, self.name, self.shared, device
         )

@@ -204,7 +203,6 @@ class StaticallyLaunchedCudaKernel:
         # TODO: can handle grid functions here or in C++, so
         # that we don't need the grid handler above.
         _StaticCudaLauncher._launch_kernel(
             self.function,
             grid_x,

@@ -478,7 +478,7 @@ class CachingAutotuner(KernelInterface):
             try:
                 launchers.append(result.make_launcher())
-            except (OutOfResources, PTXASError) as e:
+            except (OutOfResources, PTXASError, torch.cuda.OutOfMemoryError) as e:
                 exc = e
         if len(launchers) == 0:
             raise RuntimeError(f"No valid triton configs. {type(exc).__name__}: {exc}")

@@ -1274,7 +1274,10 @@ class StaticTritonCompileResult(CompileResult[StaticallyLaunchedCudaKernel]):
         # Load the binary on the parent
         if not self.kernel.cubin_path:
             self.reload_cubin_path()
-        self.kernel.load_kernel(self.compile_meta.get("device", 0))
+        device = self.compile_meta.get("device", 0)
+        if device is None:
+            device = 0
+        self.kernel.load_kernel(device)
         scope = {
             "runner": self.kernel.run,
         }

@@ -112,7 +112,8 @@ CUfunction loadKernel(
    // we set maximum dynamic shared memory to the difference between
    // the static shared memory and total max shared memory allowed on the device.
    // This prevents us from setting shared memory above the maximum
-   TORCH_CHECK(
+   TORCH_CHECK_WITH(
+       OutOfMemoryError,
        sharedMemBytes < static_cast<uint32_t>(shared_optin),
        "out of resource: ",
        funcName,