Turn on static cuda launcher in OSS (#151691)
After a few small bugfixes on tests (so that we throw/catch exceptions similar to triton's), I think we're ready to flip the switch and turn StaticCudaLauncher on by default in OSS.

The initial round of benchmarks looks good, with average compilation time going down by a few percent:
[screenshot: compile time benchmarks] https://github.com/user-attachments/assets/cad03e09-b4d6-49a7-a9e5-6068d1c0bd5c

With no changes to runtime perf:
[screenshot: runtime perf benchmarks] https://github.com/user-attachments/assets/3fcd435e-1057-43f4-878b-8d66a3812a10

There are a few noisy models I want to double check, though, so I will run some more tests before accepting review.

Full benchmark results, showing a ~5% compile time improvement across the board:
https://hud.pytorch.org/benchmark/huggingface/inductor_with_cudagraphs?dashboard=torchinductor&startTime=Wed%2C%2016%20Apr%202025%2002%3A31%3A12%20GMT&stopTime=Wed%2C%2023%20Apr%202025%2002%3A31%3A12%20GMT&granularity=hour&mode=training&dtype=amp&deviceName=cuda%20(a100)&lBranch=gh/jamesjwu/139/orig&lCommit=cc45c8667fa23dec16ca50002d9504a34688ca5c&rBranch=main&rCommit=2a9afdae81d0dde98e96d7e3c9ca840e241e5405
[screenshot: full benchmark comparison] https://github.com/user-attachments/assets/6e6a7f39-7f44-459f-9845-9a37f084ea82

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151691
Approved by: https://github.com/oulgen, https://github.com/jansel, https://github.com/EikanWang
This commit is contained in:
parent c03359de2d
commit 0dae27d75b
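For anyone who wants to opt out of the new default (or force it on), a minimal sketch of the two knobs this PR touches is below. The env var and config name come from the diff itself; the toy function and the commented-out CUDA call are illustrative only.

# Minimal sketch, assuming only the knobs visible in this diff.
import os

# Option 1: decide via the environment before the inductor config module is imported,
# so the new tri-state default resolves the way you want ("1" forces it on).
os.environ["TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER"] = "0"

import torch
import torch._inductor.config as inductor_config

# Option 2: flip the config knob directly after import.
inductor_config.use_static_cuda_launcher = False


def f(x):
    return (x * x).sum()


compiled = torch.compile(f)
# compiled(torch.randn(8, device="cuda"))  # would now run with triton's own launcher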
@@ -179,6 +179,9 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
         self.assertTrue(event_found)

     @unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
+    @config.patch(
+        "compile_threads", 1
+    )  # This test monkey patches global variables, which workers don't see
     def test_inductor_profiling_triton_hooks(self):
         from triton.compiler import CompiledKernel  # @manual

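The new decorator pins compile_threads to 1 so the monkey patching done inside the test is visible to the process that actually compiles, instead of being lost in a worker. A hedged sketch of the same pattern outside this test file (the test class and body are invented; config.patch is the helper used above):

import unittest

import torch
import torch._inductor.config as config


class MyInductorTest(unittest.TestCase):
    @config.patch("compile_threads", 1)  # compile in-process so patched globals are seen
    def test_compiles_in_process(self):
        fn = torch.compile(lambda x: x + 1)
        self.assertTrue(torch.equal(fn(torch.ones(4)), torch.full((4,), 2.0)))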
@@ -314,6 +314,8 @@ class AsyncCompile:
             counters["inductor"]["async_compile_cache_hit"] += 1
             # Set reload_kernel_from_src properly based on source_code
             if isinstance(future, StaticAutotunerFuture):
+                # Remove the future now that we've cache hit
+                CompiledTritonKernels.remove_future(source_code)
                 future.reload_kernel_from_src = reload_kernel_in_parent
             if is_parallel:
                 return future

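Why remove the future on a cache hit? Once the parent process has consumed the future there is no reason to keep it (and the source code it is keyed by) alive in the in-memory cache. A toy version of that bookkeeping, with invented names standing in for CompiledTritonKernels and StaticAutotunerFuture:

from dataclasses import dataclass, field
from typing import Callable, Optional


@dataclass
class FakeStaticAutotunerFuture:
    reload_kernel_from_src: Optional[Callable[[], object]] = None


@dataclass
class FutureCache:
    _futures: dict = field(default_factory=dict)

    def get(self, source_code: str):
        return self._futures.get(source_code)

    def remove_future(self, source_code: str) -> None:
        # Drop the entry so the future (and anything it closes over) can be collected.
        self._futures.pop(source_code, None)


cache = FutureCache()
cache._futures["kernel-src"] = FakeStaticAutotunerFuture()

future = cache.get("kernel-src")
if future is not None:
    cache.remove_future("kernel-src")                     # cache hit: evict the entry
    future.reload_kernel_from_src = lambda: "recompiled"  # parent supplies the reload hook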
@@ -39,6 +39,14 @@ def bundle_triton_into_fx_graph_cache_default() -> Optional[bool]:
     )


+def static_cuda_launcher_default() -> bool:
+    result = get_tristate_env(
+        "TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER", True if not is_fbcode() else False
+    )
+    assert result is not None
+    return result
+
+
 def prologue_fusion_enabled() -> bool:
     ENABLE_PROLOGUE_FUSION_VERSION = 0

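static_cuda_launcher_default() leans on get_tristate_env. Below is a hedged, self-contained stand-in for that behavior (not PyTorch's actual helper) that makes the OSS-vs-fbcode defaulting explicit: "1" forces the launcher on, "0" forces it off, and an unset variable falls back to the supplied default.

import os
from typing import Optional


def tristate_env(name: str, default: Optional[bool] = None) -> Optional[bool]:
    # Stand-in with the semantics the diff appears to assume.
    value = os.environ.get(name)
    if value == "1":
        return True
    if value == "0":
        return False
    return default


def static_cuda_launcher_default_sketch(is_fbcode: bool = False) -> bool:
    # Mirrors the new config helper: on by default in OSS, off by default in fbcode,
    # with the env var able to force either direction.
    result = tristate_env(
        "TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER", True if not is_fbcode else False
    )
    assert result is not None
    return result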
@@ -734,9 +742,7 @@ compile_threads: Optional[int] = None if is_fbcode() else decide_compile_threads

 # Whether or not to enable statically launching CUDA kernels
 # compiled by triton (instead of using triton's own launcher)
-use_static_cuda_launcher: bool = (
-    os.environ.get("TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER", "0") == "1"
-)
+use_static_cuda_launcher: bool = static_cuda_launcher_default()

 # Raise error if we bypass the launcher
 strict_static_cuda_launcher: bool = (
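The behavioral difference in this hunk: previously the flag was only on when the env var was explicitly "1"; now the env var still wins when set, but the unset case defaults to True in OSS. A small comparison, assuming the tri-state semantics sketched after the previous hunk, with is_fbcode() treated as False:

import os

env = os.environ.get("TORCHINDUCTOR_USE_STATIC_CUDA_LAUNCHER")

old_value = env == "1"                            # pre-PR: off unless explicitly "1"
new_value = True if env is None else env == "1"   # post-PR (OSS): on unless explicitly "0"

if env is None:
    assert old_value is False
    assert new_value is True                      # the behavior change being shipped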
@@ -90,7 +90,6 @@ class StaticallyLaunchedCudaKernel:

-        assert hasattr(self, "cubin_path")
         assert self.cubin_path is not None

         (self.function, self.n_regs, self.n_spills) = _StaticCudaLauncher._load_kernel(
             self.cubin_path, self.name, self.shared, device
         )

@@ -204,7 +203,6 @@ class StaticallyLaunchedCudaKernel:

         # TODO: can handle grid functions here or in C++, so
         # that we don't need the grid handler above.
-
         _StaticCudaLauncher._launch_kernel(
             self.function,
             grid_x,

@@ -478,7 +478,7 @@ class CachingAutotuner(KernelInterface):
             try:
                 launchers.append(result.make_launcher())

-            except (OutOfResources, PTXASError) as e:
+            except (OutOfResources, PTXASError, torch.cuda.OutOfMemoryError) as e:
                 exc = e
         if len(launchers) == 0:
             raise RuntimeError(f"No valid triton configs. {type(exc).__name__}: {exc}")

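This hunk widens the set of exceptions treated as "this config didn't work" to include torch.cuda.OutOfMemoryError, so an OOM while building one launcher no longer aborts autotuning outright. A hedged sketch of the underlying pattern, with invented names and MemoryError standing in for torch.cuda.OutOfMemoryError so the snippet stays dependency-free:

class OutOfResources(RuntimeError):
    pass


class PTXASError(RuntimeError):
    pass


def build_launchers(candidates):
    launchers = []
    exc = None
    for make_launcher in candidates:
        try:
            launchers.append(make_launcher())
        # The commit adds the CUDA OOM error to this tuple so an OOM while loading
        # one config is handled like any other per-config resource failure.
        except (OutOfResources, PTXASError, MemoryError) as e:
            exc = e
    if len(launchers) == 0:
        raise RuntimeError(f"No valid triton configs. {type(exc).__name__}: {exc}")
    return launchers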
@@ -1274,7 +1274,10 @@ class StaticTritonCompileResult(CompileResult[StaticallyLaunchedCudaKernel]):
         # Load the binary on the parent
         if not self.kernel.cubin_path:
             self.reload_cubin_path()
-        self.kernel.load_kernel(self.compile_meta.get("device", 0))
+        device = self.compile_meta.get("device", 0)
+        if device is None:
+            device = 0
+        self.kernel.load_kernel(device)
         scope = {
             "runner": self.kernel.run,
         }

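The extra None check is needed because dict.get only falls back to its default when the key is missing, not when the key is present with the value None. A hedged illustration (compile_meta here is an invented stand-in for triton's compile metadata):

compile_meta = {"device": None}  # key exists, value is None

device = compile_meta.get("device", 0)   # -> None, which load_kernel cannot use
assert device is None

if device is None:                       # the added guard
    device = 0
assert device == 0                       # now a usable device index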
@@ -112,7 +112,8 @@ CUfunction loadKernel(
    // we set maximum dynamic shared memory to the difference between
    // the static shared memory and total max shared memory allowed on the device.
    // This prevents us from setting shared memory above the maximum
-    TORCH_CHECK(
+    TORCH_CHECK_WITH(
+        OutOfMemoryError,
        sharedMemBytes < static_cast<uint32_t>(shared_optin),
        "out of resource: ",
        funcName,
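A hedged note on how this C++ change connects to the Python hunk above: TORCH_CHECK_WITH(OutOfMemoryError, ...) should surface in Python as torch.cuda.OutOfMemoryError, which is exactly what the autotuner's widened except tuple now catches, whereas a plain TORCH_CHECK surfaces as RuntimeError and would have escaped that handler. A minimal Python-side sketch of the resulting behavior (load_fn is a placeholder for whatever loads the cubin):

import torch


def load_kernel_or_skip(load_fn):
    try:
        return load_fn()
    except torch.cuda.OutOfMemoryError:
        # The kernel wants more shared memory than this device can opt into;
        # treat it like any other invalid autotuning config and skip it.
        return None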