[RelEng] Define BUILD_BUNDLE_PTXAS (#119750)
When enabled, this bundles PTXAS into the `torch/bin` folder.
When compiling with Triton, define `TRITON_PTXAS_PATH` if `ptxas` is bundled with PyTorch. This is needed to make a PyTorch build compiled against CUDA-11.8 usable with an 11.8 driver, since Triton ships with the latest `ptxas` (CUDA-12.3 at the time of the PyTorch-2.2 release).
Needs 5c814e2527 to produce valid binary builds
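For context, a minimal sketch of the manual override that the bundling automates (the path below is hypothetical; any `ptxas` matching the toolkit the wheel was built against would do):
```python
import os

# Hypothetical ptxas location; set before the first torch.compile call triggers Triton compilation.
# The helper added in this PR leaves a pre-existing TRITON_PTXAS_PATH untouched.
os.environ.setdefault("TRITON_PTXAS_PATH", "/usr/local/cuda-11.8/bin/ptxas")
```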
Test plan:
- Create a dummy `ptxas` in the `torch/bin` folder and observe `torch.compile` fail with a backtrace in the Triton module.
- Run the following script (to be added to binary tests) against a CUDA-11.8 wheel:
```python
import torch
import triton
@torch.compile
def foo(x: torch.Tensor) -> torch.Tensor:
return torch.sin(x) + torch.cos(x)
x = torch.rand(3, 3, device="cuda")
print(foo(x))
# And check that CUDA versions match
cuda_version = torch.version.cuda
ptxas_version = triton.backends.nvidia.compiler.get_ptxas_version().decode("ascii")
assert cuda_version in ptxas_version, f"CUDA version mismatch: torch built with {cuda_version}, but Triton uses ptxas {ptxas_version}"
```
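The assertion ties the `ptxas` Triton actually resolves back to the CUDA version PyTorch was built against, so a CUDA-11.8 wheel fails fast if Triton silently falls back to the CUDA-12.x `ptxas` it ships with.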
Fixes https://github.com/pytorch/pytorch/issues/119054
Pull Request resolved: https://github.com/pytorch/pytorch/pull/119750
Approved by: https://github.com/jansel, https://github.com/atalman
This commit is contained in:
parent a07fd51b6b
commit 516f38a144
CMakeLists.txt
```diff
@@ -351,6 +351,8 @@ cmake_dependent_option(
     "NOT INTERN_BUILD_MOBILE" OFF)
 cmake_dependent_option(
   BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
+cmake_dependent_option(
+  BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder" OFF "USE_CUDA" OFF)
 option(USE_MIMALLOC "Use mimalloc" OFF)
 # Enable third party mimalloc library to improve memory allocation performance on Windows.
 
@@ -1241,3 +1243,12 @@ if(DEFINED USE_CUSTOM_DEBINFO)
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -g")
   set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -g")
 endif()
+
+# Bundle PTXAS if needed
+if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
+  if(NOT EXISTS "${PROJECT_SOURCE_DIR}/build/bin/ptxas")
+    message(STATUS "Copying PTXAS into the bin folder")
+    file(COPY "${CUDAToolkit_BIN_DIR}/ptxas" DESTINATION "${PROJECT_BINARY_DIR}")
+  endif()
+  install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}")
+endif()
```
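As a sanity check for the bundling and install steps above, a rough sketch, assuming the wheel ends up with the binary at `torch/bin/ptxas` as this PR intends:
```python
import os
import subprocess

import torch

# Expected install location per this PR: <site-packages>/torch/bin/ptxas
ptxas = os.path.join(os.path.dirname(torch.__file__), "bin", "ptxas")

if os.path.isfile(ptxas) and os.access(ptxas, os.X_OK):
    # Prints something like "Cuda compilation tools, release 11.8, V11.8.89"
    print(subprocess.check_output([ptxas, "--version"], text=True))
else:
    print("No bundled ptxas in this build (BUILD_BUNDLE_PTXAS likely OFF)")
```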
torch/_inductor/codecache.py
```diff
@@ -2391,6 +2391,20 @@ def caching_device_properties():
             device_interface.Worker.get_device_properties()
 
 
+def _set_triton_ptxas_path() -> None:
+    if os.environ.get("TRITON_PTXAS_PATH") is not None:
+        return
+    ptxas_path = os.path.abspath(
+        os.path.join(os.path.dirname(__file__), "..", "bin", "ptxas")
+    )
+    if not os.path.exists(ptxas_path):
+        return
+    if os.path.isfile(ptxas_path) and os.access(ptxas_path, os.X_OK):
+        os.environ["TRITON_PTXAS_PATH"] = ptxas_path
+    else:
+        warnings.warn(f"{ptxas_path} exists but is not an executable")
+
+
 def _worker_compile(
     kernel_name: str, source_code: str, cc: int, device: torch.device
 ) -> None:
@@ -2401,6 +2415,7 @@ def _worker_compile(
 
 
 def _load_kernel(kernel_name: str, source_code: str) -> ModuleType:
+    _set_triton_ptxas_path()
     kernel = TritonCodeCache.load(kernel_name, source_code)
     kernel.precompile()
     return kernel
```
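Not a public API, but to make the new lookup easy to inspect from outside the compiler, here is a hedged, self-contained restatement of what `_set_triton_ptxas_path` decides (paths mirror the diff above):
```python
import os

import torch

# codecache.py resolves "<torch package dir>/bin/ptxas" via
# os.path.join(os.path.dirname(__file__), "..", "bin", "ptxas")
bundled = os.path.join(os.path.dirname(torch.__file__), "bin", "ptxas")
explicit = os.environ.get("TRITON_PTXAS_PATH")

if explicit is not None:
    # An explicitly configured ptxas always wins; the helper returns early.
    print(f"Triton will use the explicitly configured ptxas: {explicit}")
elif os.path.isfile(bundled) and os.access(bundled, os.X_OK):
    print(f"Triton will use the bundled ptxas: {bundled}")
else:
    print("No usable bundled ptxas; Triton falls back to the ptxas it ships with")
```
Calling the helper from `_load_kernel` keeps this lazy: the environment variable is only set right before a Triton kernel is actually compiled, and only when the bundled binary exists and is executable.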