mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Notable new features/optimizations for SDPA operators on AMD systems from AOTriton 0.9b:

* Optimized non-power-of-two head dimensions: 48, 80, 96, 160, 192, 224. Inputs with these head dimensions no longer need padding to a power of two.
* `is_causal=True` cases are now supported with a persistent dynamic algorithm, which requires an atomic tensor but load-balances between different CTAs.
* `dropout_p > 0.0` cases now support full 64-bit offsets and use all i64x4 PRNG outputs.
* The precise AOTriton shared library version can now be identified with `readelf -p .comment libaotriton_v2.so`.
  + However, this does not guarantee that the GPU images stored under `aotriton.images` have the same version, since they can be overwritten.
* The newly added fused backward kernel will be used for smaller workloads, due to its lower kernel invocation overhead.
* Support for gfx1201 (RX 9070XT). It needs to be enabled at runtime with `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148433
Approved by: https://github.com/jeffdaily
122 lines
5.3 KiB
CMake
122 lines
5.3 KiB
CMake
# get_target_gpus_from_pytorch(<out_var>)
#
# Appends to <out_var> one "<name>|" entry for every architecture listed in
# PYTORCH_ROCM_ARCH, where <name> is looked up in the <arch>_key table below
# (e.g. gfx90a -> MI200). Every entry, including the last one, is followed by
# "|", so for PYTORCH_ROCM_ARCH = "gfx90a;gfx942" the result is
# "MI200|MI300X|".
#
# Architectures without an entry in the table are skipped with a warning
# instead of contributing an empty "|"-terminated entry.
#
# This is a macro, so the helper variables it sets (the *_key table, `key`
# and the loop variable `X`) remain visible in the caller's scope.
#
# Example:
#   set(target_gpus "")
#   get_target_gpus_from_pytorch(target_gpus)
macro(get_target_gpus_from_pytorch target_gpus)
  # Lookup table: ROCm architecture -> GPU name.
  set(gfx90a_key MI200)
  set(gfx942_key MI300X)
  set(gfx1100_key Navi31)

  foreach(X IN LISTS PYTORCH_ROCM_ARCH)
    # Build the name of the lookup variable, e.g. "gfx90a_key".
    set(key ${X})
    string(APPEND key "_key")
    if(DEFINED ${key})
      # ${target_gpus} is replaced by the macro argument, i.e. the name of
      # the caller's output variable.
      string(APPEND ${target_gpus} "${${key}}|")
    else()
      message(WARNING
        "get_target_gpus_from_pytorch: no GPU name is known for ROCm "
        "architecture '${X}'; skipping it.")
    endif()
  endforeach()
endmacro()
|
|
|
|
# AOTriton integration.
#
# Defines the INTERFACE library `__caffe2_aotriton`, which carries the link
# line and include directory of libaotriton_v2.so, and sets AOTRITON_FOUND.
# The library is obtained in one of three ways, selected through environment
# variables read at configure time:
#   1. AOTRITON_INSTALLED_PREFIX defined    -> use a preinstalled copy
#   2. AOTRITON_INSTALL_FROM_SOURCE defined -> build from the pinned git commit
#   3. otherwise                            -> download a pre-compiled release
if(NOT __AOTRITON_INCLUDED)
  set(__AOTRITON_INCLUDED TRUE)

  # ExternalProject_Add() is used below; including the module again is
  # harmless if the including project has already done so.
  include(ExternalProject)

  set(__AOTRITON_EXTERN_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/aotriton")
  set(__AOTRITON_INSTALL_DIR "${PROJECT_SOURCE_DIR}/torch")
  add_library(__caffe2_aotriton INTERFACE)

  # AOTriton package information from GitHub Release Pages
  # Replaces .ci/docker/aotriton_version.txt
  # Note packages information may have versions skipped (due to no ABI breaks)
  # But they must be listed from lower version to higher version
  #
  # The three lists below are parallel: entry i of each list describes the
  # same pre-compiled package.
  set(__AOTRITON_VER "0.9.2b")
  set(__AOTRITON_MANYLINUX_LIST
      "manylinux_2_28"  # rocm6.2
      "manylinux_2_28"  # rocm6.3
      "manylinux_2_28"  # rocm6.4
  )
  set(__AOTRITON_ROCM_LIST
      "rocm6.2"
      "rocm6.3"
      "rocm6.4"
  )
  set(__AOTRITON_CI_COMMIT "b388d223d8c7213545603e00f6f3148c54d1f525")
  set(__AOTRITON_SHA256_LIST
      "08d84f96f4c984179f80f517c0431c7511ee26bb0ce9bd05a827573ddd78cc79"  # rocm6.2
      "9094d59717e7e6eace9126ca100dd0e86510f07fc6c3a349569fc4e2d9056604"  # rocm6.3
      "41190202c2736d5ff75b13a3abc0fb52ebfbb67226cf85dc3de7699c7000db44"  # rocm6.4
  )
  set(__AOTRITON_Z "gz")

  # Note the variable name is AOTRITON_INSTALL*ED*_PREFIX.
  if(DEFINED ENV{AOTRITON_INSTALLED_PREFIX})
    # Copy the preinstalled lib/ and include/ trees into the torch directory
    # at install time ...
    install(DIRECTORY
            "$ENV{AOTRITON_INSTALLED_PREFIX}/lib"
            "$ENV{AOTRITON_INSTALLED_PREFIX}/include"
            DESTINATION "${__AOTRITON_INSTALL_DIR}")
    # ... and point the link/include paths set at the end of this file at
    # the preinstalled location.
    set(__AOTRITON_INSTALL_DIR "$ENV{AOTRITON_INSTALLED_PREFIX}")
    message(STATUS "Using Preinstalled AOTriton at ${__AOTRITON_INSTALL_DIR}")
  elseif(DEFINED ENV{AOTRITON_INSTALL_FROM_SOURCE})
    # The test is DEFINED, so any value of the variable (including 0 or an
    # empty string) selects the source build.
    set(target_gpus "")
    get_target_gpus_from_pytorch(target_gpus)
    ExternalProject_Add(aotriton_external
      GIT_REPOSITORY https://github.com/ROCm/aotriton.git
      GIT_TAG "${__AOTRITON_CI_COMMIT}"
      PREFIX "${__AOTRITON_EXTERN_PREFIX}"
      INSTALL_DIR "${__AOTRITON_INSTALL_DIR}"
      # target_gpus is "|"-separated; ExternalProject turns "|" into ";"
      # when forwarding the arguments below.
      LIST_SEPARATOR |
      CMAKE_ARGS
        "-DCMAKE_INSTALL_PREFIX:PATH=${__AOTRITON_INSTALL_DIR}"
        "-DTARGET_GPUS:STRING=${target_gpus}"
        -DAOTRITON_COMPRESS_KERNEL=ON
        "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
        -DAOTRITON_NO_PYTHON=ON
        -DAOTRITON_NO_SHARED=OFF
      BUILD_COMMAND ""  # No build, install command will repeat the build process due to problems in the build system.
      BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
      USES_TERMINAL_DOWNLOAD TRUE
      USES_TERMINAL_CONFIGURE TRUE
      USES_TERMINAL_BUILD TRUE
      USES_TERMINAL_INSTALL TRUE
    )
    add_dependencies(__caffe2_aotriton aotriton_external)
    message(STATUS "Using AOTriton compiled from source directory ${__AOTRITON_EXTERN_PREFIX}")
  else()
    # The package tables are indexed in parallel below; fail early with a
    # clear message if they have drifted apart.
    list(LENGTH __AOTRITON_ROCM_LIST __AOTRITON_ROCM_LIST_LEN)
    list(LENGTH __AOTRITON_SHA256_LIST __AOTRITON_SHA256_LIST_LEN)
    list(LENGTH __AOTRITON_MANYLINUX_LIST __AOTRITON_MANYLINUX_LIST_LEN)
    if(NOT __AOTRITON_ROCM_LIST_LEN EQUAL __AOTRITON_SHA256_LIST_LEN
       OR NOT __AOTRITON_ROCM_LIST_LEN EQUAL __AOTRITON_MANYLINUX_LIST_LEN)
      message(FATAL_ERROR
        "AOTriton package tables are inconsistent: "
        "__AOTRITON_ROCM_LIST has ${__AOTRITON_ROCM_LIST_LEN} entries, "
        "__AOTRITON_SHA256_LIST has ${__AOTRITON_SHA256_LIST_LEN}, "
        "__AOTRITON_MANYLINUX_LIST has ${__AOTRITON_MANYLINUX_LIST_LEN}.")
    endif()

    set(__AOTRITON_SYSTEM_ROCM "${ROCM_VERSION_DEV_MAJOR}.${ROCM_VERSION_DEV_MINOR}")
    list(GET __AOTRITON_ROCM_LIST 0 __AOTRITON_ROCM_DEFAULT_STR)
    # Initialize __AOTRITON_ROCM to lowest version, in case all builds > system's ROCM
    # (len("rocm") == 4, so SUBSTRING at offset 4 yields the bare version.)
    string(SUBSTRING "${__AOTRITON_ROCM_DEFAULT_STR}" 4 -1 __AOTRITON_ROCM)
    foreach(AOTRITON_ROCM_BUILD_STR IN LISTS __AOTRITON_ROCM_LIST)
      # len("rocm") == 4
      string(SUBSTRING "${AOTRITON_ROCM_BUILD_STR}" 4 -1 AOTRITON_ROCM_BUILD)
      # Find the last build that <= system's ROCM
      # Assume the list is from lower to higher
      if(AOTRITON_ROCM_BUILD VERSION_GREATER __AOTRITON_SYSTEM_ROCM)
        break()
      endif()
      set(__AOTRITON_ROCM "${AOTRITON_ROCM_BUILD}")
    endforeach()

    # Look up the package attributes that belong to the selected ROCm version.
    list(FIND __AOTRITON_ROCM_LIST "rocm${__AOTRITON_ROCM}" __AOTRITON_ROCM_INDEX)
    list(GET __AOTRITON_SHA256_LIST ${__AOTRITON_ROCM_INDEX} __AOTRITON_SHA256)
    list(GET __AOTRITON_MANYLINUX_LIST ${__AOTRITON_ROCM_INDEX} __AOTRITON_MANYLINUX)
    set(__AOTRITON_ARCH "${CMAKE_HOST_SYSTEM_PROCESSOR}")

    # File name layout:
    #   aotriton-<ver>-<manylinux>_<arch>-rocm<rocm>-shared.tar.<z>
    string(CONCAT __AOTRITON_FILE
           "aotriton-"
           "${__AOTRITON_VER}-${__AOTRITON_MANYLINUX}"
           "_${__AOTRITON_ARCH}-rocm${__AOTRITON_ROCM}"
           "-shared.tar.${__AOTRITON_Z}")
    string(CONCAT __AOTRITON_URL
           "https://github.com/ROCm/aotriton/releases/download/"
           "${__AOTRITON_VER}/${__AOTRITON_FILE}")

    # Download and verify the tarball; nothing is configured or built, the
    # unpacked tree is simply copied into the install directory.
    ExternalProject_Add(aotriton_external
      URL "${__AOTRITON_URL}"
      URL_HASH "SHA256=${__AOTRITON_SHA256}"
      SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/aotriton_tarball"
      CONFIGURE_COMMAND ""
      BUILD_COMMAND ""
      INSTALL_COMMAND "${CMAKE_COMMAND}" -E copy_directory
                      "${CMAKE_CURRENT_BINARY_DIR}/aotriton_tarball"
                      "${__AOTRITON_INSTALL_DIR}"
      BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so"
    )
    add_dependencies(__caffe2_aotriton aotriton_external)
    message(STATUS
      "Using AOTriton from pre-compiled binary ${__AOTRITON_URL}. "
      "Set env variables AOTRITON_INSTALL_FROM_SOURCE=1 to build from source.")
  endif()

  # Consumers of __caffe2_aotriton link the shared library and see its headers.
  target_link_libraries(__caffe2_aotriton INTERFACE
                        "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so")
  target_include_directories(__caffe2_aotriton INTERFACE
                             "${__AOTRITON_INSTALL_DIR}/include")
  set(AOTRITON_FOUND TRUE)
endif() # __AOTRITON_INCLUDED
|