mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
This reverts commit 840d63c12d. Reverted https://github.com/pytorch/pytorch/pull/165913 on behalf of https://github.com/clee2000 due to I think something here is causing CI tests to segfault at exit on cuda, ex [GH job link](https://github.com/pytorch/pytorch/actions/runs/18857880394/job/53811917713) [HUD commit link](9a91486e45) says no tests failed but it segfaulted afterwards. I can't tell if it's because of this change, or an unpinned dependency in docker that got triggered by this. Note to self, would have been bad TD except trunk didn't run either ([comment](https://github.com/pytorch/pytorch/pull/165913#issuecomment-3457293475))
186 lines
4.8 KiB
Bash
186 lines
4.8 KiB
Bash
#!/bin/bash
# Shared settings for the CUDA install helpers defined further down.

# -e: abort on the first failing command; -x: trace every command.
set -ex

# arch_path is the architecture component used in NVIDIA download names:
# 'x86_64' when the target is amd64/x86_64, 'sbsa' for anything else.
# TARGETARCH overrides the detected machine type when it is set and non-empty.
arch_path=''
targetarch=${TARGETARCH:-$(uname -m)}
if [[ "${targetarch}" == 'amd64' || "${targetarch}" == 'x86_64' ]]; then
  arch_path='x86_64'
else
  arch_path='sbsa'
fi

# NVSHMEM release handed to install_nvshmem by the per-CUDA install functions.
NVSHMEM_VERSION=3.3.24

#######################################
# Download NVIDIA's CUDA runfile installer, install the toolkit from it and
# point /usr/local/cuda at the freshly installed version.
# Globals:
#   arch_path (read) - 'sbsa' selects the "_sbsa" variant of the runfile
# Arguments:
#   $1 - full CUDA version, e.g. 12.4.1 (also the download directory name)
#   $2 - runfile base name without the ".run" suffix
# Side effects:
#   Removes /usr/local/cuda-<major.minor> and /usr/local/cuda before
#   installing; downloads the runfile into the current directory and deletes
#   it afterwards.
#######################################
function install_cuda {
  local version=${1:?install_cuda: CUDA version required}
  local runfile=${2:?install_cuda: runfile base name required}
  # Drop the patch component: 12.4.1 -> 12.4
  local major_minor=${version%.*}

  rm -rf "/usr/local/cuda-${major_minor}" /usr/local/cuda
  if [[ ${arch_path} == 'sbsa' ]]; then
    runfile="${runfile}_sbsa"
  fi
  runfile="${runfile}.run"
  wget -q "https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile}" -O "${runfile}"
  chmod +x "${runfile}"
  "./${runfile}" --toolkit --silent
  rm -f "${runfile}"
  # Remove whatever sits at /usr/local/cuda first so the link can be created.
  rm -f /usr/local/cuda && ln -s "/usr/local/cuda-${major_minor}" /usr/local/cuda
}

#######################################
# Download a cuDNN redistributable archive and copy its headers and libraries
# into /usr/local/cuda.
# Globals:
#   arch_path (read) - architecture component of the archive name and URL
# Arguments:
#   $1 - CUDA major version the archive was built for, e.g. 12
#   $2 - cuDNN version, e.g. 9.10.2.21
# Returns:
#   Non-zero if the scratch directory cannot be created or entered.
#######################################
function install_cudnn {
  local cuda_major_version=${1:?install_cudnn: CUDA major version required}
  local cudnn_version=${2:?install_cudnn: cuDNN version required}
  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  local filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive"

  # Checked separately: a failure on the left of `a && b` is ignored by
  # `set -e`, which would let the download and the later `cd ..` run from the
  # wrong directory.
  mkdir -p tmp_cudnn || return 1
  cd tmp_cudnn || return 1

  wget -q "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz"
  tar xf "${filepath}.tar.xz"
  cp -a "${filepath}/include/"* /usr/local/cuda/include/
  cp -a "${filepath}/lib/"* /usr/local/cuda/lib64/

  cd ..
  rm -rf tmp_cudnn
}

#######################################
# Download an NVSHMEM redistributable archive and copy its headers and
# libraries into /usr/local/cuda.
# Globals:
#   arch_path (read) - architecture component of the archive name and URL
# Arguments:
#   $1 - CUDA major version the archive was built for, e.g. "12"
#   $2 - NVSHMEM version, e.g. "3.3.9"
# Outputs:
#   Prints a one-line confirmation to stdout when done.
# Returns:
#   Non-zero if the scratch directory cannot be created or entered.
#######################################
function install_nvshmem {
  local cuda_major_version=${1:?install_nvshmem: CUDA major version required}
  local nvshmem_version=${2:?install_nvshmem: NVSHMEM version required}
  local tmpdir="tmp_nvshmem"

  # NOTE: the former `dl_arch` mapping was dropped: nothing read it, and its
  # fallback arm expanded the undefined variable ${arch}.

  # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
  # This naming pattern is not consistent across versions; for 3.3.9 it was
  # cuda_ver-arch-nvshmem-ver.
  local filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
  local suffix=".tar.xz"
  local url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}"

  # Checked separately: a failure on the left of `a && b` is ignored by
  # `set -e`, which would let the download and the later `cd ..` run from the
  # wrong directory.
  mkdir -p "${tmpdir}" || return 1
  cd "${tmpdir}" || return 1

  # download, unpack, install
  wget -q "${url}"
  tar xf "${filename}${suffix}"
  cp -a "${filename}/include/"* /usr/local/cuda/include/
  cp -a "${filename}/lib/"* /usr/local/cuda/lib64/

  # cleanup
  cd ..
  rm -rf "${tmpdir}"

  echo "nvSHMEM ${nvshmem_version} for CUDA ${cuda_major_version} (${arch_path}) installed."
}

#######################################
# Install the CUDA 12.4 stack: toolkit 12.4.1, cuDNN, NCCL and cuSPARSELt.
# Globals:
#   CUDNN_VERSION (written) - cuDNN release paired with this toolkit
# Notes:
#   install_nccl.sh and install_cusparselt.sh are run through `bash` with
#   CUDA_VERSION set in their environment; ldconfig runs last, after the
#   libraries have been installed.
#######################################
function install_124 {
  CUDNN_VERSION=9.1.0.70
  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2"
  install_cuda 12.4.1 cuda_12.4.1_550.54.15_linux

  install_cudnn 12 "${CUDNN_VERSION}"

  CUDA_VERSION=12.4 bash install_nccl.sh

  CUDA_VERSION=12.4 bash install_cusparselt.sh

  ldconfig
}

#######################################
# Install the CUDA 12.6 stack: toolkit 12.6.3, cuDNN, NVSHMEM, NCCL and
# cuSPARSELt.
# Globals:
#   CUDNN_VERSION   (written) - cuDNN release paired with this toolkit
#   NVSHMEM_VERSION (read)    - NVSHMEM release passed to install_nvshmem
# Notes:
#   install_nccl.sh and install_cusparselt.sh are run through `bash` with
#   CUDA_VERSION set in their environment; ldconfig runs last, after the
#   libraries have been installed.
#######################################
function install_126 {
  CUDNN_VERSION=9.10.2.21
  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux

  install_cudnn 12 "${CUDNN_VERSION}"

  install_nvshmem 12 "${NVSHMEM_VERSION}"

  CUDA_VERSION=12.6 bash install_nccl.sh

  CUDA_VERSION=12.6 bash install_cusparselt.sh

  ldconfig
}

#######################################
# Install the CUDA 12.9 stack: toolkit 12.9.1, cuDNN, NVSHMEM, NCCL and
# cuSPARSELt.
# Globals:
#   CUDNN_VERSION   (written) - cuDNN release paired with this toolkit
#   NVSHMEM_VERSION (read)    - NVSHMEM release passed to install_nvshmem
# Notes:
#   install_nccl.sh and install_cusparselt.sh are run through `bash` with
#   CUDA_VERSION set in their environment; ldconfig runs last, after the
#   libraries have been installed.
#######################################
function install_129 {
  CUDNN_VERSION=9.10.2.21
  echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 12.9.1 in the same container
  install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 12 "${CUDNN_VERSION}"

  install_nvshmem 12 "${NVSHMEM_VERSION}"

  CUDA_VERSION=12.9 bash install_nccl.sh

  CUDA_VERSION=12.9 bash install_cusparselt.sh

  ldconfig
}

#######################################
# Install the CUDA 12.8 stack: toolkit 12.8.1, cuDNN, NVSHMEM, NCCL and
# cuSPARSELt.
# Globals:
#   CUDNN_VERSION   (written) - cuDNN release paired with this toolkit
#   NVSHMEM_VERSION (read)    - NVSHMEM release passed to install_nvshmem
# Notes:
#   install_nccl.sh and install_cusparselt.sh are run through `bash` with
#   CUDA_VERSION set in their environment; ldconfig runs last, after the
#   libraries have been installed.
#######################################
function install_128 {
  CUDNN_VERSION=9.8.0.87
  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 12.8.1 in the same container
  install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 12 "${CUDNN_VERSION}"

  install_nvshmem 12 "${NVSHMEM_VERSION}"

  CUDA_VERSION=12.8 bash install_nccl.sh

  CUDA_VERSION=12.8 bash install_cusparselt.sh

  ldconfig
}

#######################################
# Install the CUDA 13.0 stack: toolkit 13.0.2, cuDNN, NVSHMEM, NCCL and
# cuSPARSELt.
# Globals:
#   CUDNN_VERSION   (written) - cuDNN release paired with this toolkit
#   NVSHMEM_VERSION (read)    - NVSHMEM release passed to install_nvshmem
# Notes:
#   install_nccl.sh and install_cusparselt.sh are run through `bash` with
#   CUDA_VERSION set in their environment; ldconfig runs last, after the
#   libraries have been installed.
#######################################
function install_130 {
  CUDNN_VERSION=9.13.0.50
  # Report the full toolkit version that is actually installed below, as the
  # other install_* functions do.
  echo "Installing CUDA 13.0.2 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 13.0.2 in the same container
  install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 13 "${CUDNN_VERSION}"

  install_nvshmem 13 "${NVSHMEM_VERSION}"

  CUDA_VERSION=13.0 bash install_nccl.sh

  CUDA_VERSION=13.0 bash install_cusparselt.sh

  ldconfig
}

#######################################
# Entry point: run the matching install_* function for every CUDA version
# given on the command line, in order.
# Arguments:
#   One or more CUDA versions, either "X.Y" or "X.Y.Z" (e.g. 12.6 or 12.6.3).
# Outputs:
#   Writes "bad argument <arg>" to stderr for an unsupported version.
# Returns:
#   Exits the script with status 1 on the first unsupported version.
#######################################
function main {
  while (( $# > 0 )); do
    case "$1" in
      # 12.4 accepts a patch suffix too, like every other supported release.
      12.4 | 12.4.*)
        install_124
        ;;
      12.6 | 12.6.*)
        install_126
        ;;
      12.8 | 12.8.*)
        install_128
        ;;
      12.9 | 12.9.*)
        install_129
        ;;
      13.0 | 13.0.*)
        install_130
        ;;
      *)
        echo "bad argument $1" >&2
        exit 1
        ;;
    esac
    shift
  done
}

main "$@"