Build vLLM aarch64 nightly wheels (#162664)

PyTorch has published its aarch64 nightly wheels for all CUDA versions after https://github.com/pytorch/pytorch/pull/162364. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162664 Approved by: https://github.com/atalman
2025-12-06 12:20:52 +01:00 · 2025-09-13 03:43:55 +00:00 · 2025-09-13 03:43:55 +00:00 · 66133b1ab7
commit 66133b1ab7
parent 543d50db2b
4 changed files with 44 additions and 30 deletions
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -21,6 +21,7 @@ self-hosted-runner:
    - linux.arm64.2xlarge.ephemeral
    - linux.arm64.m7g.4xlarge
    - linux.arm64.m7g.4xlarge.ephemeral
    - linux.arm64.r7g.12xlarge.memory
    - linux.4xlarge.nvidia.gpu
    - linux.8xlarge.nvidia.gpu
    - linux.16xlarge.nvidia.gpu
--- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm
+++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm
@ -82,16 +82,10 @@ RUN if command -v apt-get >/dev/null; then \
        apt-get update -y \
        && apt-get install -y ccache software-properties-common git curl wget sudo vim; \
    else \
-        dnf install -y git curl wget sudo vim; \
+        dnf install -y git curl wget sudo; \
    fi \
    && python3 --version && python3 -m pip --version
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 # Install uv for faster pip installs if it is not already installed
 RUN --mount=type=cache,target=/root/.cache/uv \
    if ! python3 -m uv --version >/dev/null 2>&1; then \
@ -220,11 +214,16 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "$USE_SCCACHE" = "1" ]; then \
-        echo "Installing sccache..." \
+        echo "Installing sccache..."; \
-        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
+        if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
            SCCACHE_ARCHIVE="sccache-v0.8.1-aarch64-unknown-linux-musl"; \
        else \
            SCCACHE_ARCHIVE="sccache-v0.8.1-x86_64-unknown-linux-musl"; \
        fi; \
        curl -L -o sccache.tar.gz "https://github.com/mozilla/sccache/releases/download/v0.8.1/${SCCACHE_ARCHIVE}.tar.gz" \
        && tar -xzf sccache.tar.gz \
-        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
+        && sudo mv "${SCCACHE_ARCHIVE}"/sccache /usr/bin/sccache \
-        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
+        && rm -rf sccache.tar.gz "${SCCACHE_ARCHIVE}" \
        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
@ -285,7 +284,7 @@ RUN if command -v apt-get >/dev/null; then \
        && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
        && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
    else \
-        dnf install -y git curl wget sudo vim; \
+        dnf install -y git curl wget sudo; \
    fi \
    && python3 --version && python3 -m pip --version
@ -298,12 +297,6 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
    echo "[INFO] Showing torch_build_versions.txt content:" && \
    cat torch_build_versions.txt
 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 # Install uv for faster pip installs if it is not already installed
 RUN --mount=type=cache,target=/root/.cache/uv \
    if ! python3 -m uv --version > /dev/null 2>&1; then \
--- a/.github/scripts/prepare_vllm_wheels.sh
+++ b/.github/scripts/prepare_vllm_wheels.sh
@ -84,6 +84,9 @@ repackage_wheel() {
  rm -rf $package
 }
 # The wheel package is required to re-package the wheels
 ${PYTHON_EXECUTABLE} -mpip install wheel==0.45.1
 pushd externals/vllm/wheels
 for package in xformers flashinfer-python vllm; do
  repackage_wheel $package
--- a/.github/workflows/build-vllm-wheel.yml
+++ b/.github/workflows/build-vllm-wheel.yml
@ -12,6 +12,9 @@ on:
    paths:
      - .github/workflows/build-vllm-wheel.yml
      - .github/ci_commit_pins/vllm.txt
  schedule:
    # every day at 13:30 UTC (9:30AM EDT / 6:30AM PDT; 8:30AM EST / 5:30AM PST)
    - cron: 30 13 * * *
 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -24,21 +27,33 @@ jobs:
      fail-fast: false
      matrix:
        python-version: [ '3.12' ]
-        # TODO (huydhn): Add cu130 https://github.com/pytorch/pytorch/pull/162000#issuecomment-3261541554
+        # TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
        platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
        device: [ 'cu128', 'cu129' ]
        runner: [ 'linux.12xlarge.memory' ]
        include:
-          - device: cu128
+          - platform: manylinux_2_28_x86_64
            device: cu128
            manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.8'
-          - device: cu129
+            runner: linux.12xlarge.memory
          - platform: manylinux_2_28_x86_64
            device: cu129
            manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
-    name: "Build ${{ matrix.device }} vLLM wheel"
+            runner: linux.12xlarge.memory
          - platform: manylinux_2_28_aarch64
            device: cu128
            manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
            runner: linux.arm64.r7g.12xlarge.memory
          - platform: manylinux_2_28_aarch64
            device: cu129
            manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
            runner: linux.arm64.r7g.12xlarge.memory
    name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
    runs-on: ${{ matrix.runner }}
    timeout-minutes: 480
    env:
      PY_VERS: ${{ matrix.python-version }}
      MANYLINUX_IMAGE: ${{ matrix.manylinux-image }}
-      PLATFORM: 'manylinux_2_28_x86_64'
+      PLATFORM: ${{ matrix.platform }}
      BUILD_DEVICE: ${{ matrix.device }}
    steps:
      - name: Setup SSH (Click me for login details)
@ -136,7 +151,7 @@ jobs:
      - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
        with:
-          name: vllm-wheel-${{ matrix.device }}-${{ matrix.python-version }}-${{ env.PLATFORM }}
+          name: vllm-wheel-${{ matrix.device }}-${{ matrix.platform }}-${{ matrix.python-version }}
          if-no-files-found: error
          path: ${{ runner.temp }}/artifacts/externals/vllm/wheels/*.whl
@ -146,15 +161,17 @@ jobs:
  # Copied from build-triton-wheel workflow (mostly)
  upload-wheel:
-    name: "Upload ${{ matrix.device }} vLLM wheel"
+    name: "Upload ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
    needs:
      - build-wheel
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
        device: [ 'cu128', 'cu129' ]
    env:
      PLATFORM: ${{ matrix.platform }}
      BUILD_DEVICE: ${{ matrix.device }}
    permissions:
      id-token: write
@ -190,15 +207,15 @@ jobs:
        run: |
          set -eux
          mkdir -p "${RUNNER_TEMP}/artifacts/"
-          mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-*/* "${RUNNER_TEMP}/artifacts/"
+          mv "${RUNNER_TEMP}"/artifacts-all/vllm-wheel-"${BUILD_DEVICE}"-"${PLATFORM}"-*/* "${RUNNER_TEMP}/artifacts/"
-      - name: Set DRY_RUN (only for tagged pushes)
+      - name: Set DRY_RUN
-        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
+        if: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
        shell: bash
        run: |
          echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
-      - name: Set UPLOAD_CHANNEL (only for tagged pushes)
+      - name: Set UPLOAD_CHANNEL
        if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
        shell: bash
        run: |