[CD] Enable Inductor performance test for xpu (#166289)

Add Dynamo benchmark performance tests for XPU backend
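For orientation: the XPU runs reuse the benchmark harness's existing --device flag; this PR mainly wires that flag through the CI scripts and workflows below. A minimal local invocation sketch (model choice and precision flag are illustrative, not part of this PR):

# Run a single HuggingFace model in performance mode on an XPU device;
# --device xpu is what the CI script below appends via DYNAMO_BENCHMARK_FLAGS.
python benchmarks/dynamo/huggingface.py \
    --performance --inference --bfloat16 \
    --device xpu --only BertForMaskedLM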

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166289
Approved by: https://github.com/EikanWang, https://github.com/atalman
commit 0d3a4f7155
parent 108bb224f7
Author: Wang, Chuanqi
Date: 2025-10-31 10:52:07 +00:00
Committed by: PyTorch MergeBot

11 changed files with 190 additions and 4 deletions

View File

@@ -195,13 +195,16 @@ case "$tag" in
     NINJA_VERSION=1.9.0
     TRITON=yes
     ;;
-  pytorch-linux-jammy-xpu-n-py3)
+  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=11
     VISION=yes
     XPU_VERSION=2025.2
     NINJA_VERSION=1.9.0
     TRITON=yes
+    if [[ $tag =~ "benchmarks" ]]; then
+      INDUCTOR_BENCHMARKS=yes
+    fi
     ;;
   pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.10
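The docker image build script dispatches on the image tag name, so (assuming the script's usual one-argument calling convention in this repo) the benchmark-enabled XPU image would be built with:

# The "-inductor-benchmarks" suffix matches the new case arm above and
# flips INDUCTOR_BENCHMARKS=yes before the image is built.
.ci/docker/build.sh pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks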

View File

@@ -54,12 +54,15 @@ ENV OPENSSL_DIR /opt/openssl
 RUN rm install_openssl.sh
 
 ARG INDUCTOR_BENCHMARKS
+ARG ANACONDA_PYTHON_VERSION
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
+COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
 
 # Install XPU Dependencies
 ARG XPU_VERSION

View File

@@ -426,7 +426,7 @@ fi
 if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
   # export test times so that potential sharded tests that'll branch off this build will use consistent data
   # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
-  python tools/stats/export_test_times.py
+  PYTHONPATH=. python tools/stats/export_test_times.py
 fi
 
 # don't do this for bazel or s390x or riscv64 as they don't use sccache
 if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then

View File

@@ -572,6 +572,8 @@ fi
 if [[ "${TEST_CONFIG}" == *cpu* ]]; then
   DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
+elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
 else
   DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi
@@ -665,6 +667,8 @@ test_perf_for_dashboard() {
     device=cuda_b200
   elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
     device=rocm
+  elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+    device=xpu
   fi
 
   for mode in "${modes[@]}"; do
@@ -1757,7 +1761,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   else
     # Do this after checkout_install_torchbench to ensure we clobber any
     # nightlies that torchbench may pull in
-    if [[ "${TEST_CONFIG}" != *cpu* ]]; then
+    if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
       install_torchrec_and_fbgemm
     fi
     PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
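The device plumbing above is a plain substring match on TEST_CONFIG; a self-contained sketch of that selection logic (the TEST_CONFIG value is illustrative):

# How a config name like "inductor_huggingface_perf_xpu" picks the device.
DYNAMO_BENCHMARK_FLAGS=()
TEST_CONFIG=inductor_huggingface_perf_xpu
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
else
  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
fi
echo "${DYNAMO_BENCHMARK_FLAGS[@]}"   # prints: --device xpu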

View File

@@ -19,6 +19,7 @@ ciflow_push_tags:
 - ciflow/inductor-perf-test-nightly-rocm-mi300
 - ciflow/inductor-perf-test-nightly-rocm-mi355
 - ciflow/inductor-perf-test-nightly-x86-zen
+- ciflow/inductor-perf-test-nightly-xpu
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
 - ciflow/linux-aarch64

View File

@@ -38,6 +38,10 @@ on:
         default: ""
         description: |
           List of tests to include (empty string implies default list)
+      dashboard-tag:
+        required: false
+        type: string
+        default: ""
       disable-monitor:
         description: |
           [Experimental] Disable utilization monitoring for tests.
@@ -58,6 +62,11 @@ on:
         required: false
         type: number
         default: 1
+  secrets:
+    HUGGING_FACE_HUB_TOKEN:
+      required: false
+      description: |
+        HF Auth token to avoid rate limits when downloading models or datasets from hub
 permissions:
   id-token: write
   contents: read
@@ -196,6 +205,8 @@ jobs:
           PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
           PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
           TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
+          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
         timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
         run: |
           # Fetch aws credential from IMDs
@@ -246,6 +257,8 @@ jobs:
             -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
             -e TESTS_TO_INCLUDE \
             -e ZE_AFFINITY_MASK \
+            -e HUGGING_FACE_HUB_TOKEN \
+            -e DASHBOARD_TAG \
             --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
             --ulimit stack=10485760:83886080 \
             --ulimit core=0 \
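The two new variables follow the workflow's existing pass-through pattern: exported into the job environment, then forwarded into the test container by name. A minimal sketch of that docker convention (image name reused from this PR, values illustrative):

# "-e NAME" with no value forwards the variable from the host environment,
# so the HF token never appears on the docker command line itself.
export DASHBOARD_TAG="training-true-inference-true"
export HUGGING_FACE_HUB_TOKEN="hf_xxx"   # placeholder
docker run --rm \
  -e DASHBOARD_TAG \
  -e HUGGING_FACE_HUB_TOKEN \
  ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks \
  env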

View File

@@ -67,6 +67,7 @@ jobs:
           pytorch-linux-jammy-py3.12-halide,
           pytorch-linux-jammy-xpu-n-1-py3,
           pytorch-linux-jammy-xpu-n-py3,
+          pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
           pytorch-linux-jammy-py3-clang18-asan,
           pytorch-linux-jammy-py3-clang12-onnx,
           pytorch-linux-jammy-linter,

View File

@@ -0,0 +1,148 @@
+name: inductor-perf-nightly-xpu
+
+on:
+  push:
+    tags:
+      - ciflow/inductor-perf-test-nightly-xpu/*
+  schedule:
+    - cron: 30 17 * * *
+  workflow_dispatch:
+    inputs:
+      training:
+        description: Run training (on by default)?
+        required: false
+        type: boolean
+        default: true
+      inference:
+        description: Run inference (on by default)?
+        required: false
+        type: boolean
+        default: true
+      default:
+        description: Run inductor_default?
+        required: false
+        type: boolean
+        default: false
+      dynamic:
+        description: Run inductor_dynamic_shapes?
+        required: false
+        type: boolean
+        default: false
+      cppwrapper:
+        description: Run inductor_cpp_wrapper?
+        required: false
+        type: boolean
+        default: false
+      cudagraphs:
+        description: Run inductor_cudagraphs?
+        required: false
+        type: boolean
+        default: false
+      freezing_cudagraphs:
+        description: Run inductor_cudagraphs with freezing for inference?
+        required: false
+        type: boolean
+        default: false
+      aotinductor:
+        description: Run aot_inductor for inference?
+        required: false
+        type: boolean
+        default: false
+      maxautotune:
+        description: Run inductor_max_autotune?
+        required: false
+        type: boolean
+        default: false
+      benchmark_configs:
+        description: The list of configs used in the benchmark
+        required: false
+        type: string
+        default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+      opt_out_experiments: lf
+
+  xpu-n-py3_10-inductor-benchmark-build:
+    name: xpu-n-py3.10-inductor-benchmark
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
+      runner: linux.c7i.12xlarge
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
+        ]}
+    secrets: inherit
+
+  xpu-n-py3_10-inductor-benchmark-test-nightly:
+    permissions:
+      id-token: write
+      contents: read
+    if: github.event_name != 'workflow_dispatch'
+    name: xpu-n-py3.10-inductor-benchmark
+    uses: ./.github/workflows/_xpu-test.yml
+    needs: xpu-n-py3_10-inductor-benchmark-build
+    with:
+      build-environment: linux-jammy-xpu-n-py3.10
+      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
+      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+      timeout-minutes: 720
+      # Disable monitor in perf tests, pending further investigation
+      disable-monitor: true
+      monitor-log-interval: 10
+      monitor-data-collect-interval: 2
+    secrets: inherit
+
+  xpu-n-py3_10-inductor-benchmark-test:
+    permissions:
+      id-token: write
+      contents: read
+    if: github.event_name == 'workflow_dispatch'
+    name: xpu-n-py3.10-inductor-test
+    uses: ./.github/workflows/_xpu-test.yml
+    needs: xpu-n-py3_10-inductor-benchmark-build
+    with:
+      build-environment: linux-jammy-xpu-n-py3.10
+      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
+      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+      timeout-minutes: 720
+      disable-monitor: false
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
+    secrets: inherit
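Besides the nightly cron, the workflow can be exercised on demand. Two hedged examples, assuming push rights on the repo and the GitHub CLI; the workflow file name is assumed from the ciflow tag, and input names come from the workflow_dispatch block above:

# Trigger via a ciflow tag push (matches the push.tags pattern above)
git tag ciflow/inductor-perf-test-nightly-xpu/$(git rev-parse HEAD)
git push origin ciflow/inductor-perf-test-nightly-xpu/$(git rev-parse HEAD)

# Or trigger manually, overriding a couple of the boolean inputs
gh workflow run inductor-perf-test-nightly-xpu.yml \
  -f training=true -f inference=true -f maxautotune=true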

View File

@@ -1837,6 +1837,10 @@ class BenchmarkRunner:
     def skip_models_for_cuda(self):
         return set()
 
+    @property
+    def skip_models_for_xpu(self):
+        return set()
+
     @property
     def skip_models_for_cpu(self):
         return set()
@@ -3927,6 +3931,8 @@ def run(runner, args, original_dir=None):
         runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
     elif args.devices == ["cuda"]:
         runner.skip_models.update(runner.skip_models_for_cuda)
+    elif args.devices == ["xpu"]:
+        runner.skip_models.update(runner.skip_models_for_xpu)
 
     if not args.multiprocess:
         runner.skip_models.update(runner.skip_multiprocess_models)

View File

@@ -124,6 +124,10 @@ class TorchBenchmarkRunner(BenchmarkRunner):
     def skip_models_for_cuda(self):
         return self._skip["device"]["cuda"]
 
+    @property
+    def skip_models_for_xpu(self):
+        return self._skip["device"]["xpu"]
+
     @property
     def skip_models_for_freezing_cuda(self):
         return self._skip["freezing"]["cuda"]

View File

@@ -217,6 +217,9 @@ skip:
     cuda: []
 
+    xpu:
+      - *DETECTRON2_MODELS
+
   test:
     training:
       - *DETECTRON2_MODELS
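With the xpu entry in place, the runner's skip_models_for_xpu property resolves to the DETECTRON2_MODELS group, so detectron2 runs on xpu should be skipped rather than executed. An illustrative check (model name taken from torchbench, not from this diff):

# Expect the harness to report this model as skipped when targeting xpu
python benchmarks/dynamo/torchbench.py --performance --inference \
    --device xpu --only detectron2_fcos_r_50_fpn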