diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index a23c85bc60a..d0500b89780 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -195,13 +195,16 @@ case "$tag" in
     NINJA_VERSION=1.9.0
     TRITON=yes
     ;;
-  pytorch-linux-jammy-xpu-n-py3)
+  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.10
     GCC_VERSION=11
     VISION=yes
     XPU_VERSION=2025.2
     NINJA_VERSION=1.9.0
     TRITON=yes
+    if [[ $tag =~ "benchmarks" ]]; then
+      INDUCTOR_BENCHMARKS=yes
+    fi
     ;;
   pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
     ANACONDA_PYTHON_VERSION=3.10
diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile
index 8765249688c..af11992a916 100644
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@@ -54,12 +54,15 @@ ENV OPENSSL_DIR /opt/openssl
 RUN rm install_openssl.sh
 
 ARG INDUCTOR_BENCHMARKS
+ARG ANACONDA_PYTHON_VERSION
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
+COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
 
 # Install XPU Dependencies
 ARG XPU_VERSION
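One subtlety in the `case` arm above: because the right-hand side of `=~` is quoted, bash matches it as a literal substring rather than a regular expression, so any tag containing `benchmarks` enables the flag. Expressed in Python for illustration only:

    # Illustrative equivalent of the quoted bash test `[[ $tag =~ "benchmarks" ]]`:
    # quoting the pattern makes it a plain substring match, not a regex.
    tag = "pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks"  # example tag
    INDUCTOR_BENCHMARKS = "yes" if "benchmarks" in tag else ""
    assert INDUCTOR_BENCHMARKS == "yes"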
diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh
index cae81a2568d..d66aa1120fb 100755
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@@ -426,7 +426,7 @@ fi
 if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
   # export test times so that potential sharded tests that'll branch off this build will use consistent data
   # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
-  python tools/stats/export_test_times.py
+  PYTHONPATH=. python tools/stats/export_test_times.py
 fi
 # don't do this for bazel or s390x or riscv64 as they don't use sccache
 if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
index 40dc90f2eb2..3d72684914b 100755
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@@ -572,6 +572,8 @@ fi
 
 if [[ "${TEST_CONFIG}" == *cpu* ]]; then
   DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
+elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+  DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
 else
   DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi
@@ -665,6 +667,8 @@ test_perf_for_dashboard() {
     device=cuda_b200
   elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
     device=rocm
+  elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
+    device=xpu
   fi
 
   for mode in "${modes[@]}"; do
@@ -1757,7 +1761,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
   else
     # Do this after checkout_install_torchbench to ensure we clobber any
     # nightlies that torchbench may pull in
-    if [[ "${TEST_CONFIG}" != *cpu* ]]; then
+    if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
       install_torchrec_and_fbgemm
     fi
     PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml
index 5d9f065264d..c15ba606398 100644
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@@ -19,6 +19,7 @@ ciflow_push_tags:
 - ciflow/inductor-perf-test-nightly-rocm-mi300
 - ciflow/inductor-perf-test-nightly-rocm-mi355
 - ciflow/inductor-perf-test-nightly-x86-zen
+- ciflow/inductor-perf-test-nightly-xpu
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
 - ciflow/linux-aarch64
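The two `test.sh` hunks above route benchmark runs to a device based on the `TEST_CONFIG` name, so an XPU config ends up invoking the harness with `--device xpu`. A minimal sketch of that routing (illustrative Python; the script itself uses bash pattern matching, and the config names are examples):

    # Sketch of the TEST_CONFIG -> --device routing added above.
    def benchmark_device(test_config: str) -> str:
        if "cpu" in test_config:
            return "cpu"
        if "xpu" in test_config:
            return "xpu"
        return "cuda"  # the fallback branch, as in the script

    assert benchmark_device("inductor_torchbench_perf_xpu") == "xpu"
    assert benchmark_device("inductor_huggingface_perf") == "cuda"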
diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml
index 7aa76089244..e68bc6ead3a 100644
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@@ -38,6 +38,10 @@ on:
         default: ""
         description: |
           List of tests to include (empty string implies default list)
+      dashboard-tag:
+        required: false
+        type: string
+        default: ""
       disable-monitor:
         description: |
           [Experimental] Disable utilization monitoring for tests.
@@ -58,6 +62,11 @@ on:
         required: false
         type: number
         default: 1
+    secrets:
+      HUGGING_FACE_HUB_TOKEN:
+        required: false
+        description: |
+          HF auth token to avoid rate limits when downloading models or datasets from the hub
 permissions:
   id-token: write
   contents: read
@@ -196,6 +205,8 @@ jobs:
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
+          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
         timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
         run: |
           # Fetch aws credential from IMDs
@@ -246,6 +257,8 @@ jobs:
           -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
           -e TESTS_TO_INCLUDE \
           -e ZE_AFFINITY_MASK \
+          -e HUGGING_FACE_HUB_TOKEN \
+          -e DASHBOARD_TAG \
           --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
           --ulimit stack=10485760:83886080 \
           --ulimit core=0 \
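Passing `HUGGING_FACE_HUB_TOKEN` through the job secrets and into the container lets model downloads authenticate without an explicit login step: `huggingface_hub` reads the token from the environment (`HUGGING_FACE_HUB_TOKEN` is the legacy variable name it honors). A minimal sketch of what the benchmark process sees, assuming the `-e` passthrough above:

    import os

    # Inside the test container the token is a plain environment variable;
    # hub client libraries pick it up automatically, so no login call is needed.
    token = os.environ.get("HUGGING_FACE_HUB_TOKEN", "")
    if not token:
        print("unauthenticated: hub downloads may be rate limited")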
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 76ac14f509c..6fbe2e846d4 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -67,6 +67,7 @@ jobs:
           pytorch-linux-jammy-py3.12-halide,
           pytorch-linux-jammy-xpu-n-1-py3,
           pytorch-linux-jammy-xpu-n-py3,
+          pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
           pytorch-linux-jammy-py3-clang18-asan,
           pytorch-linux-jammy-py3-clang12-onnx,
           pytorch-linux-jammy-linter,
diff --git a/.github/workflows/inductor-perf-test-nightly-xpu.yml b/.github/workflows/inductor-perf-test-nightly-xpu.yml
new file mode 100644
index 00000000000..c2db8c310e3
--- /dev/null
+++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml
@@ -0,0 +1,148 @@
+name: inductor-perf-nightly-xpu
+
+on:
+  push:
+    tags:
+      - ciflow/inductor-perf-test-nightly-xpu/*
+  schedule:
+    - cron: 30 17 * * *
+  workflow_dispatch:
+    inputs:
+      training:
+        description: Run training (on by default)?
+        required: false
+        type: boolean
+        default: true
+      inference:
+        description: Run inference (on by default)?
+        required: false
+        type: boolean
+        default: true
+      default:
+        description: Run inductor_default?
+        required: false
+        type: boolean
+        default: false
+      dynamic:
+        description: Run inductor_dynamic_shapes?
+        required: false
+        type: boolean
+        default: false
+      cppwrapper:
+        description: Run inductor_cpp_wrapper?
+        required: false
+        type: boolean
+        default: false
+      cudagraphs:
+        description: Run inductor_cudagraphs?
+        required: false
+        type: boolean
+        default: false
+      freezing_cudagraphs:
+        description: Run inductor_cudagraphs with freezing for inference?
+        required: false
+        type: boolean
+        default: false
+      aotinductor:
+        description: Run aot_inductor for inference?
+        required: false
+        type: boolean
+        default: false
+      maxautotune:
+        description: Run inductor_max_autotune?
+        required: false
+        type: boolean
+        default: false
+      benchmark_configs:
+        description: The list of configs used in the benchmark
+        required: false
+        type: string
+        default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+      opt_out_experiments: lf
+
+  xpu-n-py3_10-inductor-benchmark-build:
+    name: xpu-n-py3.10-inductor-benchmark
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
+      runner: linux.c7i.12xlarge
+      test-matrix: |
+        { include: [
+          { config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
+          { config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
+        ]}
+    secrets: inherit
+
+  xpu-n-py3_10-inductor-benchmark-test-nightly:
+    permissions:
+      id-token: write
+      contents: read
+    if: github.event_name != 'workflow_dispatch'
+    name: xpu-n-py3.10-inductor-benchmark
+    uses: ./.github/workflows/_xpu-test.yml
+    needs: xpu-n-py3_10-inductor-benchmark-build
+    with:
+      build-environment: linux-jammy-xpu-n-py3.10
+      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
+      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+      timeout-minutes: 720
+      # Disable the monitor in perf tests, pending further investigation
+      disable-monitor: true
+      monitor-log-interval: 10
+      monitor-data-collect-interval: 2
+    secrets: inherit
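The `dashboard-tag` is a flat string of alternating scenario/value segments: the scheduled job above hard-codes it, while the `workflow_dispatch` job below assembles it from the inputs. A hypothetical decoder (not part of the CI scripts, which consume `DASHBOARD_TAG` in shell) shows the encoding; splitting on hyphens works because scenario names only ever contain underscores:

    # Hypothetical decoder for the dashboard-tag format used above.
    tag = ("training-true-inference-true-default-true-dynamic-true-"
           "cudagraphs-false-cppwrapper-true-aotinductor-true-"
           "freezing_cudagraphs-false-cudagraphs_low_precision-false")
    parts = tag.split("-")
    flags = dict(zip(parts[0::2], parts[1::2]))
    assert flags["training"] == "true"
    assert flags["cudagraphs_low_precision"] == "false"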
+
+  xpu-n-py3_10-inductor-benchmark-test:
+    permissions:
+      id-token: write
+      contents: read
+    if: github.event_name == 'workflow_dispatch'
+    name: xpu-n-py3.10-inductor-test
+    uses: ./.github/workflows/_xpu-test.yml
+    needs: xpu-n-py3_10-inductor-benchmark-build
+    with:
+      build-environment: linux-jammy-xpu-n-py3.10
+      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
+      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
+      timeout-minutes: 720
+      disable-monitor: false
+      monitor-log-interval: 15
+      monitor-data-collect-interval: 4
+    secrets: inherit
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 54900de1ed9..e0681f52586 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -1837,6 +1837,10 @@ class BenchmarkRunner:
     def skip_models_for_cuda(self):
         return set()
 
+    @property
+    def skip_models_for_xpu(self):
+        return set()
+
     @property
     def skip_models_for_cpu(self):
         return set()
@@ -3927,6 +3931,8 @@ def run(runner, args, original_dir=None):
         runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
     elif args.devices == ["cuda"]:
         runner.skip_models.update(runner.skip_models_for_cuda)
+    elif args.devices == ["xpu"]:
+        runner.skip_models.update(runner.skip_models_for_xpu)
 
     if not args.multiprocess:
         runner.skip_models.update(runner.skip_multiprocess_models)
diff --git a/benchmarks/dynamo/torchbench.py b/benchmarks/dynamo/torchbench.py
index da6a3e1336a..ac4ddb40884 100755
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@@ -124,6 +124,10 @@ class TorchBenchmarkRunner(BenchmarkRunner):
     def skip_models_for_cuda(self):
         return self._skip["device"]["cuda"]
 
+    @property
+    def skip_models_for_xpu(self):
+        return self._skip["device"]["xpu"]
+
     @property
     def skip_models_for_freezing_cuda(self):
         return self._skip["freezing"]["cuda"]
diff --git a/benchmarks/dynamo/torchbench.yaml b/benchmarks/dynamo/torchbench.yaml
index c2324eddc38..b31a85ae267 100644
--- a/benchmarks/dynamo/torchbench.yaml
+++ b/benchmarks/dynamo/torchbench.yaml
@@ -217,6 +217,9 @@ skip:
 
   cuda: []
 
+  xpu:
+    - *DETECTRON2_MODELS
+
   test:
     training:
       - *DETECTRON2_MODELS
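Taken together, the last three hunks let the benchmark runner drop models that are not expected to work on XPU: `torchbench.yaml` lists the skips (`*DETECTRON2_MODELS` expands a YAML anchor defined earlier in that file), `torchbench.py` exposes them as `skip_models_for_xpu`, and `common.py` applies them when `--devices xpu` is passed. A condensed sketch of that flow, with a stand-in dict for the parsed YAML and an illustrative model name:

    # Condensed sketch of the skip plumbing added above.
    class TorchBenchmarkRunner:
        def __init__(self, skip_config):
            self._skip = skip_config  # stands in for the parsed torchbench.yaml
            self.skip_models = set()

        @property
        def skip_models_for_xpu(self):
            return self._skip["device"]["xpu"]

    runner = TorchBenchmarkRunner({"device": {"cuda": [], "xpu": ["detectron2_maskrcnn"]}})
    devices = ["xpu"]  # i.e. --devices xpu on the command line
    if devices == ["xpu"]:
        runner.skip_models.update(runner.skip_models_for_xpu)
    assert "detectron2_maskrcnn" in runner.skip_models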