[symm_mem] Create a dedicated ci flow for symmetric memory and only use 4 GPUs (#157181)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157181 Approved by: https://github.com/kwen2501, https://github.com/huydhn
2025-12-06 12:20:52 +01:00 · 2025-06-27 22:10:44 -07:00 · 2025-06-27 22:10:44 -07:00 · 8147c4a904
commit 8147c4a904
parent 88c6199db0
3 changed files with 61 additions and 0 deletions
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -327,6 +327,10 @@ test_h100_distributed() {
  time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  # This test requires multicast support
  time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  assert_git_not_dirty
+}
+
+test_h100_symm_mem() {
  # symmetric memory test
  time python test/run_test.py --include distributed/test_symmetric_memory.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@ -1748,6 +1752,8 @@ elif [[ "${TEST_CONFIG}" == smoke ]]; then
  test_python_smoke
 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
+elif [[ "${TEST_CONFIG}" == test_h100_symm_mem ]]; then
+  test_h100_symm_mem
 else
  install_torchvision
  install_monkeytype
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -31,6 +31,7 @@ ciflow_push_tags:
 - ciflow/pull
 - ciflow/h100
 - ciflow/h100-distributed
+- ciflow/h100-symm-mem
 retryable_workflows:
 - pull
 - trunk
--- a/.github/workflows/h100-symm-mem.yml
+++ b/.github/workflows/h100-symm-mem.yml
@ -0,0 +1,54 @@
+name: Limited CI for symmetric memory tests on H100
+
+on:  # events that start this workflow
+  pull_request:  # limited by the paths filter to PRs that modify this workflow file
+    paths:
+      - .github/workflows/h100-symm-mem.yml
+  workflow_dispatch:  # manual runs
+  push:
+    tags:
+      - ciflow/h100-symm-mem/*  # tags under the ciflow/h100-symm-mem prefix
+  schedule:
+    - cron: 22 8 * * *  # daily at 08:22 UTC, about 1:22am PDT
+
+concurrency:  # group key = workflow name + PR number (or commit SHA) + is-manual flag + is-scheduled flag
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true  # a newer run in the same group cancels the one still in progress
+
+jobs:
+
+  get-label-type:  # its label-type output is passed to the build job as runner_prefix
+    if: github.repository_owner == 'pytorch'  # skipped when the repository owner is not pytorch
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main  # reusable workflow taken from the main branch
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}  # PR author, else issue author
+      curr_branch: ${{ github.head_ref || github.ref_name }}  # PR head branch, else the ref that triggered the run
+      curr_ref_type: ${{ github.ref_type }}
+
+  linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm:  # build job; also declares the test matrix consumed by the test job
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:  # the matrix "config" below must equal the TEST_CONFIG value that .ci/pytorch/test.sh dispatches to test_h100_symm_mem
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '9.0'  # quoted so the value stays a string
+      test-matrix: |
+        { include: [
+          { config: "test_h100_symm_mem", shard: 1, num_shards: 1, runner: "linux.aws.h100.4" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda12_8-py3_10-gcc11-sm90-test:  # test job; runs after the build job it lists under needs
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm  # same value the build job uses
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm.outputs.docker-image }}  # taken from the build job's outputs
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm.outputs.test-matrix }}  # taken from the build job's outputs
+    secrets: inherit