[symm_mem] Create a dedicated ci flow for symmetric memory and only use 4 GPUs (#157181)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157181
Approved by: https://github.com/kwen2501, https://github.com/huydhn
This commit is contained in:
fduwjj 2025-06-27 22:10:44 -07:00 committed by PyTorch MergeBot
parent 88c6199db0
commit 8147c4a904
3 changed files with 61 additions and 0 deletions

View File

@ -327,6 +327,10 @@ test_h100_distributed() {
time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
# This test requires multicast support
time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_h100_symm_mem() {
# symmetric memory test
time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@ -1748,6 +1752,8 @@ elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == test_h100_symm_mem ]]; then
test_h100_symm_mem
else
install_torchvision
install_monkeytype

View File

@ -31,6 +31,7 @@ ciflow_push_tags:
- ciflow/pull
- ciflow/h100
- ciflow/h100-distributed
- ciflow/h100-symm-mem
retryable_workflows:
- pull
- trunk

54
.github/workflows/h100-symm-mem.yml vendored Normal file
View File

@ -0,0 +1,54 @@
# Dedicated CI flow for the symmetric memory tests on H100 runners.
name: Limited CI for symmetric memory tests on H100

on:
  # Pull requests trigger this workflow only when they touch this file.
  pull_request:
    paths:
      - .github/workflows/h100-symm-mem.yml
  # Allows manual runs.
  workflow_dispatch:
  # Runs for pushed tags under the ciflow/h100-symm-mem/ prefix.
  push:
    tags:
      - ciflow/h100-symm-mem/*
  # Daily scheduled run; GitHub evaluates cron expressions in UTC.
  schedule:
    - cron: 22 8 * * *  # about 1:22am PDT

# Runs are grouped by workflow name, PR number (or commit SHA when there is
# no PR), and by whether the event was a manual dispatch or a schedule; a
# newer run in the same group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true
jobs:
  # Calls the runner-determinator reusable workflow; its `label-type` output
  # is consumed below as the build job's runner prefix. Skipped outside the
  # pytorch organization.
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

  # Build job for CUDA arch 9.0; it also declares the test matrix that the
  # test job below reads back from this job's outputs.
  linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm:
    name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '9.0'
      # `config` must equal the TEST_CONFIG value that the CI test script
      # dispatches on. The script calls test_h100_symm_mem only when
      # TEST_CONFIG is "test_h100_symm_mem"; "h100_distributed" would run
      # the distributed suite instead of the symmetric memory tests.
      # NOTE(review): presumably `config` is exported as TEST_CONFIG by
      # _linux-test.yml - confirm.
      test-matrix: |
        { include: [
          { config: "test_h100_symm_mem", shard: 1, num_shards: 1, runner: "linux.aws.h100.4" },
        ]}
    secrets: inherit

  # Test job: runs the matrix declared by the build job, using the docker
  # image that job reports in its outputs.
  linux-jammy-cuda12_8-py3_10-gcc11-sm90-test:
    name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
    uses: ./.github/workflows/_linux-test.yml
    needs:
      - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm.outputs.test-matrix }}
    secrets: inherit