mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 00:20:18 +01:00
[symm_mem] Create a dedicated ci flow for symmetric memory and only use 4 GPUs (#157181)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157181 Approved by: https://github.com/kwen2501, https://github.com/huydhn
This commit is contained in:
parent
88c6199db0
commit
8147c4a904
|
|
@ -327,6 +327,10 @@ test_h100_distributed() {
|
|||
time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
# This test requires multicast support
|
||||
time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
assert_git_not_dirty
|
||||
}
|
||||
|
||||
test_h100_symm_mem() {
|
||||
# symmetric memory test
|
||||
time python test/run_test.py --include distributed/test_symmetric_memory.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
time TORCH_SYMMMEM=NVSHMEM python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||
|
|
@ -1748,6 +1752,8 @@ elif [[ "${TEST_CONFIG}" == smoke ]]; then
|
|||
test_python_smoke
|
||||
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
|
||||
test_h100_distributed
|
||||
elif [[ "${TEST_CONFIG}" == test_h100_symm_mem ]]; then
|
||||
test_h100_symm_mem
|
||||
else
|
||||
install_torchvision
|
||||
install_monkeytype
|
||||
|
|
|
|||
1
.github/pytorch-probot.yml
vendored
1
.github/pytorch-probot.yml
vendored
|
|
@ -31,6 +31,7 @@ ciflow_push_tags:
|
|||
- ciflow/pull
|
||||
- ciflow/h100
|
||||
- ciflow/h100-distributed
|
||||
- ciflow/h100-symm-mem
|
||||
retryable_workflows:
|
||||
- pull
|
||||
- trunk
|
||||
|
|
|
|||
54
.github/workflows/h100-symm-mem.yml
vendored
Normal file
54
.github/workflows/h100-symm-mem.yml
vendored
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
name: Limited CI for symmetric memory tests on H100
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- .github/workflows/h100-symm-mem.yml
|
||||
workflow_dispatch:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/h100-symm-mem/*
|
||||
schedule:
|
||||
- cron: 22 8 * * * # daily at 08:22 UTC (about 1:22am PDT)
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
|
||||
get-label-type:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: get-label-type
|
||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
||||
with:
|
||||
triggering_actor: ${{ github.triggering_actor }}
|
||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '9.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "test_h100_symm_mem", shard: 1, num_shards: 1, runner: "linux.aws.h100.4" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-sm90-test:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
- linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
Loading…
Reference in New Issue
Block a user