Upload artifacts during test run (#125799)

Zip and upload artifacts while run_test is running
Upgrade boto3 because I get errors about not having `botocore.vendored.six.moves` if I don't
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125799
Approved by: https://github.com/huydhn
Catherine Lee 2024-10-22 16:48:57 +00:00 committed by PyTorch MergeBot
parent 2e48788a35
commit cc93c1e5e4
15 changed files with 133 additions and 14 deletions
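
The boto3 bump is what makes the `botocore.vendored.six.moves` error go away, presumably because the old boto3==1.19.12 pin ends up resolved alongside a botocore that no longer provides that vendored module. A quick probe for the failing import, as a sketch (not part of this commit):

    # Probe for the import the commit message says was missing.
    try:
        import botocore.vendored.six.moves  # noqa: F401
        print("vendored six.moves present; the old boto3 pin would still work")
    except ImportError:
        print("botocore.vendored.six.moves missing; bump boto3 (here: 1.35.42)")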

@@ -5,7 +5,7 @@
 #Pinned versions: 1.6
 #test that import:
-boto3==1.19.12
+boto3==1.35.42
 #Description: AWS SDK for python
 #Pinned versions: 1.19.12, 1.16.34
 #test that import:

@@ -284,7 +284,7 @@ test_python_shard() {
   # modify LD_LIBRARY_PATH to ensure it has the conda env.
   # This set of tests has been shown to be buggy without it for the split-build
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
   assert_git_not_dirty
 }

@@ -310,7 +310,8 @@ test_dynamo_shard() {
     --exclude-distributed-tests \
     --exclude-torch-export-tests \
     --shard "$1" "$NUM_TEST_SHARDS" \
-    --verbose
+    --verbose \
+    --upload-artifacts-while-running
   assert_git_not_dirty
 }

@@ -26,7 +26,7 @@ runs:
       retry_wait_seconds: 30
       command: |
         set -eu
-        python3 -m pip install boto3==1.19.12
+        python3 -m pip install boto3==1.35.42
   - name: Download the cache
     shell: bash

@@ -33,7 +33,7 @@ runs:
       retry_wait_seconds: 30
       command: |
         set -eu
-        python3 -m pip install boto3==1.19.12
+        python3 -m pip install boto3==1.35.42
   - name: Upload the cache
     shell: bash

@@ -4,7 +4,7 @@
 # docs/cpp/requirements.txt
 # functorch/docs/requirements.txt
 # .ci/docker/requirements-ci.txt
-boto3==1.19.12
+boto3==1.35.42
 jinja2==3.1.4
 lintrunner==0.10.7
 ninja==1.10.0.post1

@@ -1,4 +1,4 @@
-boto3==1.19.12
+boto3==1.35.42
 hypothesis==6.56.4
 expecttest==0.2.1
 fbscribelogger==0.1.6

@@ -230,7 +230,8 @@ jobs:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
+         ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        run: |
          set -x

@@ -289,6 +289,7 @@ jobs:
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e DASHBOARD_TAG \
            -e IS_A100_RUNNER \
+           -e ARTIFACTS_FILE_SUFFIX \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
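
ARTIFACTS_FILE_SUFFIX gives each shard a unique name for its zips; tools/testing/upload_artifacts.py (added below) embeds it in the zip file names. Roughly, with made-up matrix values:

    # Sketch of the suffix the workflow assembles above (all values hypothetical).
    job, config, shard, num_shards, runner = "test", "default", "1", "5", "linux.g5.4xlarge"
    job_id = "9876543210"  # from the get-job-id step
    suffix = f"{job}-{config}-{shard}-{num_shards}-{runner}_{job_id}"
    print(f"test-reports-{suffix}.zip")  # the name upload_artifacts.py will produce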

@@ -32,7 +32,7 @@ jobs:
          cache: pip
      - run: |
-         pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
+         pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42
      - name: Upload external contribution stats
        uses: nick-fields/retry@v3.0.0

@@ -70,7 +70,7 @@ jobs:
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          unzip -o .additional_ci_files/llm_results/mappings.zip -d .additional_ci_files/llm_results || true
-         python3 -m pip install boto3==1.19.12
+         python3 -m pip install boto3==1.35.42
          python3 tools/testing/do_target_determination_for_s3.py
      - name: Upload TD results to s3

@@ -29,5 +29,5 @@ jobs:
          aws-region: us-east-1
      - name: Update PyTorch labels list in S3
        run: |
-         python3 -m pip install boto3==1.19.12
+         python3 -m pip install boto3==1.35.42
          .github/scripts/export_pytorch_labels.py pytorch pytorch

@@ -53,7 +53,7 @@ jobs:
          cache: pip
      - run: |
-         pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
+         pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42
      - name: Upload test artifacts
        id: upload-s3

@@ -49,7 +49,7 @@ jobs:
          cache: pip
      - run: |
-         pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
+         pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42
      - name: Upload torch dynamo performance stats to S3
        id: upload-s3

@@ -28,7 +28,7 @@ jobs:
          cache: pip
      - run: |
-         pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
+         pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42
      - name: Upload test stats
        env:

@@ -70,6 +70,7 @@ from tools.testing.test_selections import (
     ShardedTest,
     THRESHOLD,
 )
+from tools.testing.upload_artifacts import zip_and_upload_artifacts

 # Make sure to remove REPO_ROOT after import is done

@@ -1331,6 +1332,10 @@ def parse_args():
         action="store_false",
         help="Run tests without translation validation.",
     )
+    parser.add_argument(
+        "--upload-artifacts-while-running",
+        action="store_true",
+    )

     group = parser.add_mutually_exclusive_group()
     group.add_argument(
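
The new flag is spelled with dashes on the command line but is read back as options.upload_artifacts_while_running in the hunk below; argparse converts dashes to underscores when deriving the attribute name. A standalone sketch:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--upload-artifacts-while-running", action="store_true")
    opts = parser.parse_args(["--upload-artifacts-while-running"])
    assert opts.upload_artifacts_while_running  # dashes become underscores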
@@ -1677,6 +1682,8 @@ def run_tests(
     def parallel_test_completion_callback(failure):
         test_failed = handle_error_messages(failure)
+        if IS_CI and options.upload_artifacts_while_running:
+            zip_and_upload_artifacts(test_failed)
         if (
             test_failed
             and not options.continue_through_error

@@ -0,0 +1,110 @@
import glob
import os
import time
import zipfile
from functools import lru_cache
from pathlib import Path
from typing import Any, List

REPO_ROOT = Path(__file__).resolve().parent.parent.parent
LAST_UPDATED = 0.0


@lru_cache(maxsize=1)
def get_s3_resource() -> Any:
    import boto3  # type: ignore[import]

    # Despite the name, this is a boto3 S3 client.
    return boto3.client("s3")


def zip_artifact(file_name: str, paths: List[str]) -> None:
    """Zip the files in the paths listed into file_name. The paths will be used
    in a glob and should be relative to REPO_ROOT."""
    with zipfile.ZipFile(file_name, "w") as f:
        for path in paths:
            for file in glob.glob(f"{REPO_ROOT}/{path}", recursive=True):
                f.write(file, os.path.relpath(file, REPO_ROOT))


def upload_to_s3_artifacts() -> None:
    """Zip up the test artifacts (reports, logs, jsons) and upload them to S3."""
    workflow_id = os.environ.get("GITHUB_RUN_ID")
    workflow_run_attempt = os.environ.get("GITHUB_RUN_ATTEMPT")
    file_suffix = os.environ.get("ARTIFACTS_FILE_SUFFIX")
    if not workflow_id or not workflow_run_attempt or not file_suffix:
        print(
            "GITHUB_RUN_ID, GITHUB_RUN_ATTEMPT, or ARTIFACTS_FILE_SUFFIX not set, not uploading"
        )
        return

    test_reports_zip_path = f"{REPO_ROOT}/test-reports-{file_suffix}.zip"
    zip_artifact(
        test_reports_zip_path,
        ["test/test-reports/**/*.xml", "test/test-reports/**/*.csv"],
    )
    test_logs_zip_path = f"{REPO_ROOT}/logs-{file_suffix}.zip"
    zip_artifact(test_logs_zip_path, ["test/test-reports/**/*.log"])
    jsons_zip_path = f"{REPO_ROOT}/test-jsons-{file_suffix}.zip"
    zip_artifact(jsons_zip_path, ["test/test-reports/**/*.json"])

    s3_prefix = f"pytorch/pytorch/{workflow_id}/{workflow_run_attempt}/artifact"
    get_s3_resource().upload_file(
        test_reports_zip_path,
        "gha-artifacts",
        f"{s3_prefix}/{Path(test_reports_zip_path).name}",
    )
    get_s3_resource().upload_file(
        test_logs_zip_path,
        "gha-artifacts",
        f"{s3_prefix}/{Path(test_logs_zip_path).name}",
    )
    get_s3_resource().upload_file(
        jsons_zip_path,
        "gha-artifacts",
        f"{s3_prefix}/{Path(jsons_zip_path).name}",
    )
    # Empty marker object, keyed by workflow id.
    get_s3_resource().put_object(
        Body=b"",
        Bucket="gha-artifacts",
        Key=f"workflows_failing_pending_upload/{workflow_id}.txt",
    )


def zip_and_upload_artifacts(failed: bool) -> None:
    # Not thread safe, but correctness of the LAST_UPDATED var doesn't really
    # matter for this.
    # Upload if a test failed or every 20 minutes.
    global LAST_UPDATED
    if failed or time.time() - LAST_UPDATED > 20 * 60:
        start = time.time()
        try:
            upload_to_s3_artifacts()
            LAST_UPDATED = time.time()
        except Exception as e:
            print(f"Failed to upload artifacts: {e}")
        print(f"Uploading artifacts took {time.time() - start:.2f} seconds")


def trigger_upload_test_stats_intermediate_workflow() -> None:
    import requests

    # The GITHUB_TOKEN cannot trigger workflows, so this isn't used for now.
    print("Triggering upload_test_stats_intermediate workflow")
    response = requests.post(
        "https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches",
        headers={
            "Accept": "application/vnd.github.v3+json",
            "Authorization": f"Bearer {os.environ.get('GITHUB_TOKEN')}",
        },
        json={
            "ref": "main",
            "inputs": {
                "workflow_run_id": os.environ.get("GITHUB_RUN_ID"),
                "workflow_run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
            },
        },
    )
    print(response.text)
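
For reference, a sketch of driving the uploader by hand, assuming the environment variables the workflow exports above (all values here are made up):

    import os

    # Hypothetical values; in CI these come from GitHub Actions.
    os.environ["GITHUB_RUN_ID"] = "11111111111"
    os.environ["GITHUB_RUN_ATTEMPT"] = "1"
    os.environ["ARTIFACTS_FILE_SUFFIX"] = "test-default-1-5-linux.g5.4xlarge_9876543210"

    from tools.testing.upload_artifacts import zip_and_upload_artifacts

    zip_and_upload_artifacts(failed=True)   # a failure forces an immediate upload
    zip_and_upload_artifacts(failed=False)  # otherwise throttled to one upload per 20 minutes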