mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-06 12:20:52 +01:00
Upload artifacts during test run (#125799)
Zip and upload artifacts while run_test is running Upgrade boto3 because I get errors about not having `botocore.vendored.six.move` if I don't Pull Request resolved: https://github.com/pytorch/pytorch/pull/125799 Approved by: https://github.com/huydhn
This commit is contained in:
parent
2e48788a35
commit
cc93c1e5e4
|
|
@ -5,7 +5,7 @@
|
||||||
#Pinned versions: 1.6
|
#Pinned versions: 1.6
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
boto3==1.19.12
|
boto3==1.35.42
|
||||||
#Description: AWS SDK for python
|
#Description: AWS SDK for python
|
||||||
#Pinned versions: 1.19.12, 1.16.34
|
#Pinned versions: 1.19.12, 1.16.34
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
|
||||||
|
|
@ -284,7 +284,7 @@ test_python_shard() {
|
||||||
|
|
||||||
# modify LD_LIBRARY_PATH to ensure it has the conda env.
|
# modify LD_LIBRARY_PATH to ensure it has the conda env.
|
||||||
# This set of tests has been shown to be buggy without it for the split-build
|
# This set of tests has been shown to be buggy without it for the split-build
|
||||||
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
|
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||||
|
|
||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
@ -310,7 +310,8 @@ test_dynamo_shard() {
|
||||||
--exclude-distributed-tests \
|
--exclude-distributed-tests \
|
||||||
--exclude-torch-export-tests \
|
--exclude-torch-export-tests \
|
||||||
--shard "$1" "$NUM_TEST_SHARDS" \
|
--shard "$1" "$NUM_TEST_SHARDS" \
|
||||||
--verbose
|
--verbose \
|
||||||
|
--upload-artifacts-while-running
|
||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ runs:
|
||||||
retry_wait_seconds: 30
|
retry_wait_seconds: 30
|
||||||
command: |
|
command: |
|
||||||
set -eu
|
set -eu
|
||||||
python3 -m pip install boto3==1.19.12
|
python3 -m pip install boto3==1.35.42
|
||||||
|
|
||||||
- name: Download the cache
|
- name: Download the cache
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,7 @@ runs:
|
||||||
retry_wait_seconds: 30
|
retry_wait_seconds: 30
|
||||||
command: |
|
command: |
|
||||||
set -eu
|
set -eu
|
||||||
python3 -m pip install boto3==1.19.12
|
python3 -m pip install boto3==1.35.42
|
||||||
|
|
||||||
- name: Upload the cache
|
- name: Upload the cache
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|
|
||||||
2
.github/requirements-gha-cache.txt
vendored
2
.github/requirements-gha-cache.txt
vendored
|
|
@ -4,7 +4,7 @@
|
||||||
# docs/cpp/requirements.txt
|
# docs/cpp/requirements.txt
|
||||||
# functorch/docs/requirements.txt
|
# functorch/docs/requirements.txt
|
||||||
# .ci/docker/requirements-ci.txt
|
# .ci/docker/requirements-ci.txt
|
||||||
boto3==1.19.12
|
boto3==1.35.42
|
||||||
jinja2==3.1.4
|
jinja2==3.1.4
|
||||||
lintrunner==0.10.7
|
lintrunner==0.10.7
|
||||||
ninja==1.10.0.post1
|
ninja==1.10.0.post1
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
boto3==1.19.12
|
boto3==1.35.42
|
||||||
hypothesis==6.56.4
|
hypothesis==6.56.4
|
||||||
expecttest==0.2.1
|
expecttest==0.2.1
|
||||||
fbscribelogger==0.1.6
|
fbscribelogger==0.1.6
|
||||||
|
|
|
||||||
3
.github/workflows/_linux-test.yml
vendored
3
.github/workflows/_linux-test.yml
vendored
|
|
@ -230,7 +230,7 @@ jobs:
|
||||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
||||||
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
|
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
|
||||||
IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
|
IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
|
||||||
|
ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
|
||||||
run: |
|
run: |
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
|
|
@ -289,6 +289,7 @@ jobs:
|
||||||
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
|
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
|
||||||
-e DASHBOARD_TAG \
|
-e DASHBOARD_TAG \
|
||||||
-e IS_A100_RUNNER \
|
-e IS_A100_RUNNER \
|
||||||
|
-e ARTIFACTS_FILE_SUFFIX \
|
||||||
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
||||||
--security-opt seccomp=unconfined \
|
--security-opt seccomp=unconfined \
|
||||||
--cap-add=SYS_PTRACE \
|
--cap-add=SYS_PTRACE \
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,7 @@ jobs:
|
||||||
cache: pip
|
cache: pip
|
||||||
|
|
||||||
- run: |
|
- run: |
|
||||||
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
|
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42
|
||||||
|
|
||||||
- name: Upload external contribution stats
|
- name: Upload external contribution stats
|
||||||
uses: nick-fields/retry@v3.0.0
|
uses: nick-fields/retry@v3.0.0
|
||||||
|
|
|
||||||
2
.github/workflows/target_determination.yml
vendored
2
.github/workflows/target_determination.yml
vendored
|
|
@ -70,7 +70,7 @@ jobs:
|
||||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||||
run: |
|
run: |
|
||||||
unzip -o .additional_ci_files/llm_results/mappings.zip -d .additional_ci_files/llm_results || true
|
unzip -o .additional_ci_files/llm_results/mappings.zip -d .additional_ci_files/llm_results || true
|
||||||
python3 -m pip install boto3==1.19.12
|
python3 -m pip install boto3==1.35.42
|
||||||
python3 tools/testing/do_target_determination_for_s3.py
|
python3 tools/testing/do_target_determination_for_s3.py
|
||||||
|
|
||||||
- name: Upload TD results to s3
|
- name: Upload TD results to s3
|
||||||
|
|
|
||||||
2
.github/workflows/update_pytorch_labels.yml
vendored
2
.github/workflows/update_pytorch_labels.yml
vendored
|
|
@ -29,5 +29,5 @@ jobs:
|
||||||
aws-region: us-east-1
|
aws-region: us-east-1
|
||||||
- name: Update PyTorch labels list in S3
|
- name: Update PyTorch labels list in S3
|
||||||
run: |
|
run: |
|
||||||
python3 -m pip install boto3==1.19.12
|
python3 -m pip install boto3==1.35.42
|
||||||
.github/scripts/export_pytorch_labels.py pytorch pytorch
|
.github/scripts/export_pytorch_labels.py pytorch pytorch
|
||||||
|
|
|
||||||
2
.github/workflows/upload-test-stats.yml
vendored
2
.github/workflows/upload-test-stats.yml
vendored
|
|
@ -53,7 +53,7 @@ jobs:
|
||||||
cache: pip
|
cache: pip
|
||||||
|
|
||||||
- run: |
|
- run: |
|
||||||
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
|
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42
|
||||||
|
|
||||||
- name: Upload test artifacts
|
- name: Upload test artifacts
|
||||||
id: upload-s3
|
id: upload-s3
|
||||||
|
|
|
||||||
|
|
@ -49,7 +49,7 @@ jobs:
|
||||||
cache: pip
|
cache: pip
|
||||||
|
|
||||||
- run: |
|
- run: |
|
||||||
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
|
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42
|
||||||
|
|
||||||
- name: Upload torch dynamo performance stats to S3
|
- name: Upload torch dynamo performance stats to S3
|
||||||
id: upload-s3
|
id: upload-s3
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ jobs:
|
||||||
cache: pip
|
cache: pip
|
||||||
|
|
||||||
- run: |
|
- run: |
|
||||||
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
|
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42
|
||||||
|
|
||||||
- name: Upload test stats
|
- name: Upload test stats
|
||||||
env:
|
env:
|
||||||
|
|
|
||||||
|
|
@ -70,6 +70,7 @@ from tools.testing.test_selections import (
|
||||||
ShardedTest,
|
ShardedTest,
|
||||||
THRESHOLD,
|
THRESHOLD,
|
||||||
)
|
)
|
||||||
|
from tools.testing.upload_artifacts import zip_and_upload_artifacts
|
||||||
|
|
||||||
|
|
||||||
# Make sure to remove REPO_ROOT after import is done
|
# Make sure to remove REPO_ROOT after import is done
|
||||||
|
|
@ -1331,6 +1332,10 @@ def parse_args():
|
||||||
action="store_false",
|
action="store_false",
|
||||||
help="Run tests without translation validation.",
|
help="Run tests without translation validation.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--upload-artifacts-while-running",
|
||||||
|
action="store_true",
|
||||||
|
)
|
||||||
|
|
||||||
group = parser.add_mutually_exclusive_group()
|
group = parser.add_mutually_exclusive_group()
|
||||||
group.add_argument(
|
group.add_argument(
|
||||||
|
|
@ -1677,6 +1682,8 @@ def run_tests(
|
||||||
|
|
||||||
def parallel_test_completion_callback(failure):
|
def parallel_test_completion_callback(failure):
|
||||||
test_failed = handle_error_messages(failure)
|
test_failed = handle_error_messages(failure)
|
||||||
|
if IS_CI and options.upload_artifacts_while_running:
|
||||||
|
zip_and_upload_artifacts(test_failed)
|
||||||
if (
|
if (
|
||||||
test_failed
|
test_failed
|
||||||
and not options.continue_through_error
|
and not options.continue_through_error
|
||||||
|
|
|
||||||
110
tools/testing/upload_artifacts.py
Normal file
110
tools/testing/upload_artifacts.py
Normal file
|
|
@ -0,0 +1,110 @@
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import zipfile
|
||||||
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, List
|
||||||
|
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).resolve().parent.parent.parent
|
||||||
|
LAST_UPDATED = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def get_s3_resource() -> Any:
    """Return a memoized boto3 S3 client.

    boto3 is imported lazily so that run_test.py does not require it unless
    artifact uploading is actually enabled.
    """
    import boto3  # type: ignore[import]

    client = boto3.client("s3")
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def zip_artifact(file_name: str, paths: List[str]) -> None:
    """Zip the files in the paths listed into file_name. The paths will be used
    in a glob and should be relative to REPO_ROOT."""
    # Lazily expand every pattern; each match is stored with a path relative
    # to the repo root so the archive layout mirrors the source tree.
    matches = (
        match
        for pattern in paths
        for match in glob.glob(f"{REPO_ROOT}/{pattern}", recursive=True)
    )
    with zipfile.ZipFile(file_name, "w") as archive:
        for match in matches:
            archive.write(match, os.path.relpath(match, REPO_ROOT))
|
||||||
|
|
||||||
|
|
||||||
|
def upload_to_s3_artifacts() -> None:
    """Zip the test reports, logs, and json results and upload them to the
    gha-artifacts S3 bucket.

    Reads GITHUB_RUN_ID, GITHUB_RUN_ATTEMPT, and ARTIFACTS_FILE_SUFFIX from the
    environment; prints a message and returns without uploading when any of
    them is missing (e.g. local runs).
    """
    workflow_id = os.environ.get("GITHUB_RUN_ID")
    workflow_run_attempt = os.environ.get("GITHUB_RUN_ATTEMPT")
    file_suffix = os.environ.get("ARTIFACTS_FILE_SUFFIX")
    if not workflow_id or not workflow_run_attempt or not file_suffix:
        print(
            "GITHUB_RUN_ID, GITHUB_RUN_ATTEMPT, or ARTIFACTS_FILE_SUFFIX not set, not uploading"
        )
        return

    test_reports_zip_path = f"{REPO_ROOT}/test-reports-{file_suffix}.zip"
    zip_artifact(
        test_reports_zip_path,
        ["test/test-reports/**/*.xml", "test/test-reports/**/*.csv"],
    )
    test_logs_zip_path = f"{REPO_ROOT}/logs-{file_suffix}.zip"
    zip_artifact(test_logs_zip_path, ["test/test-reports/**/*.log"])
    jsons_zip_path = f"{REPO_ROOT}/test-jsons-{file_suffix}.zip"
    zip_artifact(jsons_zip_path, ["test/test-reports/**/*.json"])

    s3_prefix = f"pytorch/pytorch/{workflow_id}/{workflow_run_attempt}/artifact"
    # BUG FIX: the original uploaded test_logs_zip_path a second time under the
    # jsons key, so test-jsons-*.zip was built but never uploaded. Uploading
    # each zip under its own name fixes that and removes the duplication.
    for zip_path in (test_reports_zip_path, test_logs_zip_path, jsons_zip_path):
        get_s3_resource().upload_file(
            zip_path,
            "gha-artifacts",
            f"{s3_prefix}/{Path(zip_path).name}",
        )
    # Empty marker object: signals downstream stats workflows that this run
    # has artifacts pending processing.
    get_s3_resource().put_object(
        Body=b"",
        Bucket="gha-artifacts",
        Key=f"workflows_failing_pending_upload/{workflow_id}.txt",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def zip_and_upload_artifacts(failed: bool) -> None:
    """Zip and upload artifacts when a test failed, or at most every 20 minutes
    otherwise.

    Not thread safe, but correctness of the LAST_UPDATED var doesn't really
    matter for this throttling purpose.
    """
    global LAST_UPDATED

    throttle_seconds = 20 * 60
    # Guard clause: skip unless a failure happened or the throttle expired.
    if not failed and time.time() - LAST_UPDATED <= throttle_seconds:
        return

    start = time.time()
    try:
        upload_to_s3_artifacts()
        LAST_UPDATED = time.time()
    except Exception as e:
        print(f"Failed to upload artifacts: {e}")
    print(f"Uploading artifacts took {time.time() - start:.2f} seconds")
|
||||||
|
|
||||||
|
|
||||||
|
def trigger_upload_test_stats_intermediate_workflow() -> None:
    """Dispatch the upload_test_stats_intermediate workflow via the GitHub API.

    The GITHUB_TOKEN cannot trigger workflows so this isn't used for now.
    """
    import requests

    print("Triggering upload_test_stats_intermediate workflow")
    auth_headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"Bearer {os.environ.get('GITHUB_TOKEN')}",
    }
    dispatch_payload = {
        "ref": "main",
        "inputs": {
            "workflow_run_id": os.environ.get("GITHUB_RUN_ID"),
            "workflow_run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
        },
    }
    response = requests.post(
        "https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches",
        headers=auth_headers,
        json=dispatch_payload,
    )
    print(response.text)
|
||||||
Loading…
Reference in New Issue
Block a user