diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 17e6e8525e1..4a92c47173d 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -5,7 +5,7 @@
 #Pinned versions: 1.6
 #test that import:

-boto3==1.19.12
+boto3==1.35.42
 #Description: AWS SDK for python
 #Pinned versions: 1.19.12, 1.16.34
 #test that import:
diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
index 10b7ed8b343..651de019fe4 100755
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@@ -284,7 +284,7 @@ test_python_shard() {

   # modify LD_LIBRARY_PATH to ensure it has the conda env.
   # This set of tests has been shown to be buggy without it for the split-build
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

   assert_git_not_dirty
 }
@@ -310,7 +310,8 @@ test_dynamo_shard() {
     --exclude-distributed-tests \
     --exclude-torch-export-tests \
     --shard "$1" "$NUM_TEST_SHARDS" \
-    --verbose
+    --verbose \
+    --upload-artifacts-while-running
   assert_git_not_dirty
 }

diff --git a/.github/actions/pytest-cache-download/action.yml b/.github/actions/pytest-cache-download/action.yml
index 1e75da9731d..1406f962c4c 100644
--- a/.github/actions/pytest-cache-download/action.yml
+++ b/.github/actions/pytest-cache-download/action.yml
@@ -26,7 +26,7 @@ runs:
         retry_wait_seconds: 30
         command: |
           set -eu
-          python3 -m pip install boto3==1.19.12
+          python3 -m pip install boto3==1.35.42

     - name: Download the cache
       shell: bash
diff --git a/.github/actions/pytest-cache-upload/action.yml b/.github/actions/pytest-cache-upload/action.yml
index 3b2a89dee7c..2652d019075 100644
--- a/.github/actions/pytest-cache-upload/action.yml
+++ b/.github/actions/pytest-cache-upload/action.yml
@@ -33,7 +33,7 @@ runs:
         retry_wait_seconds: 30
         command: |
           set -eu
-          python3 -m pip install boto3==1.19.12
+          python3 -m pip install boto3==1.35.42

     - name: Upload the cache
       shell: bash
diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt
index 5d1e4516056..e24a81cbfbc 100644
--- a/.github/requirements-gha-cache.txt
+++ b/.github/requirements-gha-cache.txt
@@ -4,7 +4,7 @@
 # docs/cpp/requirements.txt
 # functorch/docs/requirements.txt
 # .ci/docker/requirements-ci.txt
-boto3==1.19.12
+boto3==1.35.42
 jinja2==3.1.4
 lintrunner==0.10.7
 ninja==1.10.0.post1
diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt
index 03107d54164..f33bb515bd6 100644
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@@ -1,4 +1,4 @@
-boto3==1.19.12
+boto3==1.35.42
 hypothesis==6.56.4
 expecttest==0.2.1
 fbscribelogger==0.1.6
diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml
index cb577a5b094..eed89ae6ffa 100644
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@@ -230,7 +230,7 @@ jobs:
           HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
           SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
           IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
-
+          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
         run: |
           set -x

@@ -289,6 +289,7 @@ jobs:
           -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
           -e DASHBOARD_TAG \
           -e IS_A100_RUNNER \
+          -e ARTIFACTS_FILE_SUFFIX \
           --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
           --security-opt seccomp=unconfined \
           --cap-add=SYS_PTRACE \
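[Note: ARTIFACTS_FILE_SUFFIX, exported above and forwarded into the test container via -e ARTIFACTS_FILE_SUFFIX, is what keys each shard's intermediate uploads. A minimal sketch of the object names this produces, assuming made-up run and matrix values; the naming mirrors upload_to_s3_artifacts() in the new tools/testing/upload_artifacts.py:]

    import os

    # Illustrative values only; in CI these come from GitHub Actions.
    os.environ["GITHUB_RUN_ID"] = "11111111111"
    os.environ["GITHUB_RUN_ATTEMPT"] = "1"
    os.environ["ARTIFACTS_FILE_SUFFIX"] = "test-default-1-2-linux.2xlarge_22222222222"

    suffix = os.environ["ARTIFACTS_FILE_SUFFIX"]
    s3_prefix = (
        f"pytorch/pytorch/{os.environ['GITHUB_RUN_ID']}/"
        f"{os.environ['GITHUB_RUN_ATTEMPT']}/artifact"
    )
    # Mirrors the keys produced by upload_to_s3_artifacts() below:
    print(f"{s3_prefix}/test-reports-{suffix}.zip")
    print(f"{s3_prefix}/logs-{suffix}.zip")
    print(f"{s3_prefix}/test-jsons-{suffix}.zip")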
diff --git a/.github/workflows/nightly-rockset-uploads.yml b/.github/workflows/nightly-rockset-uploads.yml
index 4bcf6548a6b..b80c9d1c917 100644
--- a/.github/workflows/nightly-rockset-uploads.yml
+++ b/.github/workflows/nightly-rockset-uploads.yml
@@ -32,7 +32,7 @@ jobs:
           cache: pip

       - run: |
-          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42

       - name: Upload external contribution stats
         uses: nick-fields/retry@v3.0.0
diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml
index 686ecf41689..5556edcba5d 100644
--- a/.github/workflows/target_determination.yml
+++ b/.github/workflows/target_determination.yml
@@ -70,7 +70,7 @@ jobs:
           PR_NUMBER: ${{ github.event.pull_request.number }}
         run: |
           unzip -o .additional_ci_files/llm_results/mappings.zip -d .additional_ci_files/llm_results || true
-          python3 -m pip install boto3==1.19.12
+          python3 -m pip install boto3==1.35.42
           python3 tools/testing/do_target_determination_for_s3.py

       - name: Upload TD results to s3
diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml
index db09474fb21..7e017278955 100644
--- a/.github/workflows/update_pytorch_labels.yml
+++ b/.github/workflows/update_pytorch_labels.yml
@@ -29,5 +29,5 @@ jobs:
           aws-region: us-east-1
       - name: Update PyTorch labels list in S3
         run: |
-          python3 -m pip install boto3==1.19.12
+          python3 -m pip install boto3==1.35.42
           .github/scripts/export_pytorch_labels.py pytorch pytorch
diff --git a/.github/workflows/upload-test-stats.yml b/.github/workflows/upload-test-stats.yml
index f9e5593bf66..8d5072e054f 100644
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@@ -53,7 +53,7 @@ jobs:
           cache: pip

       - run: |
-          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42

       - name: Upload test artifacts
         id: upload-s3
diff --git a/.github/workflows/upload-torch-dynamo-perf-stats.yml b/.github/workflows/upload-torch-dynamo-perf-stats.yml
index b4b55a7b473..27a39ec3424 100644
--- a/.github/workflows/upload-torch-dynamo-perf-stats.yml
+++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml
@@ -49,7 +49,7 @@ jobs:
           cache: pip

       - run: |
-          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42

       - name: Upload torch dynamo performance stats to S3
         id: upload-s3
diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml
index d560f619db4..0c02e3c3723 100644
--- a/.github/workflows/upload_test_stats_intermediate.yml
+++ b/.github/workflows/upload_test_stats_intermediate.yml
@@ -28,7 +28,7 @@ jobs:
           cache: pip

       - run: |
-          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
+          pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42

       - name: Upload test stats
         env:
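[Note: the run_test.py hook below calls zip_and_upload_artifacts() from the per-test completion callback, so artifacts land in S3 immediately on a failure and otherwise at most once per 20-minute window. A distilled sketch of that throttling pattern, with hypothetical names; do_upload() stands in for the real upload_to_s3_artifacts():]

    import time

    UPLOAD_INTERVAL_SECONDS = 20 * 60  # same 20-minute window as the module below
    _last_upload = 0.0  # module-level timestamp, like LAST_UPDATED below


    def do_upload() -> None:
        # Hypothetical stand-in for upload_to_s3_artifacts().
        print("uploading...")


    def maybe_upload(failed: bool) -> None:
        """Upload on failure, otherwise only when the interval has elapsed."""
        global _last_upload
        if failed or time.time() - _last_upload > UPLOAD_INTERVAL_SECONDS:
            try:
                do_upload()
                _last_upload = time.time()  # advance only after a successful upload
            except Exception as e:
                print(f"upload failed: {e}")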
diff --git a/test/run_test.py b/test/run_test.py
index ce3f3a1ebe9..10be33fcfad 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -70,6 +70,7 @@ from tools.testing.test_selections import (
     ShardedTest,
     THRESHOLD,
 )
+from tools.testing.upload_artifacts import zip_and_upload_artifacts

 # Make sure to remove REPO_ROOT after import is done
 sys.path.remove(str(REPO_ROOT))
@@ -1331,6 +1332,10 @@ def parse_args():
         action="store_false",
         help="Run tests without translation validation.",
     )
+    parser.add_argument(
+        "--upload-artifacts-while-running",
+        action="store_true",
+    )

     group = parser.add_mutually_exclusive_group()
     group.add_argument(
@@ -1677,6 +1682,8 @@ def run_tests(

     def parallel_test_completion_callback(failure):
         test_failed = handle_error_messages(failure)
+        if IS_CI and options.upload_artifacts_while_running:
+            zip_and_upload_artifacts(test_failed)
         if (
             test_failed
             and not options.continue_through_error
diff --git a/tools/testing/upload_artifacts.py b/tools/testing/upload_artifacts.py
new file mode 100644
index 00000000000..2a226b1896d
--- /dev/null
+++ b/tools/testing/upload_artifacts.py
@@ -0,0 +1,110 @@
+import glob
+import os
+import time
+import zipfile
+from functools import lru_cache
+from pathlib import Path
+from typing import Any, List
+
+
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+LAST_UPDATED = 0.0
+
+
+@lru_cache(maxsize=1)
+def get_s3_resource() -> Any:
+    import boto3  # type: ignore[import]
+
+    return boto3.client("s3")
+
+
+def zip_artifact(file_name: str, paths: List[str]) -> None:
+    """Zip the files in the paths listed into file_name. The paths will be used
+    in a glob and should be relative to REPO_ROOT."""
+
+    with zipfile.ZipFile(file_name, "w") as f:
+        for path in paths:
+            for file in glob.glob(f"{REPO_ROOT}/{path}", recursive=True):
+                f.write(file, os.path.relpath(file, REPO_ROOT))
+
+
+def upload_to_s3_artifacts() -> None:
+    """Zip the test reports, logs, and JSONs and upload them to S3."""
+    workflow_id = os.environ.get("GITHUB_RUN_ID")
+    workflow_run_attempt = os.environ.get("GITHUB_RUN_ATTEMPT")
+    file_suffix = os.environ.get("ARTIFACTS_FILE_SUFFIX")
+    if not workflow_id or not workflow_run_attempt or not file_suffix:
+        print(
+            "GITHUB_RUN_ID, GITHUB_RUN_ATTEMPT, or ARTIFACTS_FILE_SUFFIX not set, not uploading"
+        )
+        return
+
+    test_reports_zip_path = f"{REPO_ROOT}/test-reports-{file_suffix}.zip"
+    zip_artifact(
+        test_reports_zip_path,
+        ["test/test-reports/**/*.xml", "test/test-reports/**/*.csv"],
+    )
+    test_logs_zip_path = f"{REPO_ROOT}/logs-{file_suffix}.zip"
+    zip_artifact(test_logs_zip_path, ["test/test-reports/**/*.log"])
+    jsons_zip_path = f"{REPO_ROOT}/test-jsons-{file_suffix}.zip"
+    zip_artifact(jsons_zip_path, ["test/test-reports/**/*.json"])
+
+    s3_prefix = f"pytorch/pytorch/{workflow_id}/{workflow_run_attempt}/artifact"
+    get_s3_resource().upload_file(
+        test_reports_zip_path,
+        "gha-artifacts",
+        f"{s3_prefix}/{Path(test_reports_zip_path).name}",
+    )
+    get_s3_resource().upload_file(
+        test_logs_zip_path,
+        "gha-artifacts",
+        f"{s3_prefix}/{Path(test_logs_zip_path).name}",
+    )
+    get_s3_resource().upload_file(
+        jsons_zip_path,
+        "gha-artifacts",
+        f"{s3_prefix}/{Path(jsons_zip_path).name}",
+    )
+    get_s3_resource().put_object(
+        Body=b"",
+        Bucket="gha-artifacts",
+        Key=f"workflows_failing_pending_upload/{workflow_id}.txt",
+    )
+
+
+def zip_and_upload_artifacts(failed: bool) -> None:
+    # Not thread safe, but exact accuracy of the LAST_UPDATED timestamp
+    # doesn't matter here.
+    # Upload if a test failed, or otherwise at most once every 20 minutes.
+    global LAST_UPDATED
+
+    if failed or time.time() - LAST_UPDATED > 20 * 60:
+        start = time.time()
+        try:
+            upload_to_s3_artifacts()
+            LAST_UPDATED = time.time()
+        except Exception as e:
+            print(f"Failed to upload artifacts: {e}")
+        print(f"Uploading artifacts took {time.time() - start:.2f} seconds")
+
+
+def trigger_upload_test_stats_intermediate_workflow() -> None:
+    import requests
+
+    # The GITHUB_TOKEN cannot trigger workflow dispatches, so this isn't used for now.
+    print("Triggering upload_test_stats_intermediate workflow")
+    response = requests.post(
+        "https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches",
+        headers={
+            "Accept": "application/vnd.github.v3+json",
+            "Authorization": f"Bearer {os.environ.get('GITHUB_TOKEN')}",
+        },
+        json={
+            "ref": "main",
+            "inputs": {
+                "workflow_run_id": os.environ.get("GITHUB_RUN_ID"),
+                "workflow_run_attempt": os.environ.get("GITHUB_RUN_ATTEMPT"),
+            },
+        },
+    )
+    print(response.text)
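[Note: a hypothetical local dry run of the new module, assuming AWS credentials that can write to the gha-artifacts bucket. The three environment variables are the ones upload_to_s3_artifacts() actually requires; the values are illustrative:]

    import os

    # Required by upload_to_s3_artifacts(); made-up values for illustration.
    os.environ["GITHUB_RUN_ID"] = "11111111111"
    os.environ["GITHUB_RUN_ATTEMPT"] = "1"
    os.environ["ARTIFACTS_FILE_SUFFIX"] = "local-test"

    from tools.testing.upload_artifacts import zip_and_upload_artifacts

    # failed=True forces an immediate upload, bypassing the 20-minute throttle.
    zip_and_upload_artifacts(failed=True)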