diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 3a42298cdf3..0fc60cb31e2 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -128,7 +128,7 @@ def gh_fetch_json_dict( def gh_graphql(query: str, **kwargs: Any) -> dict[str, Any]: rc = gh_fetch_url( - "https://api.github.com/graphql", + "https://api.github.com/graphql", # @lint-ignore data={"query": query, "variables": kwargs}, reader=json.load, ) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9e23c0fb72..bfff2e0ce1b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,7 @@ endif() # This define is needed to preserve behavior given anticipated changes to # cccl/thrust -# https://nvidia.github.io/libcudacxx/standard_api/numerics_library/complex.html +# https://nvidia.github.io/cccl/libcudacxx/standard_api/numerics_library/complex.html string(APPEND CMAKE_CUDA_FLAGS " -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS") diff --git a/README.md b/README.md index 58327c176f3..47841da7368 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ If you want to compile with CUDA support, [select a supported version of CUDA fr - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA -Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware +Note: Refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for the cuDNN versions supported by the various CUDA versions, CUDA drivers, and NVIDIA hardware. If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. Other potentially useful environment variables may be found in `setup.py`.
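For context on the `gh_graphql` helper whose URL is annotated with `# @lint-ignore` in the first hunk above: it POSTs a query plus keyword arguments (as GraphQL variables) to the GitHub GraphQL endpoint and decodes the JSON reply. Below is a minimal, hedged sketch of that pattern using only the standard library; the standalone `graphql_query` name, the use of `urllib`, and the `GITHUB_TOKEN` environment variable are illustrative assumptions, not the repository's actual implementation (which goes through `gh_fetch_url`).

```python
import json
import os
import urllib.request


def graphql_query(query: str, **variables) -> dict:
    # Hypothetical standalone equivalent of gh_graphql: send the query and
    # its variables as JSON and return the decoded response.
    payload = json.dumps({"query": query, "variables": variables}).encode()
    request = urllib.request.Request(
        "https://api.github.com/graphql",
        data=payload,
        headers={
            "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(request) as response:
        return json.load(response)


# Example usage (requires a valid token in GITHUB_TOKEN):
# print(graphql_query("query { viewer { login } }"))
```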
diff --git a/RELEASE.md b/RELEASE.md index 005a154d54b..caeab37b57d 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -221,7 +221,7 @@ Release candidates are currently stored in the following places: * Wheels: https://download.pytorch.org/whl/test/ * Conda: https://anaconda.org/pytorch-test -* Libtorch: https://download.pytorch.org/libtorch/test +* Libtorch: https://download.pytorch.org/libtorch/test Backups are stored in a non-public S3 bucket at [`s3://pytorch-backup`](https://s3.console.aws.amazon.com/s3/buckets/pytorch-backup?region=us-east-1&tab=objects) @@ -322,7 +322,7 @@ Promotion should occur in two steps: * Promote S3 artifacts (wheels, libtorch) and Conda packages * Promote S3 wheels to PyPI -**NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypa/warehouse/issues/726 for a discussion on potential draft releases within PyPI) +**NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypi/warehouse/issues/726 for a discussion on potential draft releases within PyPI) ## Additional Steps to prepare for release day diff --git a/aten/src/ATen/core/boxing/KernelFunction.cpp b/aten/src/ATen/core/boxing/KernelFunction.cpp index b13f827b8f1..c099c456814 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction.cpp @@ -28,7 +28,7 @@ void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, D "Autograd dispatch key for the backend.\n", "If you only want to run inference instead of training, in C++, add `c10::InferenceMode mode;` " "before model.forward(); in Python, use `torch.inference_mode()` as a context manager (see " - "https://pytorch.org/docs/stable/generated/torch.inference_mode.html).", + "https://pytorch.org/docs/stable/generated/torch.autograd.grad_mode.inference_mode.html).", "\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n"); } diff --git a/aten/src/ATen/cuda/Atomic.cuh b/aten/src/ATen/cuda/Atomic.cuh index 4106ab6f730..f16be30f8b7 100644 --- a/aten/src/ATen/cuda/Atomic.cuh +++ b/aten/src/ATen/cuda/Atomic.cuh @@ -410,7 +410,7 @@ template __host__ __device__ T safe_max(T a, T b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: - // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + // https://github.com/ROCm/hip/issues/2209 T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); #else T max = at::_isnan(b) ? b : std::max(a, b); @@ -470,7 +470,7 @@ template __host__ __device__ T safe_min(T a, T b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: - // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + // https://github.com/ROCm/hip/issues/2209 T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); #else T min = at::_isnan(b) ? 
b : std::min(a, b); diff --git a/aten/src/ATen/native/Math.h b/aten/src/ATen/native/Math.h index 47c0a2be030..b6d44fca590 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -1680,7 +1680,7 @@ inline C10_HOST_DEVICE T calc_ndtri(T y0) { return x; } -/* The next function is taken from http://ab-initio.mit.edu/Faddeev */ +/* The next function is taken from http://ab-initio.mit.edu/faddeeva */ /* Copyright (c) 2012 Massachusetts Institute of Technology * diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index edaa106fc83..1de72abd588 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -26,7 +26,7 @@ template inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: - // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + // https://github.com/ROCm/hip/issues/2209 scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); #else scalar_t max = at::_isnan(b) ? b : std::max(a, b); @@ -37,7 +37,7 @@ template inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) { #if defined(__HIPCC__) // TODO: remove this special case for HIP when issue is fixed: - // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + // https://github.com/ROCm/hip/issues/2209 scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); #else scalar_t min = at::_isnan(b) ? b : std::min(a, b); diff --git a/aten/src/ATen/native/cuda/AmpKernels.cu b/aten/src/ATen/native/cuda/AmpKernels.cu index 8c161ca6272..2be6e47df3c 100644 --- a/aten/src/ATen/native/cuda/AmpKernels.cu +++ b/aten/src/ATen/native/cuda/AmpKernels.cu @@ -13,7 +13,7 @@ namespace { -// Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g57a3c8313f570282a1a7bcc78743b08e, +// Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/cuda_math_api/group__CUDA__MATH__SINGLE.html, // to ensure the Cuda math library's isfinite is actually what gets called in // _amp_non_finite_check_and_unscale_cuda_'s gpu_kernel lambda. // diff --git a/aten/src/ATen/native/cuda/Math.cuh b/aten/src/ATen/native/cuda/Math.cuh index 2fe8f5dd2e3..89308177bfe 100644 --- a/aten/src/ATen/native/cuda/Math.cuh +++ b/aten/src/ATen/native/cuda/Math.cuh @@ -766,7 +766,7 @@ const auto sinc_string = jiterator_stringify( ); // sinc_string const auto erfcx_string = jiterator_stringify( - /* The next function is taken from http://ab-initio.mit.edu/Faddeev */ + /* The next function is taken from http://ab-initio.mit.edu/faddeeva */ /* Copyright (c) 2012 Massachusetts Institute of Technology * diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index 0853c02d6df..3cf47804e91 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -1865,8 +1865,6 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) { // We require to perform ?geqrf_gpu again due to this bug in MAGMA: // - ?geqrf_gpu allows fast computation of Q via ?orgqr_gpu, but doesn't give R properly. 
// - ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orgqr_gpu - // Refer to the below link for more details: - // http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=1015&p=2800&hilit=geqrf_gpu#p2800 case at::LinalgBackend::Magma: return geqrf_magma(input, tau); case at::LinalgBackend::Cusolver: diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp index b617cf44473..740b54d6772 100644 --- a/aten/src/ATen/native/cudnn/Conv_v8.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp @@ -347,7 +347,7 @@ struct BenchmarkCache { // @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to // be thread safe across all engines see Limitations in -// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html +// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html thread_local BenchmarkCache benchmark_cache; thread_local BenchmarkCache diff --git a/aten/src/ATen/native/cudnn/MHA.cpp b/aten/src/ATen/native/cudnn/MHA.cpp index c38d4a095c0..5d146edb90b 100644 --- a/aten/src/ATen/native/cudnn/MHA.cpp +++ b/aten/src/ATen/native/cudnn/MHA.cpp @@ -296,7 +296,7 @@ struct MHAGraphCache { // @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to // be thread safe across all engines see Limitations in -// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html +// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html thread_local MHAGraphCache mhagraphcache; thread_local MHAGraphCache mhagraphbackwardcache; diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 17039f03e64..a65a5e43ac0 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -1204,7 +1204,7 @@ cudnnRNNAlgo_t get_algo( // Persistent algos typically don't work for packed inputs with sequence // lengths that vary across batch elements, and will return // CUDNN_STATUS_NOT_SUPPORTED if attempted. See - // https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#features-of-rnn-functions + // https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-890/developer-guide/index.html#features-of-rnn-functions if (!tensors.is_input_packed()) { auto cudnnDataType = getCudnnDataType(input); if (cudnnDataType != CUDNN_DATA_DOUBLE) { @@ -1274,7 +1274,7 @@ int64_t _cudnn_rnn_flatten_weight_prologue( rnn_desc = rnn.descriptor(handle); // Why do we pad to 5 dims here (and elsewhere)? - // https://docs.nvidia.com/deeplearning/sdk/cudnn-api/index.html#cudnnRNNForwardTraining + // https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-892/api/index.html#cudnnRNNForwardTraining // expects descriptors padded to 3 dimensions. x_desc.set(flat_buf_datatype, x_geom.sizes(), x_geom.strides(), 5); diff --git a/aten/src/ATen/native/mps/kernels/Quantized.metal b/aten/src/ATen/native/mps/kernels/Quantized.metal index 1a277602aa2..4d57027a576 100644 --- a/aten/src/ATen/native/mps/kernels/Quantized.metal +++ b/aten/src/ATen/native/mps/kernels/Quantized.metal @@ -213,7 +213,7 @@ INSTANTIATE_INT4MV(bfloat, 256); * 1. Load A and B blocks (32x32 and 64x32 respectively) into shared memory. * 2. In 4 simdgroups, calculate the outer product of the loaded blocks. Each simdgroup produces a 2x4 8x8 result. 
* 2.1 For how to use outer product to perform matrix multiplication, refer to - * http://mlwiki.org/index.php/Matrix-Matrix_Multiplication#Sum_of_Outer_Products + * https://web.archive.org/web/20230521063455/http://mlwiki.org/index.php/Matrix-Matrix_Multiplication#Sum_of_Outer_Products * 3. Repeat 1 & 2 along K axis, with K block size 32, accumulate the result in the 2x4 8x8 block. * 4. Dequantize the final result and store it in the output matrix. * diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 41209c3c047..9acdd0937c8 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -1923,7 +1923,7 @@ namespace { * FBGEMM uses vpmaddubsw instruction to multiply activations (uint8_t) and * weights (int8_t). * - * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16&expand=3284,3530 + * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&expand=3284,3530&ig_expand=4236 * * vpmaddubsw operates on a vector of activations and a vector of * weights. If these vectors are diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/README.md b/aten/src/ATen/native/quantized/cpu/qnnpack/README.md index ed6639c4ace..86974f1e212 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/README.md +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/README.md @@ -78,10 +78,10 @@ MAX_JOBS=1 scripts/build_local.sh -DBUILD_BINARY=ON -DBUILD_PYTHON=OFF \ -DUSE_OBSERVERS=OFF -DUSE_DISTRIBUTED=OFF # Download model weights -wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb +wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb # @lint-ignore # Download model graph -wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb +wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb # @lint-ignore # Run speed benchmark with 50 warm-up iterations and 10 measurement iterations build/bin/speed_benchmark --net predict_net.pb --init_net init_net.pb \ @@ -104,11 +104,11 @@ scripts/build_android.sh -DANDROID_TOOLCHAIN=clang -DBUILD_BINARY=ON adb push build_android/bin/speed_benchmark /data/local/tmp/speed_benchmark # Download model weights and copy them to Android device -wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb +wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb # @lint-ignore adb push init_net.pb /data/local/tmp/init_net.pb # Download model graph and copy it to Android device -wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb +wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb # @lint-ignore adb push predict_net.pb /data/local/tmp/predict_net.pb # Run speed benchmark with 50 warm-up iterations and 10 measurement iterations @@ -134,11 +134,11 @@ scripts/build_android.sh -DANDROID_ABI=arm64-v8a -DANDROID_TOOLCHAIN=clang -DBUI adb push build_android/bin/speed_benchmark /data/local/tmp/speed_benchmark # Download model weights and copy them to Android device -wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb +wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb # @lint-ignore adb push init_net.pb 
/data/local/tmp/init_net.pb # Download model graph and copy it to Android device -wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb +wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb # @lint-ignore adb push predict_net.pb /data/local/tmp/predict_net.pb # Run speed benchmark with 50 warm-up iterations and 10 measurement iterations diff --git a/aten/src/ATen/native/quantized/cudnn/Conv.cpp b/aten/src/ATen/native/quantized/cudnn/Conv.cpp index edd4d0f5e76..6424000594e 100644 --- a/aten/src/ATen/native/quantized/cudnn/Conv.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Conv.cpp @@ -53,7 +53,7 @@ std::unordered_map #endif diff --git a/cmake/External/aotriton.cmake b/cmake/External/aotriton.cmake index 2678cfde3c4..9c1862f6b44 100644 --- a/cmake/External/aotriton.cmake +++ b/cmake/External/aotriton.cmake @@ -98,7 +98,7 @@ if(NOT __AOTRITON_INCLUDED) "${__AOTRITON_VER}-${__AOTRITON_MANYLINUX}" "_${__AOTRITON_ARCH}-rocm${__AOTRITON_ROCM}" "-shared.tar.${__AOTRITON_Z}") - string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/" + string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/" # @lint-ignore "${__AOTRITON_VER}/${__AOTRITON_FILE}") ExternalProject_Add(aotriton_external URL "${__AOTRITON_URL}" diff --git a/docs/cpp/source/index.rst b/docs/cpp/source/index.rst index 02fa2a089e9..ddfa7a58d9f 100644 --- a/docs/cpp/source/index.rst +++ b/docs/cpp/source/index.rst @@ -137,7 +137,7 @@ about this by following `this API concerns itself with scenarios in which you would like to extend TorchScript with custom operators, which can similarly be serialized and invoked from C++ during inference. Lastly, the `torch::jit::compile -`_ +`_ function may be used to access the TorchScript compiler directly from C++. C++ Extensions diff --git a/docs/source/community/contribution_guide.rst b/docs/source/community/contribution_guide.rst index ec477ea50ea..4df422f541e 100644 --- a/docs/source/community/contribution_guide.rst +++ b/docs/source/community/contribution_guide.rst @@ -325,13 +325,13 @@ Python Docs PyTorch documentation is generated from python source using `Sphinx `__. Generated HTML is copied to the docs folder in the main branch of -`pytorch.github.io `__, +`pytorch.org/docs `__, and is served via GitHub pages. - Site: https://pytorch.org/docs - GitHub: https://github.com/pytorch/pytorch/tree/main/docs - Served from: - `https://github.com/pytorch/pytorch.github.io/tree/master/docs `__ + `https://pytorch.org/docs/main `__ C++ Docs ~~~~~~~~ diff --git a/docs/source/conf.py b/docs/source/conf.py index 152faa45b0f..1485c80277e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -3708,7 +3708,6 @@ def process_docstring(app, what_, name, obj, options, lines): lines (List[str]): the lines of the docstring, see above References: - https://www.sphinx-doc.org/en/1.5.1/_modules/sphinx/ext/autodoc.html https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html """ import re diff --git a/docs/source/cudnn_rnn_determinism.rst b/docs/source/cudnn_rnn_determinism.rst index c002925a8c3..ffe8f1d8ccd 100644 --- a/docs/source/cudnn_rnn_determinism.rst +++ b/docs/source/cudnn_rnn_determinism.rst @@ -13,4 +13,4 @@ See the `cuDNN 8 Release Notes`_ for more information. -.. _cuDNN 8 Release Notes: https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_8.html +.. 
_cuDNN 8 Release Notes: https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-880/release-notes/rel_8.html diff --git a/docs/source/distributed.rst b/docs/source/distributed.rst index 4dc70268cb2..f36f6218dac 100644 --- a/docs/source/distributed.rst +++ b/docs/source/distributed.rst @@ -141,7 +141,7 @@ network bandwidth. These two environment variables have been pre-tuned by NCCL for some cloud providers, such as AWS or GCP. For a full list of NCCL environment variables, please refer to -`NVIDIA NCCL's official documentation `_ +`NVIDIA NCCL's official documentation `_ You can tune NCCL communicators even further using `torch.distributed.ProcessGroupNCCL.NCCLConfig` and `torch.distributed.ProcessGroupNCCL.Options`. Learn more about them using `help` diff --git a/docs/source/notes/hip.rst b/docs/source/notes/hip.rst index 57f08b93053..a34535d67fc 100644 --- a/docs/source/notes/hip.rst +++ b/docs/source/notes/hip.rst @@ -141,7 +141,7 @@ Currently, only the "nccl" and "gloo" backends for torch.distributed are support CUDA API to HIP API mappings in C++ ----------------------------------- -Please refer: https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP_API_Guide.html +Please refer: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/api_syntax.html NOTE: The CUDA_VERSION macro, cudaRuntimeGetVersion and cudaDriverGetVersion APIs do not semantically map to the same values as HIP_VERSION macro, hipRuntimeGetVersion and diff --git a/docs/source/notes/windows.rst b/docs/source/notes/windows.rst index 8fb2f913630..3cf736046a8 100644 --- a/docs/source/notes/windows.rst +++ b/docs/source/notes/windows.rst @@ -24,9 +24,10 @@ MKL and MAGMA. Here are the steps to build with them. REM 2.5.3 (CUDA 10.1 10.2 11.0) x (Debug Release) REM 2.5.2 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release) REM 2.5.1 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release) - set CUDA_PREFIX=cuda102 - set CONFIG=release - curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_PREFIX%_%CONFIG%.7z -o magma.7z + set "CUDA_PREFIX=cuda102" + set "CONFIG=release" + set "HOST=https://s3.amazonaws.com/ossci-windows" + curl -k "%HOST%/magma_2.5.4_%CUDA_PREFIX%_%CONFIG%.7z" -o magma.7z 7z x -aoa magma.7z -omagma REM Setting essential environment variables diff --git a/docs/source/onnx_dynamo.rst b/docs/source/onnx_dynamo.rst index 08395253a05..fb17d2706a9 100644 --- a/docs/source/onnx_dynamo.rst +++ b/docs/source/onnx_dynamo.rst @@ -27,7 +27,7 @@ Dependencies The ONNX exporter depends on extra Python packages: - `ONNX `_ - - `ONNX Script `_ + - `ONNX Script `_ They can be installed through `pip `_: diff --git a/docs/source/onnx_torchscript.rst b/docs/source/onnx_torchscript.rst index 400cc4da23d..2fa02cf78f0 100644 --- a/docs/source/onnx_torchscript.rst +++ b/docs/source/onnx_torchscript.rst @@ -452,7 +452,7 @@ ONNX operators that represent the function's behavior in ONNX. For example:: .. . ``torch::jit::Value::setType``). This is not required, but it can help the exporter's .. shape and type inference for down-stream nodes. For a non-trivial example of ``setType``, see .. ``test_aten_embedding_2`` in -.. `test_operators.py `_. +.. `test_operators.py `_. .. 
The example below shows how you can access ``requires_grad`` via the ``Node`` object: diff --git a/docs/source/rpc/rref.rst b/docs/source/rpc/rref.rst index 3f858e58686..a5177e08ef6 100644 --- a/docs/source/rpc/rref.rst +++ b/docs/source/rpc/rref.rst @@ -204,7 +204,7 @@ will create the ``OwnerRRef``, and returns an ACK to acknowledge ``{100, 1}`` **G2**, the ``OwnerRRef`` is a child of the ``UserRRef``, and the ``UserRRef`` is not deleted until it receives the ACK from the owner. -.. image:: https://user-images\.githubusercontent\.com/16999635/69164772-98181300-0abe-11ea-93a7-9ad9f757cd94.png +.. image:: https://user-images.githubusercontent.com/16999635/69164772-98181300-0abe-11ea-93a7-9ad9f757cd94.png :alt: user_to_owner_ret.png :width: 500 px diff --git a/docs/source/torch.compiler_troubleshooting_old.rst b/docs/source/torch.compiler_troubleshooting_old.rst index 7a4a35dffa3..5f693741e94 100644 --- a/docs/source/torch.compiler_troubleshooting_old.rst +++ b/docs/source/torch.compiler_troubleshooting_old.rst @@ -209,7 +209,7 @@ Diagnosing TorchInductor Errors If the error does not occur with the ``"eager"`` backend, then the backend compiler is the source of the error (`example -error `__). +error `__). There are `different choices <./torch.compiler.rst>`__ for backend compilers for TorchDynamo, with TorchInductor fitting the needs of most users. This section focuses on TorchInductor diff --git a/docs/source/type_info.rst b/docs/source/type_info.rst index a807398c5fb..29a5ca28269 100644 --- a/docs/source/type_info.rst +++ b/docs/source/type_info.rst @@ -15,7 +15,7 @@ torch.finfo .. class:: torch.finfo A :class:`torch.finfo` is an object that represents the numerical properties of a floating point -:class:`torch.dtype`, (i.e. ``torch.float32``, ``torch.float64``, ``torch.float16``, and ``torch.bfloat16``). This is similar to `numpy.finfo `_. +:class:`torch.dtype`, (i.e. ``torch.float32``, ``torch.float64``, ``torch.float16``, and ``torch.bfloat16``). This is similar to `numpy.finfo `_. A :class:`torch.finfo` provides the following attributes: @@ -49,7 +49,7 @@ torch.iinfo A :class:`torch.iinfo` is an object that represents the numerical properties of a integer -:class:`torch.dtype` (i.e. ``torch.uint8``, ``torch.int8``, ``torch.int16``, ``torch.int32``, and ``torch.int64``). This is similar to `numpy.iinfo `_. +:class:`torch.dtype` (i.e. ``torch.uint8``, ``torch.int8``, ``torch.int16``, ``torch.int32``, and ``torch.int64``). This is similar to `numpy.iinfo `_. A :class:`torch.iinfo` provides the following attributes: diff --git a/scripts/build_tizen.sh b/scripts/build_tizen.sh index ce64b6c4298..2262a2503c1 100755 --- a/scripts/build_tizen.sh +++ b/scripts/build_tizen.sh @@ -9,7 +9,7 @@ setup_environment(){ # The rootfs image for a Tizen target (RPi3)is located at the below webpage: -# http://download.tizen.org/releases/milestone/tizen/4.0.m1/tizen-unified_20170529.1/images/ +# https://cdn.download.tizen.org/archive/releases/milestone/tizen/4.0.m1/tizen-unified_20170529.1/images/ # If you do not have a Tizen device, Please, run qemu-arm-static and chroot command. # $ sudo chroot ~/tizen-rootfs qemu-arm-static /usr/bin/bash @@ -25,7 +25,7 @@ caffe2_lite_dep_packages(){ # You can set-up a rpm repository with zypper, yum, and dnf because Tizen # software platform officially support rpm format such as Fedora, OpenSUSE. 
# The official Tizen repository is as following: -# http://download.tizen.org/releases/milestone/tizen/4.0.m1/ +# https://cdn.download.tizen.org/archive/releases/milestone/tizen/4.0.m1/ echo "Installing dependencies." sudo zypper install \ make \ @@ -69,7 +69,7 @@ caffe2_full_dep_packages(){ # You can set-up a rpm repository with zypper, yum, and dnf because Tizen # software platform officially support rpm format such as Fedora, OpenSUSE. # The official Tizen repository is as following: -# http://download.tizen.org/releases/milestone/tizen/4.0.m1/ +# https://cdn.download.tizen.org/archive/releases/milestone/tizen/4.0.m1/ echo "Installing dependencies." sudo zypper install \ cmake \ diff --git a/scripts/release_notes/common.py b/scripts/release_notes/common.py index 9143fd672fb..8e6eda25520 100644 --- a/scripts/release_notes/common.py +++ b/scripts/release_notes/common.py @@ -212,7 +212,9 @@ headers = {"Authorization": f"token {token}"} def run_query(query): request = requests.post( - "https://api.github.com/graphql", json={"query": query}, headers=headers + "https://api.github.com/graphql", # @lint-ignore + json={"query": query}, + headers=headers, ) if request.status_code == 200: return request.json() diff --git a/test/cpp/api/rnn.cpp b/test/cpp/api/rnn.cpp index fef879b7983..c01b83898b4 100644 --- a/test/cpp/api/rnn.cpp +++ b/test/cpp/api/rnn.cpp @@ -441,7 +441,7 @@ lstm_output_to_device( } // This test is a port of python code introduced here: -// https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66 +// https://medium.com/data-science/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66 // Reverse forward of bidirectional GRU should act // as regular forward of unidirectional GRU void BidirectionalGRUReverseForward(bool cuda) { diff --git a/test/distributed/tensor/test_dtensor_ops.py b/test/distributed/tensor/test_dtensor_ops.py index 4fa6bdeed8e..665f87f797e 100644 --- a/test/distributed/tensor/test_dtensor_ops.py +++ b/test/distributed/tensor/test_dtensor_ops.py @@ -594,7 +594,7 @@ class TestDTensorOps(DTensorOpTestBase): dtensor_rs = func(*dtensor_args, **dtensor_kwargs) # we need to skip tests containing tensors of zero elements for now. - # see issue: https://github.com/pytorch/tau/issues/470 + # see issue: https://github.com/pytorch/PiPPy/issues/470 # TODO remove this once issue above fixed. 
flat_args = pytree.tree_leaves(dtensor_rs) if any( diff --git a/test/distributed/tensor/test_pointwise_ops.py b/test/distributed/tensor/test_pointwise_ops.py index f30b700b366..3e3cd378165 100644 --- a/test/distributed/tensor/test_pointwise_ops.py +++ b/test/distributed/tensor/test_pointwise_ops.py @@ -192,7 +192,9 @@ class DistElementwiseOpsTest(DTensorOpTestBase): op=torch.sigmoid, ) - @skip("testing RNG based ops is broken: https://github.com/pytorch/tau/issues/494") + @skip( + "testing RNG based ops is broken: https://github.com/pytorch/PiPPy/issues/494" + ) def test_dropout(self): device_mesh = self.build_device_mesh() diff --git a/test/functorch/test_vmap.py b/test/functorch/test_vmap.py index 894aa6f544d..2f1d1416b63 100644 --- a/test/functorch/test_vmap.py +++ b/test/functorch/test_vmap.py @@ -5169,7 +5169,6 @@ class TestVmapOperatorsOpInfo(TestCase): xfail("linalg.vecdot"), # throws in vmap on CUDA # IndexError: Dimension out of range (expected to be in range of [-1, 0], but got -2) - # https://github.com/pytorch/pytorch/runs/8110653462?check_suite_focus=true # but it passes locally xfail("linalg.diagonal"), skip("linalg.matrix_norm", ""), diff --git a/test/inductor/test_cuda_repro.py b/test/inductor/test_cuda_repro.py index 39b5f589712..2b9ff6ec2c8 100644 --- a/test/inductor/test_cuda_repro.py +++ b/test/inductor/test_cuda_repro.py @@ -581,7 +581,7 @@ class CudaReproTests(TestCase): """ This UT tests autotune on an inplace kernel. The autotune should not contaminate the input buffers when tuning with multiple configs. For more details, refer to - https://github.com/openai/triton/issues/781 + https://github.com/triton-lang/triton/issues/781 https://github.com/pytorch/torchdynamo/issues/1670 """ from torch._C import _cuda_getCurrentRawStream as get_cuda_stream @@ -1587,7 +1587,7 @@ class CudaReproTests(TestCase): @config.patch("triton.use_block_ptr", True) def test_selecsls42b_misaligned_address(self): - # https://github.com/openai/triton/issues/2836 + # https://github.com/triton-lang/triton/issues/2836 @torch.compile(fullgraph=True) def fn(arg207_1, arg208_1, convert_element_type_40, expand, full, mul_3): diff --git a/test/inductor/test_foreach.py b/test/inductor/test_foreach.py index e68ed88a4f2..da243adfdcf 100644 --- a/test/inductor/test_foreach.py +++ b/test/inductor/test_foreach.py @@ -431,7 +431,7 @@ class ForeachTests(TestCase): @requires_cuda @scalar_bin_ops @unittest.skip( - "Triton recursion depth exceeded: https://github.com/openai/triton/issues/1763" + "Triton recursion depth exceeded: https://github.com/triton-lang/triton/issues/1763" ) def test_kernel_split_arg_limit_scalar(self, op): def fn(a): diff --git a/test/inductor/test_indexing.py b/test/inductor/test_indexing.py index 954452882ca..7369d6c9fad 100644 --- a/test/inductor/test_indexing.py +++ b/test/inductor/test_indexing.py @@ -95,7 +95,7 @@ class TestIndexingSimplification(InductorTestCase): ModularIndexing(i0 + i1 * i2 * r3, i2, r3), ModularIndexing(i0, i2, r3) ) - # if there are negative terms, we cannot optimize away zero terms due to https://github.com/openai/triton/issues/619 + # if there are negative terms, we cannot optimize away zero terms due to https://github.com/triton-lang/triton/issues/619 self.assertEqual( ModularIndexing(-i0 + i1 * 20, 2, 10), ModularIndexing(-i0 + i1 * 20, 2, 10) ) diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py index 6476989f675..96b7ade81d0 100644 --- a/test/inductor/test_max_autotune.py +++ b/test/inductor/test_max_autotune.py @@ -166,7 
+166,7 @@ class TestMaxAutotune(TestCase): @parametrize("autotune_multi_device", (True, False)) def test_max_autotune_mm_plus_mm(self, autotune_in_subproc, autotune_multi_device): """ - This crash previously due to a triton issue: https://github.com/openai/triton/issues/1298 . + This crash previously due to a triton issue: https://github.com/triton-lang/triton/issues/1298 . With autotuning in subprocess, we don't crash anymore. """ m, n, k = 2048, 1536, 64 diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 9745cd94134..b90244600dd 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -2969,7 +2969,7 @@ class CommonTemplate: return torch.round(a), torch.round(b + 1), torch.round(a, decimals=2) # without manual_seed, there is some chance this test fails due to: - # https://github.com/openai/triton/issues/530 + # https://github.com/triton-lang/triton/issues/530 torch.manual_seed(0) # with *100 we are always getting a number exactly at .5 which we don't do right in half @@ -7957,7 +7957,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar torch.randint(0, 100, size=[600], dtype=torch.int64), torch.randn([600, 256, 7, 7]), ], - # workaround for https://github.com/openai/triton/issues/558 + # workaround for https://github.com/triton-lang/triton/issues/558 check_lowp=False, ) diff --git a/test/quantization/core/test_quantized_functional.py b/test/quantization/core/test_quantized_functional.py index b14aaf465b0..e593b113b27 100644 --- a/test/quantization/core/test_quantized_functional.py +++ b/test/quantization/core/test_quantized_functional.py @@ -52,7 +52,7 @@ class TestQuantizedFunctionalOps(QuantizationTestCase): # Make sure the results match # assert_array_almost_equal compares using the following formula: # abs(desired-actual) < 1.5 * 10**(-decimal) - # (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html) + # (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html) # We use decimal = 0 to ignore off-by-1 differences between reference # and test. Off-by-1 differences arise due to the order of round and # zero_point addition operation, i.e., if addition followed by round is diff --git a/test/quantization/core/test_quantized_module.py b/test/quantization/core/test_quantized_module.py index c31fe44fa29..8918696078a 100644 --- a/test/quantization/core/test_quantized_module.py +++ b/test/quantization/core/test_quantized_module.py @@ -320,7 +320,7 @@ class TestStaticQuantizedModule(QuantizationTestCase): # Make sure the results match # assert_array_almost_equal compares using the following formula: # abs(desired-actual) < 1.5 * 10**(-decimal) - # (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html) + # (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html) # We use decimal = 0 to ignore off-by-1 differences between reference # and test. 
Off-by-1 differences arise due to the order of round and # zero_point addition operation, i.e., if addition followed by round is diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index d9809647d27..1be420ee2ed 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -5200,7 +5200,7 @@ class TestQuantizedConv(TestCase): # Make sure the results match # assert_array_almost_equal compares using the following formula: # abs(desired-actual) < 1.5 * 10**(-decimal) - # (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html) + # (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html) # We use decimal = 0 to ignore off-by-1 differences between # reference and test. Off-by-1 differences arise due to the order of # round and zero_point addition operation, i.e., if addition @@ -6935,7 +6935,7 @@ class TestQuantizedConv(TestCase): # Make sure the results match # assert_array_almost_equal compares using the following formula: # abs(desired-actual) < 1.5 * 10**(-decimal) - # (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html) + # (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html) # We use decimal = 0 to ignore off-by-1 differences between # reference and test. Off-by-1 differences arise due to the order of # round and zero_point addition operation, i.e., if addition diff --git a/test/test_nn.py b/test/test_nn.py index f3aad843521..f16ace91d69 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -11752,7 +11752,7 @@ class TestNNDeviceType(NNTestCase): with self.assertRaisesRegex(RuntimeError, msg): F.nll_loss(x, t, weight=weight) - # Ref: https://github.com/pytorch/pytorch/issue/85005 + # Ref: https://github.com/pytorch/pytorch/issues/85005 @onlyCUDA @largeTensorTest("120GB", "cpu") @largeTensorTest("45GB", "cuda") @@ -11785,7 +11785,7 @@ class TestNNDeviceType(NNTestCase): with torch.no_grad(): self.assertTrue(torch.allclose(input.grad.cpu(), input_cpu.grad, rtol=rtol, atol=atol)) - # Ref: https://github.com/pytorch/pytorch/issue/108345 + # Ref: https://github.com/pytorch/pytorch/issues/108345 @onlyCUDA @largeTensorTest("20GB", "cpu") @largeTensorTest("20GB", "cuda") diff --git a/test/test_numba_integration.py b/test/test_numba_integration.py index dc63d4910f5..f42dd4176da 100644 --- a/test/test_numba_integration.py +++ b/test/test_numba_integration.py @@ -36,7 +36,7 @@ class TestNumbaIntegration(common.TestCase): version: (int) Version 0 See: - https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html + https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html """ types = [ @@ -250,7 +250,7 @@ class TestNumbaIntegration(common.TestCase): will use the exposed device memory. See: - https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html + https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html """ dtypes = [ diff --git a/test/test_reductions.py b/test/test_reductions.py index dc84432777d..a931717d475 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -1759,7 +1759,6 @@ class TestReductions(TestCase): # On Windows CI, the current version of `numpy` promotes all lower integers # dtypes to int32 while `torch` promotes them to int64. Hence we skip on checking # the exact dtype. 
- # Reference : https://dr.pytorch.org/api/view-log-full?build_id=122051580 # PR : https://github.com/pytorch/pytorch/pull/38628#issuecomment-655905370 if IS_WINDOWS and is_integral(dtype): exact_dtype = False diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index ca5ab1e7df3..8d29c504d87 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -547,7 +547,7 @@ class TestUnaryUfuncs(TestCase): # sqrt Test Reference: https://github.com/pytorch/pytorch/pull/47424 x = torch.tensor(0.0 - 1.0e20j, dtype=dtype, device=device) self.compare_with_numpy(torch.sqrt, np.sqrt, x) - # acos test reference: https://github.com/pytorch/pytorch/issue/42952 + # acos test reference: https://github.com/pytorch/pytorch/issues/42952 if not (dtype == torch.cdouble and "cuda" in device): self.compare_with_numpy(torch.acos, np.arccos, x) diff --git a/test/torch_np/numpy_tests/core/test_multiarray.py b/test/torch_np/numpy_tests/core/test_multiarray.py index 1ccf5ca4ffe..44503bf0fa3 100644 --- a/test/torch_np/numpy_tests/core/test_multiarray.py +++ b/test/torch_np/numpy_tests/core/test_multiarray.py @@ -4328,7 +4328,7 @@ class TestFromBuffer(TestCase): @skipif( IS_PYPY, reason="PyPy's memoryview currently does not track exports. See: " - "https://foss.heptapod.net/pypy/pypy/-/issues/3724", + "https://github.com/pypy/pypy/issues/3723", ) def test_mmap_close(self): # The old buffer protocol was not safe for some things that the new diff --git a/tools/download_mnist.py b/tools/download_mnist.py index 4fe6068fed9..c8eeb4ec1a9 100644 --- a/tools/download_mnist.py +++ b/tools/download_mnist.py @@ -8,7 +8,7 @@ from urllib.request import urlretrieve MIRRORS = [ "http://yann.lecun.com/exdb/mnist/", - "https://ossci-datasets.s3.amazonaws.com/mnist/", + "https://ossci-datasets.s3.amazonaws.com/mnist/", # @lint-ignore ] RESOURCES = [ diff --git a/tools/stats/upload_external_contrib_stats.py b/tools/stats/upload_external_contrib_stats.py index 93634c4ad5e..6de0e495214 100644 --- a/tools/stats/upload_external_contrib_stats.py +++ b/tools/stats/upload_external_contrib_stats.py @@ -81,7 +81,7 @@ def get_external_pr_data( response = cast( dict[str, Any], fetch_json( - "https://api.github.com/search/issues", + "https://api.github.com/search/issues", # @lint-ignore params={ "q": f'repo:pytorch/pytorch is:pr is:closed \ label:"open source" label:Merged -label:Reverted closed:{period_begin_date}..{period_end_date}', diff --git a/tools/stats/utilization_stats_lib.py b/tools/stats/utilization_stats_lib.py index 50bb9312c05..424808f7be7 100644 --- a/tools/stats/utilization_stats_lib.py +++ b/tools/stats/utilization_stats_lib.py @@ -87,7 +87,7 @@ class OssCiUtilizationMetadataV1: # this data model is for the time series data: -# https://github.com/pytorch/test-infra/blob/main/clickhouse_db_schema/oss_ci_utilization/oss_ci_utilization_time_series_schema.sql +# https://github.com/pytorch/test-infra/blob/main/clickhouse_db_schema/oss_ci_utilization/oss_ci_time_series_schema.sql @dataclass class OssCiUtilizationTimeSeriesV1: created_at: int diff --git a/tools/test/test_create_alerts.py b/tools/test/test_create_alerts.py index 11afebf8557..56a81603638 100644 --- a/tools/test/test_create_alerts.py +++ b/tools/test/test_create_alerts.py @@ -12,7 +12,7 @@ MOCK_TEST_DATA = [ "sha": "f02f3046571d21b48af3067e308a1e0f29b43af9", "id": 7819529276, "conclusion": "failure", - "htmlUrl": "https://github.com/pytorch/pytorch/runs/7819529276?check_suite_focus=true", + "htmlUrl": 
"https://github.com/pytorch/pytorch/runs/7819529276?check_suite_focus=true", # @lint-ignore "logUrl": "https://ossci-raw-job-status.s3.amazonaws.com/log/7819529276", "durationS": 14876, "failureLine": "##[error]The action has timed out.", @@ -25,7 +25,7 @@ MOCK_TEST_DATA = [ "sha": "d0d6b1f2222bf90f478796d84a525869898f55b6", "id": 7818399623, "conclusion": "failure", - "htmlUrl": "https://github.com/pytorch/pytorch/runs/7818399623?check_suite_focus=true", + "htmlUrl": "https://github.com/pytorch/pytorch/runs/7818399623?check_suite_focus=true", # @lint-ignore "logUrl": "https://ossci-raw-job-status.s3.amazonaws.com/log/7818399623", "durationS": 14882, "failureLine": "##[error]The action has timed out.", diff --git a/tools/testing/upload_artifacts.py b/tools/testing/upload_artifacts.py index 4ebfd03a146..a8b6d15fb39 100644 --- a/tools/testing/upload_artifacts.py +++ b/tools/testing/upload_artifacts.py @@ -94,7 +94,7 @@ def trigger_upload_test_stats_intermediate_workflow() -> None: # The GITHUB_TOKEN cannot trigger workflow so this isn't used for now print("Triggering upload_test_stats_intermediate workflow") x = requests.post( - "https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches", + "https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches", # noqa: B950 @lint-ignore headers={ "Accept": "application/vnd.github.v3+json", "Authorization": f"Bearer {os.environ.get('GITHUB_TOKEN')}", diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 089bd38f057..3bbd7f628ba 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -1267,7 +1267,7 @@ def _should_allow_numbers_as_tensors(func_name: str) -> _bool: ... def _group_tensors_by_device_and_dtype(nested_tensorlists: List[List[Optional[Tensor]]], with_indices: _bool = False) -> Dict[Tuple[torch.device, torch.dtype], Tuple[List[List[Optional[Tensor]]], List[_int]]]: ... # NB: There is no Capsule type in typing, see -# https://code.activestate.com/lists/python-dev/139675/ +# https://github.com/python/cpython/issues/109562 def _to_dlpack(data: Tensor) -> Any: ... # THPModule_toDLPack def _from_dlpack(data: Any) -> Tensor: ... # THPModule_fromDLPack def _get_cpp_backtrace( diff --git a/torch/_appdirs.py b/torch/_appdirs.py index 64d81139d7a..291963f6f6f 100644 --- a/torch/_appdirs.py +++ b/torch/_appdirs.py @@ -41,9 +41,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. See for details and usage. 
""" # Dev Notes: -# - MSDN on where to store app data files: -# http://support.microsoft.com/default.aspx?scid=kb;en-us;310294#XSLTH3194121123120121120120 -# - Mac OS X: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/index.html +# - Windows "Known Folders": https://learn.microsoft.com/en-us/windows/win32/shell/csidl +# - macOS File System Programming Guide: https://developer.apple.com/library/archive/documentation/FileManagement/Conceptual/FileSystemProgrammingGuide/Introduction/Introduction.html # - XDG spec for Un*x: https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html __version__ = "1.4.4" diff --git a/torch/_decomp/decompositions.py b/torch/_decomp/decompositions.py index b7c3fb0c9db..c57d240c4a1 100644 --- a/torch/_decomp/decompositions.py +++ b/torch/_decomp/decompositions.py @@ -4389,8 +4389,7 @@ def should_fold(tensor1: torch.Tensor, tensor2: torch.Tensor, is_out: bool) -> b t1_stride = t1.stride() # Check the contiguous, we can skip the dim with size of 1 - # as aten: https://github.com/pytorch/pytorch/blob/ - # e201460f8aa1510b4c4686627d57b69756c4b916/aten/src/ATen/TensorGeometry.cpp#L17 + # as aten: https://github.com/pytorch/pytorch/blob/e201460f8aa1510b4c4686627d57b69756c4b916/aten/src/ATen/TensorGeometry.cpp#L17 expected_stride = [1] for size in reversed(t1_shape[1:]): expected_stride.append(size * expected_stride[-1]) diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py index 28a546eb1c5..d406880df3f 100644 --- a/torch/_inductor/codegen/cpp.py +++ b/torch/_inductor/codegen/cpp.py @@ -4881,9 +4881,8 @@ class CppScheduling(BaseScheduling): len(get_call_ranges(_node)) == node.outer_loop_fusion_depth + 1 for _node in node.get_outer_nodes() ): - # Ref to the typical case of local buffer - # in https://github.com/pytorch/pytorch/blob/ - # 1115a25c36340554442f28f9570abd42f0aface2/aten/src/ATen/native/cpu/SoftMaxKernel.cpp#L159 + # Ref to the typical case of local buffer in + # https://github.com/pytorch/pytorch/blob/1115a25c36340554442f28f9570abd42f0aface2/aten/src/ATen/native/cpu/SoftMaxKernel.cpp#L159 # noqa: B950 # where the buffer is with size of last dim and contiguous. # Only support this typical case at first. visited_scheduler_nodes = OrderedSet[str]() diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index d9a3fae9220..8f3ddb77129 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -504,7 +504,7 @@ class BlockPtrOptions: def triton_reshape( value: str, old_shape: Sequence[sympy.Expr], new_shape: Sequence[sympy.Expr] ) -> str: - """Workaround https://github.com/openai/triton/issues/2836""" + """Workaround https://github.com/triton-lang/triton/issues/2836""" assert isinstance(old_shape, list) and isinstance(new_shape, list) old_shape_str = [V.kernel.index_to_str(shape) for shape in old_shape] @@ -841,7 +841,7 @@ class TritonOverrides(OpOverrides): # fp8 data type conversions has min_elem_per_thread requirements. # Refer to Triton implementations here: - # https://github.com/openai/triton/blob/10f59d8ce04052521c1bc0cb3a3f8b98918fc7e3/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#L10. + # https://github.com/triton-lang/triton/blob/10f59d8ce04052521c1bc0cb3a3f8b98918fc7e3/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#L10. 
fp8_dtypes = ( torch.float8_e4m3fn, torch.float8_e5m2, @@ -1828,7 +1828,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): and len(mask_vars - dense_mask_vars) == 0 and not self.is_indirect_indexing(index) and have_loop_vars - # workaround https://github.com/openai/triton/issues/2821 + # workaround https://github.com/triton-lang/triton/issues/2821 and self.index_dtype == "tl.int32" ): @@ -2053,7 +2053,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): ) -> tuple[str, str]: check = indexing.boundary_check() if not check: - # workaround https://github.com/openai/triton/issues/2813 + # workaround https://github.com/triton-lang/triton/issues/2813 other = "" elif other: assert other == ", other=0.0" @@ -2114,7 +2114,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): value, indexing.final_shape, indexing.block_shape, False ) - # workaround https://github.com/openai/triton/issues/2814 + # workaround https://github.com/triton-lang/triton/issues/2814 value = f"{value}.to({triton_store_type(V.graph.get_dtype(name))})" return f"tl.store({block_ptr}, {value}{other})" @@ -2260,7 +2260,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): line += ".to(tl.float32)" dtype = torch.float32 if dtype == torch.bool and torch.version.hip is None: - # Workaround for https://github.com/openai/triton/issues/2151 + # Workaround for https://github.com/triton-lang/triton/issues/2151 # tl.load returns int8 when loading from pointer to int1 # NOTE: Currently causes hangs on bool UTs for ROCm line += ".to(tl.int1)" @@ -2302,7 +2302,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None) # Guard against write-after-read corruption in triton. - # See # https://github.com/openai/triton/issues/1615 + # See # https://github.com/triton-lang/triton/issues/1615 # This triton bug means that a load which is broadcasted over multiple # warps may see the result of a store that happens later in the triton # program. The workaround is to add a barrier before storing, which @@ -3655,7 +3655,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]): # when they are not constexpr. otherwise there may be a segfault # during launching the Inductor-compiled Triton kernel. # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307 - # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384 + # https://github.com/triton-lang/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384 for arg_num in equal_1_arg_indices(signature): # type: ignore[index] triton_meta["constants"][signature[arg_num].name] = 1 # type: ignore[index,union-attr] diff --git a/torch/_inductor/codegen/triton_utils.py b/torch/_inductor/codegen/triton_utils.py index ddd4ec51551..7f4d72ee71b 100644 --- a/torch/_inductor/codegen/triton_utils.py +++ b/torch/_inductor/codegen/triton_utils.py @@ -34,7 +34,7 @@ def should_unwrap_unspec_arg(name: str): def signature_of(arg: KernelArgType, *, size_dtype: Optional[str]) -> str: if isinstance(arg, TensorArg): # TODO: Remove fp8 special handling when Triton supports PyTorch fp8 dtypes. 
- # Related PR: https://github.com/openai/triton/pull/2279/ + # Related PR: https://github.com/triton-lang/triton/pull/2279/ if arg.dtype == torch.float8_e4m3fn: tye = "*fp8e4nv" elif arg.dtype == torch.float8_e5m2: @@ -184,7 +184,7 @@ def config_of( def is_aligned(x: KernelArgType, alignment: int, include_tensor: bool) -> bool: """ Roughly follow triton code here: - https://github.com/openai/triton/blob/5282ed890d453e10b9ee30076ef89115dd197761/python/triton/runtime/jit.py#L208-L222 + https://github.com/triton-lang/triton/blob/5282ed890d453e10b9ee30076ef89115dd197761/python/triton/runtime/jit.py#L208-L222 """ if isinstance(x, TensorArg): if include_tensor: diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py index 94aab698e23..906ddbf3b6c 100644 --- a/torch/_inductor/codegen/wrapper.py +++ b/torch/_inductor/codegen/wrapper.py @@ -1985,7 +1985,7 @@ class PythonWrapperCodegen(CodeGen): # TODO(aakhundov): add None args to constants, too. currently, this # causes CUDA errors in test_aot_inductor.test_triton_kernel_with_none_input. # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307 - # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384 + # https://github.com/triton-lang/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384 "constants": { **constants, **dict.fromkeys(equal_to_1_args, 1), diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index bef6783ad33..aa6729ebfcc 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -1158,7 +1158,7 @@ class triton: # of registers being benchmarked. # # NOTE: triton will always report >0 register spills for kernels using sin/cos. - # (check this issue https://github.com/openai/triton/issues/1756 ) + # (check this issue https://github.com/triton-lang/triton/issues/1756 ) # So far we see a fixed 8 spilled registers for kernels using sin/cos. # Raise the threshold to 16 to be safe. # We should revisit this once we understand more of the source of register spills. diff --git a/torch/_inductor/fx_passes/pad_mm.py b/torch/_inductor/fx_passes/pad_mm.py index 40c0670a965..655a0e44d24 100644 --- a/torch/_inductor/fx_passes/pad_mm.py +++ b/torch/_inductor/fx_passes/pad_mm.py @@ -326,7 +326,7 @@ def should_exclude_padding_time(match: Match, arg_name: str) -> bool: if not fetch_fake_tensors(match, (arg_name,))[0].is_contiguous(): return False - # TODO - see issue https://githpub.com/pytorch/pytorch/issues/128889 + # TODO - see issue https://github.com/pytorch/pytorch/issues/128889 # We would only able to completely plan these out if we were only doing # first dimension padding. non-first we would still need a copy # because these outputs are fixed dense. 
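The `is_aligned` docstring above notes that Inductor roughly follows Triton's runtime check: Triton specializes kernels on pointer arguments whose addresses are divisible by 16 bytes, which enables wider loads. A minimal sketch of that kind of check, assuming a plain `torch.Tensor` input (`is_16_byte_aligned` is an illustrative name, not Inductor's API, and real checks also consider strides and sizes):

```python
import torch

ALIGNMENT = 16  # bytes; Triton specializes on 16-byte divisibility of pointers


def is_16_byte_aligned(t: torch.Tensor) -> bool:
    # Treat a tensor as aligned when its base address is a multiple of the
    # alignment; views with small byte offsets generally are not.
    return t.data_ptr() % ALIGNMENT == 0


x = torch.empty(128, 128, dtype=torch.float16)
print(is_16_byte_aligned(x))         # typically True for freshly allocated storage
print(is_16_byte_aligned(x[:, 1:]))  # a 2-byte-offset view is generally not aligned
```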
diff --git a/torch/_inductor/fx_passes/quantization.py b/torch/_inductor/fx_passes/quantization.py index 8df1c1e1f2a..88c5f8497ac 100644 --- a/torch/_inductor/fx_passes/quantization.py +++ b/torch/_inductor/fx_passes/quantization.py @@ -2185,8 +2185,7 @@ def _register_qlinear_weight_prepack(): # Step 2: register patterns from bmm # Linear might be decomposed into bmm when input dim exceeds 2 and not contiguous # refer to: - # https://github.com/pytorch/pytorch/blob/ - # 80c07df659362a95da7cd4f3ec367abfdace38c4/torch/_decomp/decompositions.py#L3965-L3968 + # https://github.com/pytorch/pytorch/blob/80c07df659362a95da7cd4f3ec367abfdace38c4/torch/_decomp/decompositions.py#L3965-L3968 # in this case, we can convert it back to qlinear for dtype, with_bias, is_tensor_overload in itertools.product( [torch.float32, torch.bfloat16], [True, False], [True, False] diff --git a/torch/_inductor/kernel/conv.py b/torch/_inductor/kernel/conv.py index 9e6c5e8d42b..4b14989c372 100644 --- a/torch/_inductor/kernel/conv.py +++ b/torch/_inductor/kernel/conv.py @@ -620,7 +620,7 @@ def convolution( PADDING_W=padding[1], GROUPS=groups, # TODO(jansel): try unroll for bigger kernels once fixed: - # https://github.com/openai/triton/issues/1254 + # https://github.com/triton-lang/triton/issues/1254 UNROLL=is_ones(kernel_shape), ALLOW_TF32=torch.backends.cudnn.allow_tf32, num_stages=cfg.num_stages, @@ -643,7 +643,7 @@ def convolution( PADDING_W=padding[2], GROUPS=groups, # TODO(jansel): try unroll for bigger kernels once fixed: - # https://github.com/openai/triton/issues/1254 + # https://github.com/triton-lang/triton/issues/1254 UNROLL=is_ones(kernel_shape), ALLOW_TF32=torch.backends.cudnn.allow_tf32, num_stages=cfg.num_stages, diff --git a/torch/_inductor/kernel/mm_plus_mm.py b/torch/_inductor/kernel/mm_plus_mm.py index 2e190595c0d..5447c27f4f0 100644 --- a/torch/_inductor/kernel/mm_plus_mm.py +++ b/torch/_inductor/kernel/mm_plus_mm.py @@ -134,7 +134,7 @@ def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None): ) ): # TODO(jansel): support different K values when this is fixed: - # https://github.com/openai/triton/issues/967 + # https://github.com/triton-lang/triton/issues/967 return lowerings[aten.add]( lowerings[aten.mm](mat1, mat2), lowerings[aten.mm](mat3, mat4) ) @@ -151,7 +151,7 @@ def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None): if use_triton_template(layout1): for config in mm_configs(): - # see https://github.com/openai/triton/issues/1298 + # see https://github.com/triton-lang/triton/issues/1298 # BLOCK_K = K causes llvm error if V.graph.sizevars.statically_known_lt(config.kwargs["BLOCK_K"], k1): mm_plus_mm_template.maybe_append_choice( diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index 7b6d2681b70..225600561ed 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -6092,7 +6092,7 @@ def div_mode(a, b, rounding_mode=None): both_boolean = is_boolean_type(a) and is_boolean_type(b) # floordiv and truncdiv need special handling for integer tensors on Triton, - # see the discussion at https://github.com/openai/triton/issues/605 + # see the discussion at https://github.com/triton-lang/triton/issues/605 if rounding_mode == "floor": assert not both_boolean, "floordiv operands can not be boolean at the same time" return floordiv(a, b) if both_integer else floor(div(a, b)) diff --git a/torch/_inductor/mkldnn_lowerings.py b/torch/_inductor/mkldnn_lowerings.py index 06c54c18820..7e364e139ad 100644 --- a/torch/_inductor/mkldnn_lowerings.py +++ 
b/torch/_inductor/mkldnn_lowerings.py @@ -707,8 +707,8 @@ def register_onednn_fusion_ops(): assert x_zp.get_numel() == 1, "x_zp is incompatible with oneDNN qlinear" # When channels less than 8, w_scale/w_zp is Pointwise instead of ConstantBuffer - # Refer to https://github.com/pytorch/pytorch/blob - # /f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577 + # Refer to + # https://github.com/pytorch/pytorch/blob/f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577 # noqa: B950 if w_zp is None: # If w_zp is None, then it's a dummy tensor created to denote the # absence of a zero point, and thus w is int8 symmetrically quantized. @@ -1018,8 +1018,8 @@ def register_onednn_fusion_ops(): x_zp.realize() # When channels less than 8, w_scale/w_zp is Pointwise instead of ConstantBuffer - # Refer to https://github.com/pytorch/pytorch/blob - # /f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577 + # Refer to + # https://github.com/pytorch/pytorch/blob/f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577 # noqa: B950 w_scale.realize() w_zp.realize() if w_zp.get_dtype() != torch.int32 and isinstance( diff --git a/torch/_inductor/runtime/triton_compat.py b/torch/_inductor/runtime/triton_compat.py index 831898d8d8f..d6e45b72ce4 100644 --- a/torch/_inductor/runtime/triton_compat.py +++ b/torch/_inductor/runtime/triton_compat.py @@ -44,7 +44,7 @@ if triton is not None: return (backend, arch) # In the latest triton, math functions were shuffled around into different modules: - # https://github.com/openai/triton/pull/3172 + # https://github.com/triton-lang/triton/pull/3172 try: from triton.language.extra import libdevice diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 0e4557268d8..93fb36e12bb 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -1472,7 +1472,7 @@ class TritonCompileResult(CompileResult[CompiledKernel]): "metadata", *call_args, ] - else: # args after CompiledKernel.launch_metadata: https://github.com/openai/triton/pull/3492 + else: # args after CompiledKernel.launch_metadata: https://github.com/triton-lang/triton/pull/3492 # Getting the kernel launch args is extremely perf-sensitive. Evaluating # `bin.launch_metadata` is relatively expensive, and returns None unless a # `launch_enter_hook` is installed. 
So if we don't have that hook installed, diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py index cffcd22ab46..f97bd0582e7 100644 --- a/torch/_inductor/scheduler.py +++ b/torch/_inductor/scheduler.py @@ -4560,7 +4560,7 @@ class Scheduler: ) return False except CompilationError as e: - # workaround triton issue: https://github.com/openai/triton/issues/2151 + # workaround triton issue: https://github.com/triton-lang/triton/issues/2151 if "Loop-carried variable" in str(e): fusion_log.debug( "ComboKernel benchmark: return True because of loop-carried variable" @@ -4574,7 +4574,7 @@ class Scheduler: try: ms2, ms2_clone, _path2_list = self.benchmark_combo_kernel(subkernel_nodes) except CompilationError as e: - # workaround triton issue: https://github.com/openai/triton/issues/2151 + # workaround triton issue: https://github.com/triton-lang/triton/issues/2151 if "Loop-carried variable" in str(e): fusion_log.debug( "ComboKernel benchmark: return True because of loop-carried variable" diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index 855b81c9d20..d48206f091f 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -390,7 +390,7 @@ def ceildiv( def _type_of(key: Optional[torch.dtype]) -> str: # Use the function here to get rid of dependencies on the Triton during the codegen. # Refer to Triton implementation here: - # https://github.com/openai/triton/blob/98b5945d2aef679e00ebca8e07c35c3658ec76de/python/triton/runtime/jit.py#L238 + # https://github.com/triton-lang/triton/blob/98b5945d2aef679e00ebca8e07c35c3658ec76de/python/triton/runtime/jit.py#L238 # `None` is nullptr. Implicitly convert to *i8. if key is None: return "*i8" @@ -1981,7 +1981,7 @@ def get_device_tflops(dtype: torch.dtype) -> int: assert dtype in (torch.float16, torch.bfloat16, torch.float32) if inspect.signature(get_max_simd_tflops).parameters.get("clock_rate"): - # Triton API change in https://github.com/openai/triton/pull/2293 + # Triton API change in https://github.com/triton-lang/triton/pull/2293 from torch._utils_internal import max_clock_rate sm_clock = max_clock_rate() diff --git a/torch/_numpy/_ndarray.py b/torch/_numpy/_ndarray.py index 20ebd9db818..fe2410a9f4e 100644 --- a/torch/_numpy/_ndarray.py +++ b/torch/_numpy/_ndarray.py @@ -435,7 +435,7 @@ class ndarray: def item(self, *args): # Mimic NumPy's implementation with three special cases (no arguments, # a flat index and a multi-index): - # https://github.com/numpy/numpy/blob/main/numpy/core/src/multiarray/methods.c#L702 + # https://github.com/numpy/numpy/blob/main/numpy/_core/src/multiarray/methods.c#L702 if args == (): return self.tensor.item() elif len(args) == 1: diff --git a/torch/_tensor.py b/torch/_tensor.py index 5bf70c2eca8..271a76111b6 100644 --- a/torch/_tensor.py +++ b/torch/_tensor.py @@ -1262,7 +1262,7 @@ class Tensor(torch._C.TensorBase): """Array view description for cuda tensors. See: - https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html + https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html """ if has_torch_function_unary(self): # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185 diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 188c7198666..2a4d684ba85 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -4163,9 +4163,9 @@ Unlike :meth:`~Tensor.expand`, this function copies the tensor's data. .. 
warning:: :meth:`~Tensor.repeat` behaves differently from - `numpy.repeat <https://docs.scipy.org/doc/numpy/reference/generated/numpy.repeat.html>`_, + `numpy.repeat <https://numpy.org/doc/stable/reference/generated/numpy.repeat.html>`_, but is more similar to - `numpy.tile <https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html>`_. + `numpy.tile <https://numpy.org/doc/stable/reference/generated/numpy.tile.html>`_. For the operator similar to `numpy.repeat`, see :func:`torch.repeat_interleave`. Args: diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index 2b046844e9e..4570dd81b94 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -767,7 +767,7 @@ This function checks if :attr:`input` and :attr:`other` satisfy the condition: """ + r""" elementwise, for all elements of :attr:`input` and :attr:`other`. The behaviour of this function is analogous to -`numpy.allclose <https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html>`_ +`numpy.allclose <https://numpy.org/doc/stable/reference/generated/numpy.allclose.html>`_ Args: input (Tensor): first tensor to compare @@ -13826,7 +13826,7 @@ Returns the indices of the buckets to which each value in the :attr:`input` belo boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that this behavior is opposite the behavior of -`numpy.digitize <https://docs.scipy.org/doc/numpy/reference/generated/numpy.digitize.html>`_. +`numpy.digitize <https://numpy.org/doc/stable/reference/generated/numpy.digitize.html>`_. More formally, the returned index satisfies the following rules: .. list-table:: diff --git a/torch/_vmap_internals.py b/torch/_vmap_internals.py index 1ea8f520123..6baee77ade5 100644 --- a/torch/_vmap_internals.py +++ b/torch/_vmap_internals.py @@ -219,7 +219,7 @@ def _vmap( # The `allow_none_pass_through` argument is a temporary workaround may be removed. # Currently it enables us to wrap the call in `autograd.grad` to the autograd engine, # which may return None if any of the inputs are unused. See the issue discussing this: - # https://github.com/facebookresearch/functorch/issues/159. + # https://github.com/pytorch/functorch/issues/159. @functools.wraps(func) def wrapped(*args): _check_out_dims_is_int_or_int_tuple(out_dims, func) diff --git a/torch/ao/quantization/quantizer/x86_inductor_quantizer.py b/torch/ao/quantization/quantizer/x86_inductor_quantizer.py index 3f91c2ddd13..df4d94b3fbf 100644 --- a/torch/ao/quantization/quantizer/x86_inductor_quantizer.py +++ b/torch/ao/quantization/quantizer/x86_inductor_quantizer.py @@ -703,8 +703,8 @@ class X86InductorQuantizer(Quantizer): # Once we've annotated the model with quantization configurations, we also need to annotate # the output of quantizable operations. For example, if we annotated `maxpool2d` to quantize its inputs, # we will quantize its output accordingly. This enables us to fuse the dq-operator-q into a quantized op. - # Refer to https://github.com/intel/intel-extension-for-pytorch/blob/ - # 90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L487 + # Refer to + # https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L487 # noqa: B950 self._annotate_output_for_int8_in_int8_out_pattern_entry(model) @@ -732,8 +732,8 @@ class X86InductorQuantizer(Quantizer): # Step2: Recipe to propagate annotation for patterns beside conv/linear. # Go through all the nodes from start to end.
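A quick illustration of the `Tensor.repeat` warning touched in the `_tensor_docs.py` hunk above, not part of the patch itself: `Tensor.repeat` tiles the whole tensor the way `numpy.tile` does, while `torch.repeat_interleave` matches `numpy.repeat`.

```python
import numpy as np
import torch

t = torch.tensor([1, 2, 3])

# Tensor.repeat tiles the full tensor, like numpy.tile
print(t.repeat(2))                    # tensor([1, 2, 3, 1, 2, 3])
print(np.tile(t.numpy(), 2))          # [1 2 3 1 2 3]

# torch.repeat_interleave repeats each element, like numpy.repeat
print(torch.repeat_interleave(t, 2))  # tensor([1, 1, 2, 2, 3, 3])
print(np.repeat(t.numpy(), 2))        # [1 1 2 2 3 3]
```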
- # Recipe refer to https://github.com/intel/intel-extension-for-pytorch/blob/ - # 90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L538 + # Recipe refer to + # https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L538 # noqa: B950 self._annotate_propagation_quantizable_pattern_entry( model, quantization_config, filter_fn @@ -1381,9 +1381,9 @@ class X86InductorQuantizer(Quantizer): ) -> None: r""" Check and insert observer at output of node in int8_in_int8_out_ops if needed. - Recipe refers to https://github.com/intel/intel-extension-for-pytorch/blob/ - 90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_utils.py#L495 - """ + Recipe refers to + https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_utils.py#L495 + """ # noqa: B950 edge_or_node: tuple[Node, Node] if (node.target in int8_in_int8_out_ops) and (_is_any_annotated([node])): if node.target == torch.ops.aten.max_pool2d.default: diff --git a/torch/csrc/PyInterpreter.cpp b/torch/csrc/PyInterpreter.cpp index ce7414d31b7..f944bb5c546 100644 --- a/torch/csrc/PyInterpreter.cpp +++ b/torch/csrc/PyInterpreter.cpp @@ -635,7 +635,7 @@ static c10::ArrayRef get_set_cached_attr( // is also to <=5 elements, we don't need to reallocate. // Note: I tried removing this optimization and tripped ASAN // in a batchnorm kernel here: - // https://pipelinesghubeus21.actions.githubusercontent.com/mBh68xKhi8LyM7tp3vECvYXNFvuV4gyVGgmYCteuEZP9JH92QN/_apis/pipelines/1/runs/3373307/signedlogcontent/790?urlExpires=2023-09-15T21%3A13%3A51.4327798Z&urlSigningMethod=HMACV1&urlSignature=tDeX7ZqaARVU5NNwyr5yYqqkWq3A2j4z8FFdqYwGr0Q%3D + // https://pipelinesghubeus21.actions.githubusercontent.com/mBh68xKhi8LyM7tp3vECvYXNFvuV4gyVGgmYCteuEZP9JH92QN/_apis/pipelines/1/runs/3373307/signedlogcontent/790?urlExpires=2023-09-15T21%3A13%3A51.4327798Z&urlSigningMethod=HMACV1&urlSignature=tDeX7ZqaARVU5NNwyr5yYqqkWq3A2j4z8FFdqYwGr0Q%3D@lint-ignore // We should fix this instead. 
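For context on the `X86InductorQuantizer` recipe comments updated above, here is a rough sketch of the PT2E flow in which those annotations are consumed. The model and inputs are placeholders, and the capture/export step has changed across PyTorch releases, so treat this as an outline rather than the canonical recipe.

```python
import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

# Placeholder model/inputs; any fp32 eval-mode model with conv/linear works.
model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 3, 32, 32),)

quantizer = xiq.X86InductorQuantizer()
quantizer.set_global(xiq.get_default_x86_inductor_quantization_config())

# Capture step is version-dependent (capture_pre_autograd_graph in older
# releases, torch.export.export_for_training in newer ones).
exported = torch.export.export_for_training(model, example_inputs).module()

prepared = prepare_pt2e(exported, quantizer)  # the annotation recipes run here
prepared(*example_inputs)                     # calibration
quantized = convert_pt2e(prepared)
```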
bool needs_resize = false; // We need to resize if: diff --git a/torch/csrc/api/src/nn/modules/rnn.cpp b/torch/csrc/api/src/nn/modules/rnn.cpp index eff69a32a85..da1ab02aa6a 100644 --- a/torch/csrc/api/src/nn/modules/rnn.cpp +++ b/torch/csrc/api/src/nn/modules/rnn.cpp @@ -21,7 +21,7 @@ using namespace torch::nn::utils::rnn; namespace torch::nn { /// These must line up with the CUDNN mode codes: -/// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t +/// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/api/cudnn-adv-library.html#cudnnrnnmode-t enum class CuDNNMode { RNN_RELU = 0, RNN_TANH = 1, LSTM = 2, GRU = 3 }; static CuDNNMode get_cudnn_mode_for_rnn( diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 498259c8fa1..aaaadc49672 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -891,8 +891,8 @@ Tensor logcumsumexp_backward( return grad; } - // Reference: https://github.com/tensorflow/tensorflow/blob/ - // 2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863 + // Reference: + // https://github.com/tensorflow/tensorflow/blob/2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863 auto scalar_min = AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1( at::ScalarType::BFloat16, diff --git a/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h b/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h index 9b38cd525e7..24be190ec53 100644 --- a/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h +++ b/torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h @@ -15,7 +15,8 @@ namespace torch::jit::fuser::onednn { // being created for each device. The device handle passed from PyTorch allows // oneDNN Graph implementation to work on the device specified by PyTorch, which // is currently CPU, so we only have one engine. -// Ref: https://spec.oneapi.io/onednn-graph/latest/programming_model.html#engine +// Ref: +// https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model#engine struct Engine { // CPU engine singleton static dnnl::engine& getEngine(); diff --git a/torch/csrc/jit/codegen/onednn/README.md b/torch/csrc/jit/codegen/onednn/README.md index e3f3ec66734..fb309abc3bc 100644 --- a/torch/csrc/jit/codegen/onednn/README.md +++ b/torch/csrc/jit/codegen/onednn/README.md @@ -1,5 +1,5 @@ # Pytorch - oneDNN Graph API Bridge -This is a PyTorch JIT graph fuser based on [oneDNN Graph API](https://spec.oneapi.io/onednn-graph/latest/programming_model.html), which provides a flexible API for aggressive fusion. Float & BFloat16 inference is supported. However, BFloat16 only performs well on Intel Xeon Cooper Lake platform & beyond, as they have native BFloat16 support. Also, currently, PyTorch has divergent AMP support in JIT & eager modes, so one should disable JIT AMP support & leverage eager mode AMP support to use BFloat16. Please refer to the BFloat16 example below. +This is a PyTorch JIT graph fuser based on [oneDNN Graph API](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model), which provides a flexible API for aggressive fusion. Float & BFloat16 inference is supported. However, BFloat16 only performs well on Intel Xeon Cooper Lake platform & beyond, as they have native BFloat16 support. 
Also, currently, PyTorch has divergent AMP support in JIT & eager modes, so one should disable JIT AMP support & leverage eager mode AMP support to use BFloat16. Please refer to the BFloat16 example below. Currently, speedup is achieved only for static shapes, although we'd soon add dynamic-shape support. When oneDNN Graph is enabled, weights are cached, as they're constant during inference. @@ -29,7 +29,7 @@ We have registered optimization passes in the custom pre-passes set of PyTorch: ## Graph Executor During runtime execution of a (re-written) PyTorch JIT graph, oneDNN graph partitions will be dispatched to the oneDNN graph JIT variadic Operator. -Inside the oneDNN graph JIT Op, input PyTorch tensors of each partition will be mapped to oneDNN graph tensors. The partition will then be [compiled](https://spec.oneapi.io/onednn-graph/latest/programming_model.html#partition) and [executed](https://spec.oneapi.io/onednn-graph/latest/programming_model.html#compiled-partition). The output oneDNN graph tensor will be mapped back to PyTorch tensors to be fed to the next operator on the PyTorch JIT graph. +Inside the oneDNN graph JIT Op, input PyTorch tensors of each partition will be mapped to oneDNN graph tensors. The partition will then be [compiled](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model#partition) and [executed](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model#compiled-partition). The output oneDNN graph tensor will be mapped back to PyTorch tensors to be fed to the next operator on the PyTorch JIT graph. ## Tests diff --git a/torch/csrc/jit/tensorexpr/expr.cpp b/torch/csrc/jit/tensorexpr/expr.cpp index ece08a2f08b..52701665d06 100644 --- a/torch/csrc/jit/tensorexpr/expr.cpp +++ b/torch/csrc/jit/tensorexpr/expr.cpp @@ -143,7 +143,7 @@ ExprHandle abs(const ExprHandle& v) { } // The default tanh is quite slow, use the Eigen version from here: -// https://bitbucket.org/eigen/eigen/src/94875feeeeb9abe5509b314197da1991ba2070f5/Eigen/src/Core/MathFunctionsImpl.h#lines-26 +// https://github.com/TUW-VieVS/VieSchedpp/blob/master/Eigen/src/Core/MathFunctionsImpl.h#L26 ExprHandle fast_tanh(const ExprHandle& v) { // TODO: use a dedicated bind-var to make sure v is not evaluated multiple // times. 
Clamp the input expression to [-9, 9] @@ -205,7 +205,7 @@ ExprHandle fast_sigmoid(const ExprHandle& x) { ExprHandle fast_log(const ExprHandle& v) { // this implementation is taken from sleef: - // https://github.com/shibatch/sleef/blob/master/src/libm/sleefsp.c#L1131 + // https://github.com/shibatch/sleef/blob/master/src/libm/sleefsimdsp.c#L1277 // to generate coefficients, this tool is provided // https://github.com/shibatch/sleef/blob/master/src/gencoef/gencoef.txt auto ilogb2kf = [](const ExprHandle& x) { diff --git a/torch/csrc/lazy/core/cache.h b/torch/csrc/lazy/core/cache.h index 5b2160c6778..6aad77b85e5 100644 --- a/torch/csrc/lazy/core/cache.h +++ b/torch/csrc/lazy/core/cache.h @@ -1,6 +1,6 @@ /** * Cache utils in this file is adapted from PyTorch/XLA - * https://github.com/pytorch/xla/blob/master/third_party/xla_client/cache.h + * https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/cache.h */ #pragma once diff --git a/torch/csrc/lazy/core/metrics.h b/torch/csrc/lazy/core/metrics.h index 05b525778d9..83b388d7740 100644 --- a/torch/csrc/lazy/core/metrics.h +++ b/torch/csrc/lazy/core/metrics.h @@ -1,6 +1,6 @@ /** * This file is adapted from PyTorch/XLA - * https://github.com/pytorch/xla/blob/master/third_party/xla_client/metrics.h + * https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/metrics.h */ #pragma once diff --git a/torch/csrc/lazy/core/multi_wait.h b/torch/csrc/lazy/core/multi_wait.h index a3a33ee3975..df3faf8d806 100644 --- a/torch/csrc/lazy/core/multi_wait.h +++ b/torch/csrc/lazy/core/multi_wait.h @@ -1,6 +1,6 @@ /** * This file is adapted from PyTorch/XLA - * https://github.com/pytorch/xla/blob/master/third_party/xla_client/multi_wait.h + * https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/multi_wait.h */ #pragma once diff --git a/torch/csrc/lazy/core/thread_pool.h b/torch/csrc/lazy/core/thread_pool.h index 2e0ae8f89d8..5bffe6ca3a0 100644 --- a/torch/csrc/lazy/core/thread_pool.h +++ b/torch/csrc/lazy/core/thread_pool.h @@ -1,6 +1,6 @@ /** * This file is adapted from PyTorch/XLA - * https://github.com/pytorch/xla/blob/master/third_party/xla_client/metrics.h + * https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/metrics.h */ #pragma once diff --git a/torch/csrc/lazy/core/unique.h b/torch/csrc/lazy/core/unique.h index 7f38c258658..1375f45aa19 100644 --- a/torch/csrc/lazy/core/unique.h +++ b/torch/csrc/lazy/core/unique.h @@ -1,6 +1,6 @@ /** * Unique in this file is adapted from PyTorch/XLA - * https://github.com/pytorch/xla/blob/master/third_party/xla_client/unique.h + * https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/unique.h */ #pragma once diff --git a/torch/csrc/lazy/core/util.h b/torch/csrc/lazy/core/util.h index 694cda379a2..865b28d8f3c 100644 --- a/torch/csrc/lazy/core/util.h +++ b/torch/csrc/lazy/core/util.h @@ -1,6 +1,6 @@ /** * Most of the utils in this file is adapted from PyTorch/XLA - * https://github.com/pytorch/xla/blob/master/third_party/xla_client/util.h + * https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/util.h */ #pragma once diff --git a/torch/distributed/_tools/sac_estimator.py b/torch/distributed/_tools/sac_estimator.py index 2c1f4f5e937..962f5864c22 100644 --- a/torch/distributed/_tools/sac_estimator.py +++ b/torch/distributed/_tools/sac_estimator.py @@ -50,7 
+50,7 @@ def _display_stats_tabular(headers: list[str], table_data: list[list[Any]]) -> N # Based on: -# https://github.com/fairinternal/xformers/blob/0ded5697a2ea15711ce45131002d04e72053cc6d/xformers/checkpoint.py#L62 +# https://github.com/facebookresearch/xformers/blob/main/xformers/checkpoint.py#L71 @dataclass class _SACMetadata: """ diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 5b5ff3434ad..5db84f50b5a 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -5142,9 +5142,9 @@ def new_group( group, they must be synchronized with other cuda streams by calling `work.wait()` before using another process group. - See `Using multiple NCCL communicators concurrently <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#using-multiple-nccl-communicators-concurrently>`_ for more details. + See `Using multiple NCCL communicators concurrently + <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#using-multiple-nccl-communicators-concurrently>`_ + for more details. Args: ranks (list[int]): List of ranks of group members. If ``None``, will be @@ -5163,10 +5163,9 @@ def new_group( the construction of specific process groups. i.e. for the ``nccl`` backend, ``is_high_priority_stream`` can be specified so that process group can pick up high priority cuda streams. For other availble options to config nccl, - See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t - use_local_synchronization (bool, optional): perform a group-local - barrier at the end of the process group creation. This is different - in that non-member ranks don't need to call into API and don't + See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t + use_local_synchronization (bool, optional): perform a group-local barrier at the end of the process group creation. + This is different in that non-member ranks don't need to call into API and don't join the barrier. group_desc (str, optional): a string to describe the process group. device_id (torch.device, optional): a single, specific device diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index 922a177b203..48d5eb8a644 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -331,7 +331,7 @@ def try_real_annotations(fn, loc): try: # Note: anything annotated as `Optional[T]` will automatically # be returned as `Union[T, None]` per - # https://github.com/python/typing/blob/master/src/typing.py#L850 + # https://github.com/python/cpython/blob/main/Lib/typing.py#L732 sig = inspect.signature(fn) except ValueError: return None diff --git a/torch/linalg/__init__.py b/torch/linalg/__init__.py index 1aa04204164..4d40718bcd0 100644 --- a/torch/linalg/__init__.py +++ b/torch/linalg/__init__.py @@ -329,7 +329,7 @@ Examples:: tensor(0, dtype=torch.int32) .. _LAPACK's getrf: - https://www.netlib.org/lapack/explore-html/dd/d9a/group__double_g_ecomputational_ga0019443faea08275ca60a734d0593e60.html + https://www.netlib.org/lapack/explore-html-3.6.1/dd/d9a/group__double_g_ecomputational_ga0019443faea08275ca60a734d0593e60.html """, ) @@ -967,7 +967,7 @@ Examples:: tensor([1, 2, 3], dtype=torch.int32) .. _LAPACK's sytrf: - https://www.netlib.org/lapack/explore-html/d3/db6/group__double_s_ycomputational_gad91bde1212277b3e909eb6af7f64858a.html + https://www.netlib.org/lapack/explore-html-3.6.1/d3/db6/group__double_s_ycomputational_gad91bde1212277b3e909eb6af7f64858a.html """, ) @@ -1025,7 +1025,7 @@ Examples:: tensor(0, dtype=torch.int32) ..
_LAPACK's sytrf: - https://www.netlib.org/lapack/explore-html/d3/db6/group__double_s_ycomputational_gad91bde1212277b3e909eb6af7f64858a.html + https://www.netlib.org/lapack/explore-html-3.6.1/d3/db6/group__double_s_ycomputational_gad91bde1212277b3e909eb6af7f64858a.html """, ) @@ -2513,7 +2513,7 @@ Returns: A named tuple `(LU, pivots, info)`. .. _LAPACK's getrf: - https://www.netlib.org/lapack/explore-html/dd/d9a/group__double_g_ecomputational_ga0019443faea08275ca60a734d0593e60.html + https://www.netlib.org/lapack/explore-html-3.6.1/dd/d9a/group__double_g_ecomputational_ga0019443faea08275ca60a734d0593e60.html """, ) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 371745664f4..fa295418504 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -5315,7 +5315,7 @@ def index(g: jit_utils.GraphContext, self, index): # 2. prim::Constant[value=...] or tensor output # representing advanced indexing. E.g. tensor[[0, 1], [2, 0]]. # For more info on advanced indexing, - # check https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing + # check https://numpy.org/doc/stable/user/basics.indexing.html#advanced-indexing # Consider a general case of # t: [x_1, y_1, y_2, ..., x_m, ..., y_n] @@ -5389,7 +5389,7 @@ def index(g: jit_utils.GraphContext, self, index): cum_adv_index_shape_tensor = _shape_as_tensor(g, cum_adv_index) # check if all advanced indices are consecutive. - # Refer to https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#combining-advanced-and-basic-indexing + # Refer to https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing # to understand how the subarray position is decided. if adv_idx_indices == list( range(adv_idx_indices[0], adv_idx_indices[-1] + 1) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index fdaf31cbdd1..5d03deebcbc 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -20751,7 +20751,6 @@ op_db: list[OpInfo] = [ DecorateInfo(unittest.expectedFailure, 'TestNormalizeOperators', 'test_normalize_operator_exhaustive'), # AssertionError: Tensor-likes are not close! 
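As a side note on the advanced-indexing links fixed in `symbolic_opset9.py` above, a small sketch of my own (not from the patch) of the rule that comment relies on: when the advanced indices sit on consecutive dimensions, the broadcast result stays in place; otherwise it moves to the front.

```python
import torch

t = torch.arange(24).reshape(2, 3, 4)

# Advanced indices on consecutive dims (1 and 2): the broadcast dim replaces
# dims 1-2 in place, so the result keeps the leading slice dimension.
print(t[:, [0, 1], [2, 0]].shape)   # torch.Size([2, 2])

# Advanced indices separated by a slice (dims 0 and 2): the broadcast dim is
# moved to the front of the result, followed by the sliced dimension.
print(t[[0, 1], :, [2, 0]].shape)   # torch.Size([2, 3])
```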
# Fails in cuda11.7 - # Error Log: https://github.com/pytorch/pytorch/actions/runs/3440108478/jobs/5738475757 DecorateInfo(unittest.expectedFailure, 'TestCommon', 'test_compare_cpu', device_type='cuda'), DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit'),),), # In training mode, feature_alpha_dropout currently doesn't support inputs of complex dtype diff --git a/torch/utils/_sympy/functions.py b/torch/utils/_sympy/functions.py index 39069f3d06c..61c63ece236 100644 --- a/torch/utils/_sympy/functions.py +++ b/torch/utils/_sympy/functions.py @@ -342,9 +342,9 @@ class ModularIndexing(sympy.Function): and isinstance(term.args[0], sympy.Integer) and term.args[0] < 0 ): - # workaround for https://github.com/openai/triton/issues/619, + # workaround for https://github.com/triton-lang/triton/issues/619, # if there are negative terms, // produces wrong result - # TODO if https://github.com/openai/triton/issues/619 is fixed + # TODO if https://github.com/triton-lang/triton/issues/619 is fixed # this optimization would become valid all_positive = False break diff --git a/torch/utils/collect_env.py b/torch/utils/collect_env.py index 747a6c1249c..5fa84295767 100644 --- a/torch/utils/collect_env.py +++ b/torch/utils/collect_env.py @@ -199,8 +199,8 @@ def get_cudnn_version(run_lambda): cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) elif get_platform() == 'darwin': # CUDA libraries and drivers can be found in /usr/local/cuda/. See - # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install - # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # https://docs.nvidia.com/cuda/archive/9.0/cuda-installation-guide-mac-os-x/index.html#installation + # https://docs.nvidia.com/deeplearning/cudnn/installation/latest/ # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*' else: diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py index c425e9edc9f..81db11fd285 100644 --- a/torch/utils/hipify/cuda_to_hip_mappings.py +++ b/torch/utils/hipify/cuda_to_hip_mappings.py @@ -16,7 +16,7 @@ from .constants import (API_BLAS, API_C10, API_CAFFE2, API_DRIVER, API_FFT, """ Mapping of CUDA functions, include files, constants, and types to ROCm/HIP equivalents This closely follows the implementation in hipify-clang -https://github.com/ROCm-Developer-Tools/HIP/blob/master/hipify-clang/src/CUDA2HipMap.cpp +https://github.com/ROCm/hip/blob/59071b895ed1c86d9698b4c859cefcdd5acda06f/hipify-clang/src/CUDA2HipMap.cpp and its structure. There are different maps for fundamental names, include files, identifies, sparse, and PyTorch specific translations. diff --git a/torch/utils/tensorboard/_pytorch_graph.py b/torch/utils/tensorboard/_pytorch_graph.py index 0e9e453183d..85427162fc7 100644 --- a/torch/utils/tensorboard/_pytorch_graph.py +++ b/torch/utils/tensorboard/_pytorch_graph.py @@ -341,7 +341,7 @@ def graph(model, args, verbose=False, use_strict_trace=True): # and pass it correctly to TensorBoard. 
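To make the `ModularIndexing` hunk above easier to follow, a minimal sketch of the expression it guards; `torch.utils._sympy.functions` is a private Inductor module and may change, so this is illustrative only.

```python
import sympy
from torch.utils._sympy.functions import ModularIndexing

i = sympy.Symbol("i", integer=True, positive=True)

# ModularIndexing(a, b, c) models (a // b) % c in Inductor index expressions.
expr = ModularIndexing(i + 16, 1, 8)
# With every additive term positive, multiples of b*c (here 8) can be dropped
# from the sum; the all_positive guard in the hunk above skips that rewrite
# when a negative term is present, to stay clear of the Triton '//' bug.
print(expr)
```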
# # Definition of StepStats and DeviceStepStats can be found at - # https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/graph/tf_graph_common/test/graph-test.ts + # https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/graph/tf_graph_common/proto.ts # and # https://github.com/tensorflow/tensorboard/blob/master/tensorboard/compat/proto/step_stats.proto stepstats = RunMetadata( diff --git a/torch/utils/tensorboard/writer.py b/torch/utils/tensorboard/writer.py index a6792c5b8ab..129281cb8ac 100644 --- a/torch/utils/tensorboard/writer.py +++ b/torch/utils/tensorboard/writer.py @@ -472,7 +472,7 @@ class SummaryWriter: values (torch.Tensor, numpy.ndarray, or string/blobname): Values to build histogram global_step (int): Global step value to record bins (str): One of {'tensorflow','auto', 'fd', ...}. This determines how the bins are made. You can find - other options in: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram.html + other options in: https://numpy.org/doc/stable/reference/generated/numpy.histogram.html walltime (float): Optional override default walltime (time.time()) seconds after epoch of event diff --git a/torchgen/_autoheuristic/mixed_mm/get_mixedmm_dataset.sh b/torchgen/_autoheuristic/mixed_mm/get_mixedmm_dataset.sh index fd50b2e79fb..531b698bde6 100644 --- a/torchgen/_autoheuristic/mixed_mm/get_mixedmm_dataset.sh +++ b/torchgen/_autoheuristic/mixed_mm/get_mixedmm_dataset.sh @@ -1,6 +1,6 @@ #!/bin/bash -base_url='https://github.com/AlnisM/autoheuristic-datasets/raw/main/' +base_url='https://github.com/AlnisM/autoheuristic-datasets/raw/main/' # @lint-ignore a100_data='mixedmm_a100_data.zip' h100_data='mixedmm_h100_data.zip' datasets=("${a100_data}" "${h100_data}") diff --git a/torchgen/_autoheuristic/mm/get_mm_dataset.sh b/torchgen/_autoheuristic/mm/get_mm_dataset.sh index 7461dec41dd..60280104ebf 100644 --- a/torchgen/_autoheuristic/mm/get_mm_dataset.sh +++ b/torchgen/_autoheuristic/mm/get_mm_dataset.sh @@ -1,6 +1,6 @@ #!/bin/bash -base_url='https://github.com/AlnisM/autoheuristic-datasets/raw/main/' +base_url='https://github.com/AlnisM/autoheuristic-datasets/raw/main/' # @lint-ignore a100_data='a100_mm.zip' h100_data='h100_mm.zip' datasets=("${a100_data}" "${h100_data}") diff --git a/torchgen/_autoheuristic/pad_mm/get_padmm_dataset.sh b/torchgen/_autoheuristic/pad_mm/get_padmm_dataset.sh index b8ab60d943e..ba8ed904660 100644 --- a/torchgen/_autoheuristic/pad_mm/get_padmm_dataset.sh +++ b/torchgen/_autoheuristic/pad_mm/get_padmm_dataset.sh @@ -1,7 +1,7 @@ #!/bin/bash a100_zip="pad_mm_a100_data.zip" -a100_data="https://github.com/AlnisM/autoheuristic-datasets/raw/main/${a100_zip}" +a100_data="https://github.com/AlnisM/autoheuristic-datasets/raw/main/${a100_zip}" # @lint-ignore rm -f ${a100_zip} wget ${a100_data} unzip -o ${a100_zip}
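Finally, a short usage sketch for the `add_histogram` docstring touched above; "runs/hist_demo" is an arbitrary log directory and the `tensorboard` package must be installed.

```python
import torch
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter("runs/hist_demo")  # arbitrary log dir
for step in range(3):
    values = torch.randn(1000)
    # `bins` accepts 'tensorflow' plus the strategies numpy.histogram
    # understands, e.g. 'auto' or 'fd' (see the numpy.histogram link above).
    writer.add_histogram("weights", values, global_step=step, bins="auto")
writer.close()
```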