Fix broken URLs (#152237)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152237
Approved by: https://github.com/huydhn, https://github.com/malfet
Anthony Shoumikhin 2025-04-27 09:56:42 +00:00 committed by PyTorch MergeBot
parent cbcc03c2ad
commit e2f9759bd0
115 changed files with 176 additions and 181 deletions

View File

@ -128,7 +128,7 @@ def gh_fetch_json_dict(
def gh_graphql(query: str, **kwargs: Any) -> dict[str, Any]:
rc = gh_fetch_url(
"https://api.github.com/graphql",
"https://api.github.com/graphql", # @lint-ignore
data={"query": query, "variables": kwargs},
reader=json.load,
)
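
For context, a minimal standard-library sketch of what a call against this GraphQL endpoint looks like; the helper name and token handling below are assumptions for illustration, the real logic lives in gh_fetch_url:

import json
import urllib.request

def graphql_post(query: str, token: str, **variables):
    # Assumed standalone equivalent of gh_graphql/gh_fetch_url above:
    # POST the query and variables as JSON, then decode the JSON response.
    req = urllib.request.Request(
        "https://api.github.com/graphql",
        data=json.dumps({"query": query, "variables": variables}).encode(),
        headers={"Authorization": f"Bearer {token}"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)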

View File

@ -64,7 +64,7 @@ endif()
# This define is needed to preserve behavior given anticipated changes to
# cccl/thrust
# https://nvidia.github.io/libcudacxx/standard_api/numerics_library/complex.html
# https://nvidia.github.io/cccl/libcudacxx/standard_api/numerics_library/complex.html
string(APPEND CMAKE_CUDA_FLAGS
" -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS")

View File

@ -194,7 +194,7 @@ If you want to compile with CUDA support, [select a supported version of CUDA fr
- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above
- [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA
Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware
Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware
If you want to disable CUDA support, export the environment variable `USE_CUDA=0`.
Other potentially useful environment variables may be found in `setup.py`.

View File

@ -221,7 +221,7 @@ Release candidates are currently stored in the following places:
* Wheels: https://download.pytorch.org/whl/test/
* Conda: https://anaconda.org/pytorch-test
* Libtorch: https://download.pytorch.org/libtorch/test
* Libtorch: https://download.pytorch.org/libtorch/test <!-- @lint-ignore -->
Backups are stored in a non-public S3 bucket at [`s3://pytorch-backup`](https://s3.console.aws.amazon.com/s3/buckets/pytorch-backup?region=us-east-1&tab=objects)
@ -322,7 +322,7 @@ Promotion should occur in two steps:
* Promote S3 artifacts (wheels, libtorch) and Conda packages
* Promote S3 wheels to PyPI
**NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypa/warehouse/issues/726 for a discussion on potential draft releases within PyPI)
**NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypi/warehouse/issues/726 for a discussion on potential draft releases within PyPI)
## Additional Steps to prepare for release day

View File

@ -28,7 +28,7 @@ void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, D
"Autograd dispatch key for the backend.\n",
"If you only want to run inference instead of training, in C++, add `c10::InferenceMode mode;` "
"before model.forward(); in Python, use `torch.inference_mode()` as a context manager (see "
"https://pytorch.org/docs/stable/generated/torch.inference_mode.html).",
"https://pytorch.org/docs/stable/generated/torch.autograd.grad_mode.inference_mode.html).",
"\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n");
}
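
As a quick reminder of the Python usage this error message recommends, a minimal sketch:

import torch

model = torch.nn.Linear(4, 2)
x = torch.randn(1, 4)

# Run inference without recording autograd state, as the message suggests.
with torch.inference_mode():
    y = model(x)

print(y.requires_grad)  # False: the output carries no autograd history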

View File

@ -410,7 +410,7 @@ template <typename T>
__host__ __device__ T safe_max(T a, T b) {
#if defined(__HIPCC__)
// TODO: remove this special case for HIP when issue is fixed:
// https://github.com/ROCm-Developer-Tools/HIP/issues/2209
// https://github.com/ROCm/hip/issues/2209
T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max<T>(a, b));
#else
T max = at::_isnan(b) ? b : std::max<T>(a, b);
@ -470,7 +470,7 @@ template <typename T>
__host__ __device__ T safe_min(T a, T b) {
#if defined(__HIPCC__)
// TODO: remove this special case for HIP when issue is fixed:
// https://github.com/ROCm-Developer-Tools/HIP/issues/2209
// https://github.com/ROCm/hip/issues/2209
T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min<T>(a, b));
#else
T min = at::_isnan(b) ? b : std::min<T>(a, b);
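
A minimal Python sketch of the NaN-propagation rule both branches implement (illustrative helpers, not the ATen code); the HIP branch checks `a` first because std::max/std::min there do not reliably propagate NaN:

import math

def safe_max(a: float, b: float) -> float:
    # Return NaN if either operand is NaN, otherwise the larger value.
    if math.isnan(a):
        return a
    if math.isnan(b):
        return b
    return max(a, b)

def safe_min(a: float, b: float) -> float:
    # Same propagation rule for the minimum.
    if math.isnan(a):
        return a
    if math.isnan(b):
        return b
    return min(a, b)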

View File

@ -1680,7 +1680,7 @@ inline C10_HOST_DEVICE T calc_ndtri(T y0) {
return x;
}
/* The next function is taken from http://ab-initio.mit.edu/Faddeev */
/* The next function is taken from http://ab-initio.mit.edu/faddeeva */
/* Copyright (c) 2012 Massachusetts Institute of Technology
*

View File

@ -26,7 +26,7 @@ template <typename scalar_t>
inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) {
#if defined(__HIPCC__)
// TODO: remove this special case for HIP when issue is fixed:
// https://github.com/ROCm-Developer-Tools/HIP/issues/2209
// https://github.com/ROCm/hip/issues/2209
scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b));
#else
scalar_t max = at::_isnan(b) ? b : std::max(a, b);
@ -37,7 +37,7 @@ template <typename scalar_t>
inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) {
#if defined(__HIPCC__)
// TODO: remove this special case for HIP when issue is fixed:
// https://github.com/ROCm-Developer-Tools/HIP/issues/2209
// https://github.com/ROCm/hip/issues/2209
scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b));
#else
scalar_t min = at::_isnan(b) ? b : std::min(a, b);
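
The same rule is visible at the Python level in PyTorch's elementwise max/min ops, which are documented to propagate NaN; a small demonstration:

import torch

a = torch.tensor([1.0, float("nan")])
b = torch.tensor([2.0, 3.0])

# A NaN in either input propagates to the result.
print(torch.maximum(a, b))  # tensor([2., nan])
print(torch.minimum(a, b))  # tensor([1., nan])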

View File

@ -13,7 +13,7 @@
namespace {
// Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g57a3c8313f570282a1a7bcc78743b08e,
// Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/cuda_math_api/group__CUDA__MATH__SINGLE.html,
// to ensure the Cuda math library's isfinite is actually what gets called in
// _amp_non_finite_check_and_unscale_cuda_'s gpu_kernel lambda.
//

View File

@ -766,7 +766,7 @@ const auto sinc_string = jiterator_stringify(
); // sinc_string
const auto erfcx_string = jiterator_stringify(
/* The next function is taken from http://ab-initio.mit.edu/Faddeev */
/* The next function is taken from http://ab-initio.mit.edu/faddeeva */
/* Copyright (c) 2012 Massachusetts Institute of Technology
*

View File

@ -1865,8 +1865,6 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
// We require to perform ?geqrf_gpu again due to this bug in MAGMA:
// - ?geqrf_gpu allows fast computation of Q via ?orgqr_gpu, but doesn't give R properly.
// - ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orgqr_gpu
// Refer to the below link for more details:
// http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=1015&p=2800&hilit=geqrf_gpu#p2800
case at::LinalgBackend::Magma:
return geqrf_magma(input, tau);
case at::LinalgBackend::Cusolver:

View File

@ -347,7 +347,7 @@ struct BenchmarkCache {
// @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to
// be thread safe across all engines see Limitations in
// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html
// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html
thread_local BenchmarkCache<cudnn_frontend::ExecutionPlan, CacheKeyWrapper>
benchmark_cache;
thread_local BenchmarkCache<cudnn_frontend::ExecutionPlan, CacheKeyFusedWrapper>

View File

@ -296,7 +296,7 @@ struct MHAGraphCache {
// @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to
// be thread safe across all engines see Limitations in
// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html
// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html
thread_local MHAGraphCache<graph_and_tensors, MHACacheKeyWrapper> mhagraphcache;
thread_local MHAGraphCache<graph_and_tensors_backward, MHACacheKeyWrapper>
mhagraphbackwardcache;

View File

@ -1204,7 +1204,7 @@ cudnnRNNAlgo_t get_algo(
// Persistent algos typically don't work for packed inputs with sequence
// lengths that vary across batch elements, and will return
// CUDNN_STATUS_NOT_SUPPORTED if attempted. See
// https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#features-of-rnn-functions
// https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-890/developer-guide/index.html#features-of-rnn-functions
if (!tensors.is_input_packed()) {
auto cudnnDataType = getCudnnDataType(input);
if (cudnnDataType != CUDNN_DATA_DOUBLE) {
@ -1274,7 +1274,7 @@ int64_t _cudnn_rnn_flatten_weight_prologue(
rnn_desc = rnn.descriptor(handle);
// Why do we pad to 5 dims here (and elsewhere)?
// https://docs.nvidia.com/deeplearning/sdk/cudnn-api/index.html#cudnnRNNForwardTraining
// https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-892/api/index.html#cudnnRNNForwardTraining
// expects descriptors padded to 3 dimensions.
x_desc.set(flat_buf_datatype, x_geom.sizes(), x_geom.strides(), 5);

View File

@ -213,7 +213,7 @@ INSTANTIATE_INT4MV(bfloat, 256);
* 1. Load A and B blocks (32x32 and 64x32 respectively) into shared memory.
* 2. In 4 simdgroups, calculate the outer product of the loaded blocks. Each simdgroup produces a 2x4 8x8 result.
* 2.1 For how to use outer product to perform matrix multiplication, refer to
* http://mlwiki.org/index.php/Matrix-Matrix_Multiplication#Sum_of_Outer_Products
* https://web.archive.org/web/20230521063455/http://mlwiki.org/index.php/Matrix-Matrix_Multiplication#Sum_of_Outer_Products
* 3. Repeat 1 & 2 along K axis, with K block size 32, accumulate the result in the 2x4 8x8 block.
* 4. Dequantize the final result and store it in the output matrix.
*
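
As a reference for step 2.1, a small NumPy sketch of matrix multiplication as a sum of outer products over the K axis (the decomposition the kernel tiles); the function name is illustrative:

import numpy as np

def matmul_sum_of_outer_products(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    # C = sum over k of outer(A[:, k], B[k, :]); each k contributes a rank-1 update.
    M, K = A.shape
    _, N = B.shape
    C = np.zeros((M, N), dtype=A.dtype)
    for k in range(K):
        C += np.outer(A[:, k], B[k, :])
    return C

A = np.random.rand(4, 3)
B = np.random.rand(3, 5)
assert np.allclose(matmul_sum_of_outer_products(A, B), A @ B)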

View File

@ -1923,7 +1923,7 @@ namespace {
* FBGEMM uses vpmaddubsw instruction to multiply activations (uint8_t) and
* weights (int8_t).
*
* https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16&expand=3284,3530
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&expand=3284,3530&ig_expand=4236
*
* vpmaddubsw operates on a vector of activations and a vector of
* weights. If these vectors are
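
For reference, a NumPy sketch of the vpmaddubsw (_mm256_maddubs_epi16) semantics described here: unsigned bytes multiplied by signed bytes, adjacent pairs summed, and each pair sum saturated to int16 (illustrative values, not the FBGEMM kernel):

import numpy as np

def maddubs_epi16_reference(a_u8: np.ndarray, b_s8: np.ndarray) -> np.ndarray:
    # a_u8: uint8 activations, b_s8: int8 weights, same even length.
    a = a_u8.astype(np.int32)
    b = b_s8.astype(np.int32)
    pair_sums = a[0::2] * b[0::2] + a[1::2] * b[1::2]
    # The instruction saturates each pair sum to the int16 range.
    return np.clip(pair_sums, -32768, 32767).astype(np.int16)

a = np.array([255, 255, 1, 2], dtype=np.uint8)
b = np.array([127, 127, -3, 4], dtype=np.int8)
print(maddubs_epi16_reference(a, b))  # [32767, 5]: the first pair saturates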

View File

@ -78,10 +78,10 @@ MAX_JOBS=1 scripts/build_local.sh -DBUILD_BINARY=ON -DBUILD_PYTHON=OFF \
-DUSE_OBSERVERS=OFF -DUSE_DISTRIBUTED=OFF
# Download model weights
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb # @lint-ignore
# Download model graph
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb # @lint-ignore
# Run speed benchmark with 50 warm-up iterations and 10 measurement iterations
build/bin/speed_benchmark --net predict_net.pb --init_net init_net.pb \
@ -104,11 +104,11 @@ scripts/build_android.sh -DANDROID_TOOLCHAIN=clang -DBUILD_BINARY=ON
adb push build_android/bin/speed_benchmark /data/local/tmp/speed_benchmark
# Download model weights and copy them to Android device
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb # @lint-ignore
adb push init_net.pb /data/local/tmp/init_net.pb
# Download model graph and copy it to Android device
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb # @lint-ignore
adb push predict_net.pb /data/local/tmp/predict_net.pb
# Run speed benchmark with 50 warm-up iterations and 10 measurement iterations
@ -134,11 +134,11 @@ scripts/build_android.sh -DANDROID_ABI=arm64-v8a -DANDROID_TOOLCHAIN=clang -DBUI
adb push build_android/bin/speed_benchmark /data/local/tmp/speed_benchmark
# Download model weights and copy them to Android device
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb # @lint-ignore
adb push init_net.pb /data/local/tmp/init_net.pb
# Download model graph and copy it to Android device
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb
wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb # @lint-ignore
adb push predict_net.pb /data/local/tmp/predict_net.pb
# Run speed benchmark with 50 warm-up iterations and 10 measurement iterations

View File

@ -53,7 +53,7 @@ std::unordered_map<CacheKey, cudnn_frontend::ExecutionPlan, at::native::ParamsHa
} // anonymous namespace
// TODO: we can use cudnn_frontend::ExecutionPlanCache when it supports caching
// multiple operators
// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/conv_sample.cpp#L293
// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/legacy_samples/conv_sample.cpp#L295
//static cudnn_frontend::ExecutionPlanCache plan_cache("sample_cache");
// the parameter quantized_output is a quantized tensor

View File

@ -79,7 +79,7 @@ std::unordered_map<CacheKey, cudnn_frontend::ExecutionPlan, at::native::ParamsHa
}
// TODO: we can use cudnn_frontend::ExecutionPlanCache when it supports caching
// multiple operators
// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/conv_sample.cpp#L293
// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/legacy_samples/conv_sample.cpp#L295
//static cudnn_frontend::ExecutionPlanCache plan_cache("sample_cache");
// currently we only support int8 symmetric (zero_point = 0 for inputs and output) quantized linear op

View File

@ -76,7 +76,7 @@ Tensor adaptive_avg_pool2d_quantized_cuda(
// any 3D tensors to 4D prior to using cudnn
// This implementation currently uses the v7 cudnn APIs as v8 cudnn APIs are not yet available for
// pooling operations.
// Consult https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnPoolingForward for
// Consult https://docs.nvidia.com/deeplearning/cudnn/backend/latest/api/cudnn-ops-library.html#cudnnpoolingforward for
// documentation on the APIs
// Currently, it appears there is no cudnn support for dilated pooling -- we will
// submit a feature request for this with cudnn

View File

@ -67,7 +67,7 @@ ORDER BY
workflowName, jobName
"""
ARTIFACTS_QUERY_URL = (
"https://console-api.clickhouse.cloud/.api/query-endpoints/"
"https://console-api.clickhouse.cloud/.api/query-endpoints/" # @lint-ignore
"c1cdfadc-6bb2-4a91-bbf9-3d19e1981cd4/run?format=JSON"
)
CSV_LINTER = str(

View File

@ -473,7 +473,7 @@ class TransformerModel(nn.Module):
return F.log_softmax(output, dim=-1)
# From https://github.com/pytorch/text/blob/master/torchtext/modules
# From https://github.com/pytorch/text/tree/master/torchtext/nn/modules
class MultiheadAttentionContainer(torch.nn.Module):
def __init__(self, nhead, in_proj_container, attention_layer, out_proj):
r"""A multi-head attention container

View File

@ -37,7 +37,7 @@ def numpy_random(dtype, *shapes):
Args:
shapes: int or a sequence of ints to defining the shapes of the tensor
dtype: use the dtypes from numpy
(https://docs.scipy.org/doc/numpy/user/basics.types.html)
(https://numpy.org/doc/stable/user/basics.types.html)
Return:
numpy tensor of dtype
"""

View File

@ -2,7 +2,7 @@
DATASET_ROOT_DIR=$HOME/datasets/
# wget https://storage.googleapis.com/sgk-sc2020/dlmc.tar.gz -P $DATASET_ROOT_DIR
# wget https://storage.googleapis.com/sgk-sc2020/dlmc.tar.gz -P $DATASET_ROOT_DIR # @lint-ignore
# tar -xvf $DATASET_ROOT_DIR/dlmc.tar.gz
echo "!! SPARSE SPMS TIME BENCHMARK!! "

View File

@ -1,6 +1,6 @@
# This is a copy of rnn_attention from MLPerf, with some common sizes hardcoded
# for benchmarking and some control flow stripped out.
# https://github.com/mlperf/training/blob/master/rnn_translator/pytorch/seq2seq/models/attention.py
# https://github.com/mlcommons/training/blob/master/retired_benchmarks/gnmt/pytorch/seq2seq/models/attention.py
import torch

View File

@ -48,7 +48,7 @@ class ScribeUploader:
access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN")
if not access_token:
raise ValueError("Can't find access token from environment variable")
url = "https://graph.facebook.com/scribe_logs"
url = "https://graph.facebook.com/scribe_logs" # @lint-ignore
r = requests.post(
url,
data={

View File

@ -297,7 +297,7 @@ def get_pt_preprocessor_flags():
PT_PREPROCESSOR_FLAGS.append("-DENABLE_PYTORCH_NON_PRODUCTION_BUILDS")
return PT_PREPROCESSOR_FLAGS
# This needs to be kept in sync with https://github.com/pytorch/pytorch/blob/release/1.9/torchgen/gen.py#L892
# This needs to be kept in sync with https://github.com/pytorch/pytorch/blob/release/1.9/torchgen/gen.py#L892 @lint-ignore
PT_BACKEND_HEADERS = [
"CPU",
"CUDA",

View File

@ -241,7 +241,7 @@ using namespace c10::xpu;
#ifdef __HIPCC__
// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work.
// We do this #include here so that C10_HOST_DEVICE and friends will Just Work.
// See https://github.com/ROCm-Developer-Tools/HIP/issues/441
// See https://github.com/ROCm/hip/issues/441
#include <hip/hip_runtime.h>
#endif

View File

@ -98,7 +98,7 @@ if(NOT __AOTRITON_INCLUDED)
"${__AOTRITON_VER}-${__AOTRITON_MANYLINUX}"
"_${__AOTRITON_ARCH}-rocm${__AOTRITON_ROCM}"
"-shared.tar.${__AOTRITON_Z}")
string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/"
string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/" # @lint-ignore
"${__AOTRITON_VER}/${__AOTRITON_FILE}")
ExternalProject_Add(aotriton_external
URL "${__AOTRITON_URL}"

View File

@ -137,7 +137,7 @@ about this by following `this
API concerns itself with scenarios in which you would like to extend
TorchScript with custom operators, which can similarly be serialized and
invoked from C++ during inference. Lastly, the `torch::jit::compile
<https://pytorch.org/cppdocs/api/function_namespacetorch_1_1jit_1a176d99fd5bf0233119a5f49c07a1d01d.html#exhale-function-namespacetorch-1-1jit-1a176d99fd5bf0233119a5f49c07a1d01d>`_
<https://pytorch.org/cppdocs/api/function_namespacetorch_1_1jit_1a8660dc13a6b82336aadac667e6dccba1.html>`_
function may be used to access the TorchScript compiler directly from C++.
C++ Extensions

View File

@ -325,13 +325,13 @@ Python Docs
PyTorch documentation is generated from python source using
`Sphinx <https://www.sphinx-doc.org/en/master/>`__. Generated HTML is
copied to the docs folder in the main branch of
`pytorch.github.io <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__,
`pytorch.org/docs <https://pytorch.org/docs/main>`__,
and is served via GitHub pages.
- Site: https://pytorch.org/docs
- GitHub: https://github.com/pytorch/pytorch/tree/main/docs
- Served from:
`https://github.com/pytorch/pytorch.github.io/tree/master/docs <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__
`https://pytorch.org/docs/main <https://pytorch.org/docs/main>`__
C++ Docs
~~~~~~~~

View File

@ -3708,7 +3708,6 @@ def process_docstring(app, what_, name, obj, options, lines):
lines (List[str]): the lines of the docstring, see above
References:
https://www.sphinx-doc.org/en/1.5.1/_modules/sphinx/ext/autodoc.html
https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html
"""
import re

View File

@ -13,4 +13,4 @@
See the `cuDNN 8 Release Notes`_ for more information.
.. _cuDNN 8 Release Notes: https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_8.html
.. _cuDNN 8 Release Notes: https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-880/release-notes/rel_8.html

View File

@ -141,7 +141,7 @@ network bandwidth. These two environment variables have been pre-tuned by NCCL
for some cloud providers, such as AWS or GCP.
For a full list of NCCL environment variables, please refer to
`NVIDIA NCCL's official documentation <https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html>`_
`NVIDIA NCCL's official documentation <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`_
You can tune NCCL communicators even further using `torch.distributed.ProcessGroupNCCL.NCCLConfig`
and `torch.distributed.ProcessGroupNCCL.Options`. Learn more about them using `help`

View File

@ -141,7 +141,7 @@ Currently, only the "nccl" and "gloo" backends for torch.distributed are support
CUDA API to HIP API mappings in C++
-----------------------------------
Please refer: https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP_API_Guide.html
Please refer: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/api_syntax.html
NOTE: The CUDA_VERSION macro, cudaRuntimeGetVersion and cudaDriverGetVersion APIs do not
semantically map to the same values as HIP_VERSION macro, hipRuntimeGetVersion and

View File

@ -24,9 +24,10 @@ MKL and MAGMA. Here are the steps to build with them.
REM 2.5.3 (CUDA 10.1 10.2 11.0) x (Debug Release)
REM 2.5.2 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release)
REM 2.5.1 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release)
set CUDA_PREFIX=cuda102
set CONFIG=release
curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_PREFIX%_%CONFIG%.7z -o magma.7z
set "CUDA_PREFIX=cuda102"
set "CONFIG=release"
set "HOST=https://s3.amazonaws.com/ossci-windows"
curl -k "%HOST%/magma_2.5.4_%CUDA_PREFIX%_%CONFIG%.7z" -o magma.7z
7z x -aoa magma.7z -omagma
REM Setting essential environment variables

View File

@ -27,7 +27,7 @@ Dependencies
The ONNX exporter depends on extra Python packages:
- `ONNX <https://onnx.ai>`_
- `ONNX Script <https://onnxscript.ai>`_
- `ONNX Script <https://microsoft.github.io/onnxscript>`_
They can be installed through `pip <https://pypi.org/project/pip/>`_:

View File

@ -452,7 +452,7 @@ ONNX operators that represent the function's behavior in ONNX. For example::
.. . ``torch::jit::Value::setType``). This is not required, but it can help the exporter's
.. shape and type inference for down-stream nodes. For a non-trivial example of ``setType``, see
.. ``test_aten_embedding_2`` in
.. `test_operators.py <https://github.com/pytorch/pytorch/blob/main/test/onnx/test_operators.py>`_.
.. `test_operators.py <https://github.com/pytorch/pytorch/blob/release/2.5/test/onnx/test_operators.py#L1179>`_.
.. The example below shows how you can access ``requires_grad`` via the ``Node`` object:

View File

@ -204,7 +204,7 @@ will create the ``OwnerRRef``, and returns an ACK to acknowledge ``{100, 1}``
**G2**, the ``OwnerRRef`` is a child of the ``UserRRef``, and the ``UserRRef``
is not deleted until it receives the ACK from the owner.
.. image:: https://user-images\.githubusercontent\.com/16999635/69164772-98181300-0abe-11ea-93a7-9ad9f757cd94.png
.. image:: https://user-images.githubusercontent.com/16999635/69164772-98181300-0abe-11ea-93a7-9ad9f757cd94.png
:alt: user_to_owner_ret.png
:width: 500 px

View File

@ -209,7 +209,7 @@ Diagnosing TorchInductor Errors
If the error does not occur with the ``"eager"`` backend, then the
backend compiler is the source of the error (`example
error <https://gist.github.com/mlazos/2f13681e3cc6c43b3911f336327032de%5D>`__).
error <https://gist.github.com/mlazos/2f13681e3cc6c43b3911f336327032de>`__).
There are `different choices <./torch.compiler.rst>`__
for backend compilers for TorchDynamo, with TorchInductor
fitting the needs of most users. This section focuses on TorchInductor
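
A minimal sketch of the triage step described above: compile the same function with the "eager" backend to rule TorchInductor in or out (the function here is illustrative, not from the docs):

import torch

def fn(x):
    return torch.sin(x) + torch.cos(x)

x = torch.randn(8)

# If the "eager" backend runs cleanly but "inductor" fails,
# the backend compiler is the likely source of the error.
out_eager = torch.compile(fn, backend="eager")(x)
out_inductor = torch.compile(fn, backend="inductor")(x)
print(torch.allclose(out_eager, out_inductor))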

View File

@ -15,7 +15,7 @@ torch.finfo
.. class:: torch.finfo
A :class:`torch.finfo` is an object that represents the numerical properties of a floating point
:class:`torch.dtype`, (i.e. ``torch.float32``, ``torch.float64``, ``torch.float16``, and ``torch.bfloat16``). This is similar to `numpy.finfo <https://docs.scipy.org/doc/numpy/reference/generated/numpy.finfo.html>`_.
:class:`torch.dtype`, (i.e. ``torch.float32``, ``torch.float64``, ``torch.float16``, and ``torch.bfloat16``). This is similar to `numpy.finfo <https://numpy.org/doc/stable/reference/generated/numpy.finfo.html>`_.
A :class:`torch.finfo` provides the following attributes:
@ -49,7 +49,7 @@ torch.iinfo
A :class:`torch.iinfo` is an object that represents the numerical properties of a integer
:class:`torch.dtype` (i.e. ``torch.uint8``, ``torch.int8``, ``torch.int16``, ``torch.int32``, and ``torch.int64``). This is similar to `numpy.iinfo <https://docs.scipy.org/doc/numpy/reference/generated/numpy.iinfo.html>`_.
:class:`torch.dtype` (i.e. ``torch.uint8``, ``torch.int8``, ``torch.int16``, ``torch.int32``, and ``torch.int64``). This is similar to `numpy.iinfo <https://numpy.org/doc/stable/reference/generated/numpy.iinfo.html>`_.
A :class:`torch.iinfo` provides the following attributes:
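
A short usage sketch of the two classes being documented; values are printed rather than asserted since they depend on the dtype:

import torch

fi = torch.finfo(torch.float16)
print(fi.bits, fi.eps, fi.tiny, fi.max)  # width, machine epsilon, smallest normal, largest value

ii = torch.iinfo(torch.int32)
print(ii.bits, ii.min, ii.max)           # width and representable range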

View File

@ -9,7 +9,7 @@
setup_environment(){
# The rootfs image for a Tizen target (RPi3)is located at the below webpage:
# http://download.tizen.org/releases/milestone/tizen/4.0.m1/tizen-unified_20170529.1/images/
# https://cdn.download.tizen.org/archive/releases/milestone/tizen/4.0.m1/tizen-unified_20170529.1/images/
# If you do not have a Tizen device, Please, run qemu-arm-static and chroot command.
# $ sudo chroot ~/tizen-rootfs qemu-arm-static /usr/bin/bash
@ -25,7 +25,7 @@ caffe2_lite_dep_packages(){
# You can set-up a rpm repository with zypper, yum, and dnf because Tizen
# software platform officially support rpm format such as Fedora, OpenSUSE.
# The official Tizen repository is as following:
# http://download.tizen.org/releases/milestone/tizen/4.0.m1/
# https://cdn.download.tizen.org/archive/releases/milestone/tizen/4.0.m1/
echo "Installing dependencies."
sudo zypper install \
make \
@ -69,7 +69,7 @@ caffe2_full_dep_packages(){
# You can set-up a rpm repository with zypper, yum, and dnf because Tizen
# software platform officially support rpm format such as Fedora, OpenSUSE.
# The official Tizen repository is as following:
# http://download.tizen.org/releases/milestone/tizen/4.0.m1/
# https://cdn.download.tizen.org/archive/releases/milestone/tizen/4.0.m1/
echo "Installing dependencies."
sudo zypper install \
cmake \

View File

@ -212,7 +212,9 @@ headers = {"Authorization": f"token {token}"}
def run_query(query):
request = requests.post(
"https://api.github.com/graphql", json={"query": query}, headers=headers
"https://api.github.com/graphql", # @lint-ignore
json={"query": query},
headers=headers,
)
if request.status_code == 200:
return request.json()

View File

@ -441,7 +441,7 @@ lstm_output_to_device(
}
// This test is a port of python code introduced here:
// https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
// https://medium.com/data-science/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
// Reverse forward of bidirectional GRU should act
// as regular forward of unidirectional GRU
void BidirectionalGRUReverseForward(bool cuda) {

View File

@ -594,7 +594,7 @@ class TestDTensorOps(DTensorOpTestBase):
dtensor_rs = func(*dtensor_args, **dtensor_kwargs)
# we need to skip tests containing tensors of zero elements for now.
# see issue: https://github.com/pytorch/tau/issues/470
# see issue: https://github.com/pytorch/PiPPy/issues/470
# TODO remove this once issue above fixed.
flat_args = pytree.tree_leaves(dtensor_rs)
if any(

View File

@ -192,7 +192,9 @@ class DistElementwiseOpsTest(DTensorOpTestBase):
op=torch.sigmoid,
)
@skip("testing RNG based ops is broken: https://github.com/pytorch/tau/issues/494")
@skip(
"testing RNG based ops is broken: https://github.com/pytorch/PiPPy/issues/494"
)
def test_dropout(self):
device_mesh = self.build_device_mesh()

View File

@ -5169,7 +5169,6 @@ class TestVmapOperatorsOpInfo(TestCase):
xfail("linalg.vecdot"),
# throws in vmap on CUDA
# IndexError: Dimension out of range (expected to be in range of [-1, 0], but got -2)
# https://github.com/pytorch/pytorch/runs/8110653462?check_suite_focus=true
# but it passes locally
xfail("linalg.diagonal"),
skip("linalg.matrix_norm", ""),

View File

@ -581,7 +581,7 @@ class CudaReproTests(TestCase):
"""
This UT tests autotune on an inplace kernel. The autotune should not contaminate
the input buffers when tuning with multiple configs. For more details, refer to
https://github.com/openai/triton/issues/781
https://github.com/triton-lang/triton/issues/781
https://github.com/pytorch/torchdynamo/issues/1670
"""
from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
@ -1587,7 +1587,7 @@ class CudaReproTests(TestCase):
@config.patch("triton.use_block_ptr", True)
def test_selecsls42b_misaligned_address(self):
# https://github.com/openai/triton/issues/2836
# https://github.com/triton-lang/triton/issues/2836
@torch.compile(fullgraph=True)
def fn(arg207_1, arg208_1, convert_element_type_40, expand, full, mul_3):

View File

@ -431,7 +431,7 @@ class ForeachTests(TestCase):
@requires_cuda
@scalar_bin_ops
@unittest.skip(
"Triton recursion depth exceeded: https://github.com/openai/triton/issues/1763"
"Triton recursion depth exceeded: https://github.com/triton-lang/triton/issues/1763"
)
def test_kernel_split_arg_limit_scalar(self, op):
def fn(a):

View File

@ -95,7 +95,7 @@ class TestIndexingSimplification(InductorTestCase):
ModularIndexing(i0 + i1 * i2 * r3, i2, r3), ModularIndexing(i0, i2, r3)
)
# if there are negative terms, we cannot optimize away zero terms due to https://github.com/openai/triton/issues/619
# if there are negative terms, we cannot optimize away zero terms due to https://github.com/triton-lang/triton/issues/619
self.assertEqual(
ModularIndexing(-i0 + i1 * 20, 2, 10), ModularIndexing(-i0 + i1 * 20, 2, 10)
)
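
A quick numeric check of the simplification being tested, assuming ModularIndexing(a, b, c) means (a // b) % c (an assumption for this sketch): adding a multiple of b*c to a non-negative base does not change the result, which is why the i1*i2*r3 term can be dropped.

def modular_indexing(a: int, b: int, c: int) -> int:
    # Assumed semantics: floor-divide by b, then take the result modulo c.
    return (a // b) % c

i2, r3 = 4, 8
for i0 in range(64):
    for i1 in range(4):
        assert modular_indexing(i0 + i1 * i2 * r3, i2, r3) == modular_indexing(i0, i2, r3)
print("identity holds for the sampled values")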

View File

@ -166,7 +166,7 @@ class TestMaxAutotune(TestCase):
@parametrize("autotune_multi_device", (True, False))
def test_max_autotune_mm_plus_mm(self, autotune_in_subproc, autotune_multi_device):
"""
This crash previously due to a triton issue: https://github.com/openai/triton/issues/1298 .
This crash previously due to a triton issue: https://github.com/triton-lang/triton/issues/1298 .
With autotuning in subprocess, we don't crash anymore.
"""
m, n, k = 2048, 1536, 64

View File

@ -2969,7 +2969,7 @@ class CommonTemplate:
return torch.round(a), torch.round(b + 1), torch.round(a, decimals=2)
# without manual_seed, there is some chance this test fails due to:
# https://github.com/openai/triton/issues/530
# https://github.com/triton-lang/triton/issues/530
torch.manual_seed(0)
# with *100 we are always getting a number exactly at .5 which we don't do right in half
@ -7957,7 +7957,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
torch.randint(0, 100, size=[600], dtype=torch.int64),
torch.randn([600, 256, 7, 7]),
],
# workaround for https://github.com/openai/triton/issues/558
# workaround for https://github.com/triton-lang/triton/issues/558
check_lowp=False,
)

View File

@ -52,7 +52,7 @@ class TestQuantizedFunctionalOps(QuantizationTestCase):
# Make sure the results match
# assert_array_almost_equal compares using the following formula:
# abs(desired-actual) < 1.5 * 10**(-decimal)
# (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html)
# (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html)
# We use decimal = 0 to ignore off-by-1 differences between reference
# and test. Off-by-1 differences arise due to the order of round and
# zero_point addition operation, i.e., if addition followed by round is

View File

@ -320,7 +320,7 @@ class TestStaticQuantizedModule(QuantizationTestCase):
# Make sure the results match
# assert_array_almost_equal compares using the following formula:
# abs(desired-actual) < 1.5 * 10**(-decimal)
# (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html)
# (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html)
# We use decimal = 0 to ignore off-by-1 differences between reference
# and test. Off-by-1 differences arise due to the order of round and
# zero_point addition operation, i.e., if addition followed by round is

View File

@ -5200,7 +5200,7 @@ class TestQuantizedConv(TestCase):
# Make sure the results match
# assert_array_almost_equal compares using the following formula:
# abs(desired-actual) < 1.5 * 10**(-decimal)
# (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html)
# (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html)
# We use decimal = 0 to ignore off-by-1 differences between
# reference and test. Off-by-1 differences arise due to the order of
# round and zero_point addition operation, i.e., if addition
@ -6935,7 +6935,7 @@ class TestQuantizedConv(TestCase):
# Make sure the results match
# assert_array_almost_equal compares using the following formula:
# abs(desired-actual) < 1.5 * 10**(-decimal)
# (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html)
# (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html)
# We use decimal = 0 to ignore off-by-1 differences between
# reference and test. Off-by-1 differences arise due to the order of
# round and zero_point addition operation, i.e., if addition

View File

@ -11752,7 +11752,7 @@ class TestNNDeviceType(NNTestCase):
with self.assertRaisesRegex(RuntimeError, msg):
F.nll_loss(x, t, weight=weight)
# Ref: https://github.com/pytorch/pytorch/issue/85005
# Ref: https://github.com/pytorch/pytorch/issues/85005
@onlyCUDA
@largeTensorTest("120GB", "cpu")
@largeTensorTest("45GB", "cuda")
@ -11785,7 +11785,7 @@ class TestNNDeviceType(NNTestCase):
with torch.no_grad():
self.assertTrue(torch.allclose(input.grad.cpu(), input_cpu.grad, rtol=rtol, atol=atol))
# Ref: https://github.com/pytorch/pytorch/issue/108345
# Ref: https://github.com/pytorch/pytorch/issues/108345
@onlyCUDA
@largeTensorTest("20GB", "cpu")
@largeTensorTest("20GB", "cuda")

View File

@ -36,7 +36,7 @@ class TestNumbaIntegration(common.TestCase):
version: (int) Version 0
See:
https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html
https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
"""
types = [
@ -250,7 +250,7 @@ class TestNumbaIntegration(common.TestCase):
will use the exposed device memory.
See:
https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html
https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
"""
dtypes = [

View File

@ -1759,7 +1759,6 @@ class TestReductions(TestCase):
# On Windows CI, the current version of `numpy` promotes all lower integers
# dtypes to int32 while `torch` promotes them to int64. Hence we skip on checking
# the exact dtype.
# Reference : https://dr.pytorch.org/api/view-log-full?build_id=122051580
# PR : https://github.com/pytorch/pytorch/pull/38628#issuecomment-655905370
if IS_WINDOWS and is_integral(dtype):
exact_dtype = False

View File

@ -547,7 +547,7 @@ class TestUnaryUfuncs(TestCase):
# sqrt Test Reference: https://github.com/pytorch/pytorch/pull/47424
x = torch.tensor(0.0 - 1.0e20j, dtype=dtype, device=device)
self.compare_with_numpy(torch.sqrt, np.sqrt, x)
# acos test reference: https://github.com/pytorch/pytorch/issue/42952
# acos test reference: https://github.com/pytorch/pytorch/issues/42952
if not (dtype == torch.cdouble and "cuda" in device):
self.compare_with_numpy(torch.acos, np.arccos, x)

View File

@ -4328,7 +4328,7 @@ class TestFromBuffer(TestCase):
@skipif(
IS_PYPY,
reason="PyPy's memoryview currently does not track exports. See: "
"https://foss.heptapod.net/pypy/pypy/-/issues/3724",
"https://github.com/pypy/pypy/issues/3723",
)
def test_mmap_close(self):
# The old buffer protocol was not safe for some things that the new

View File

@ -8,7 +8,7 @@ from urllib.request import urlretrieve
MIRRORS = [
"http://yann.lecun.com/exdb/mnist/",
"https://ossci-datasets.s3.amazonaws.com/mnist/",
"https://ossci-datasets.s3.amazonaws.com/mnist/", # @lint-ignore
]
RESOURCES = [

View File

@ -81,7 +81,7 @@ def get_external_pr_data(
response = cast(
dict[str, Any],
fetch_json(
"https://api.github.com/search/issues",
"https://api.github.com/search/issues", # @lint-ignore
params={
"q": f'repo:pytorch/pytorch is:pr is:closed \
label:"open source" label:Merged -label:Reverted closed:{period_begin_date}..{period_end_date}',

View File

@ -87,7 +87,7 @@ class OssCiUtilizationMetadataV1:
# this data model is for the time series data:
# https://github.com/pytorch/test-infra/blob/main/clickhouse_db_schema/oss_ci_utilization/oss_ci_utilization_time_series_schema.sql
# https://github.com/pytorch/test-infra/blob/main/clickhouse_db_schema/oss_ci_utilization/oss_ci_time_series_schema.sql
@dataclass
class OssCiUtilizationTimeSeriesV1:
created_at: int

View File

@ -12,7 +12,7 @@ MOCK_TEST_DATA = [
"sha": "f02f3046571d21b48af3067e308a1e0f29b43af9",
"id": 7819529276,
"conclusion": "failure",
"htmlUrl": "https://github.com/pytorch/pytorch/runs/7819529276?check_suite_focus=true",
"htmlUrl": "https://github.com/pytorch/pytorch/runs/7819529276?check_suite_focus=true", # @lint-ignore
"logUrl": "https://ossci-raw-job-status.s3.amazonaws.com/log/7819529276",
"durationS": 14876,
"failureLine": "##[error]The action has timed out.",
@ -25,7 +25,7 @@ MOCK_TEST_DATA = [
"sha": "d0d6b1f2222bf90f478796d84a525869898f55b6",
"id": 7818399623,
"conclusion": "failure",
"htmlUrl": "https://github.com/pytorch/pytorch/runs/7818399623?check_suite_focus=true",
"htmlUrl": "https://github.com/pytorch/pytorch/runs/7818399623?check_suite_focus=true", # @lint-ignore
"logUrl": "https://ossci-raw-job-status.s3.amazonaws.com/log/7818399623",
"durationS": 14882,
"failureLine": "##[error]The action has timed out.",

View File

@ -94,7 +94,7 @@ def trigger_upload_test_stats_intermediate_workflow() -> None:
# The GITHUB_TOKEN cannot trigger workflow so this isn't used for now
print("Triggering upload_test_stats_intermediate workflow")
x = requests.post(
"https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches",
"https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches", # noqa: B950 @lint-ignore
headers={
"Accept": "application/vnd.github.v3+json",
"Authorization": f"Bearer {os.environ.get('GITHUB_TOKEN')}",

View File

@ -1267,7 +1267,7 @@ def _should_allow_numbers_as_tensors(func_name: str) -> _bool: ...
def _group_tensors_by_device_and_dtype(nested_tensorlists: List[List[Optional[Tensor]]], with_indices: _bool = False) -> Dict[Tuple[torch.device, torch.dtype], Tuple[List[List[Optional[Tensor]]], List[_int]]]: ...
# NB: There is no Capsule type in typing, see
# https://code.activestate.com/lists/python-dev/139675/
# https://github.com/python/cpython/issues/109562
def _to_dlpack(data: Tensor) -> Any: ... # THPModule_toDLPack
def _from_dlpack(data: Any) -> Tensor: ... # THPModule_fromDLPack
def _get_cpp_backtrace(

View File

@ -41,9 +41,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
See <https://github.com/ActiveState/appdirs> for details and usage.
"""
# Dev Notes:
# - MSDN on where to store app data files:
# http://support.microsoft.com/default.aspx?scid=kb;en-us;310294#XSLTH3194121123120121120120
# - Mac OS X: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/index.html
# - Windows "Known Folders": https://learn.microsoft.com/en-us/windows/win32/shell/csidl
# - macOS File System Programming Guide: https://developer.apple.com/library/archive/documentation/FileManagement/Conceptual/FileSystemProgrammingGuide/Introduction/Introduction.html
# - XDG spec for Un*x: https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
__version__ = "1.4.4"

View File

@ -4389,8 +4389,7 @@ def should_fold(tensor1: torch.Tensor, tensor2: torch.Tensor, is_out: bool) -> b
t1_stride = t1.stride()
# Check the contiguous, we can skip the dim with size of 1
# as aten: https://github.com/pytorch/pytorch/blob/
# e201460f8aa1510b4c4686627d57b69756c4b916/aten/src/ATen/TensorGeometry.cpp#L17
# as aten: https://github.com/pytorch/pytorch/blob/e201460f8aa1510b4c4686627d57b69756c4b916/aten/src/ATen/TensorGeometry.cpp#L17
expected_stride = [1]
for size in reversed(t1_shape[1:]):
expected_stride.append(size * expected_stride[-1])
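
The loop referenced here builds the expected strides of a contiguous tensor from right to left; a standalone sketch of that computation:

def contiguous_strides(shape):
    # Expected strides of a C-contiguous tensor: the last dim has stride 1,
    # and each earlier stride is the product of the sizes to its right.
    strides = [1]
    for size in reversed(shape[1:]):
        strides.append(size * strides[-1])
    return list(reversed(strides))

print(contiguous_strides([2, 3, 4]))  # [12, 4, 1]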

View File

@ -4881,9 +4881,8 @@ class CppScheduling(BaseScheduling):
len(get_call_ranges(_node)) == node.outer_loop_fusion_depth + 1
for _node in node.get_outer_nodes()
):
# Ref to the typical case of local buffer
# in https://github.com/pytorch/pytorch/blob/
# 1115a25c36340554442f28f9570abd42f0aface2/aten/src/ATen/native/cpu/SoftMaxKernel.cpp#L159
# Ref to the typical case of local buffer in
# https://github.com/pytorch/pytorch/blob/1115a25c36340554442f28f9570abd42f0aface2/aten/src/ATen/native/cpu/SoftMaxKernel.cpp#L159 # noqa: B950
# where the buffer is with size of last dim and contiguous.
# Only support this typical case at first.
visited_scheduler_nodes = OrderedSet[str]()

View File

@ -504,7 +504,7 @@ class BlockPtrOptions:
def triton_reshape(
value: str, old_shape: Sequence[sympy.Expr], new_shape: Sequence[sympy.Expr]
) -> str:
"""Workaround https://github.com/openai/triton/issues/2836"""
"""Workaround https://github.com/triton-lang/triton/issues/2836"""
assert isinstance(old_shape, list) and isinstance(new_shape, list)
old_shape_str = [V.kernel.index_to_str(shape) for shape in old_shape]
@ -841,7 +841,7 @@ class TritonOverrides(OpOverrides):
# fp8 data type conversions has min_elem_per_thread requirements.
# Refer to Triton implementations here:
# https://github.com/openai/triton/blob/10f59d8ce04052521c1bc0cb3a3f8b98918fc7e3/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#L10.
# https://github.com/triton-lang/triton/blob/10f59d8ce04052521c1bc0cb3a3f8b98918fc7e3/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#L10.
fp8_dtypes = (
torch.float8_e4m3fn,
torch.float8_e5m2,
@ -1828,7 +1828,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
and len(mask_vars - dense_mask_vars) == 0
and not self.is_indirect_indexing(index)
and have_loop_vars
# workaround https://github.com/openai/triton/issues/2821
# workaround https://github.com/triton-lang/triton/issues/2821
and self.index_dtype == "tl.int32"
):
@ -2053,7 +2053,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
) -> tuple[str, str]:
check = indexing.boundary_check()
if not check:
# workaround https://github.com/openai/triton/issues/2813
# workaround https://github.com/triton-lang/triton/issues/2813
other = ""
elif other:
assert other == ", other=0.0"
@ -2114,7 +2114,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
value, indexing.final_shape, indexing.block_shape, False
)
# workaround https://github.com/openai/triton/issues/2814
# workaround https://github.com/triton-lang/triton/issues/2814
value = f"{value}.to({triton_store_type(V.graph.get_dtype(name))})"
return f"tl.store({block_ptr}, {value}{other})"
@ -2260,7 +2260,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
line += ".to(tl.float32)"
dtype = torch.float32
if dtype == torch.bool and torch.version.hip is None:
# Workaround for https://github.com/openai/triton/issues/2151
# Workaround for https://github.com/triton-lang/triton/issues/2151
# tl.load returns int8 when loading from pointer to int1
# NOTE: Currently causes hangs on bool UTs for ROCm
line += ".to(tl.int1)"
@ -2302,7 +2302,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None)
# Guard against write-after-read corruption in triton.
# See # https://github.com/openai/triton/issues/1615
# See # https://github.com/triton-lang/triton/issues/1615
# This triton bug means that a load which is broadcasted over multiple
# warps may see the result of a store that happens later in the triton
# program. The workaround is to add a barrier before storing, which
@ -3655,7 +3655,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
# when they are not constexpr. otherwise there may be a segfault
# during launching the Inductor-compiled Triton kernel.
# https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307
# https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
# https://github.com/triton-lang/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
for arg_num in equal_1_arg_indices(signature): # type: ignore[index]
triton_meta["constants"][signature[arg_num].name] = 1 # type: ignore[index,union-attr]

View File

@ -34,7 +34,7 @@ def should_unwrap_unspec_arg(name: str):
def signature_of(arg: KernelArgType, *, size_dtype: Optional[str]) -> str:
if isinstance(arg, TensorArg):
# TODO: Remove fp8 special handling when Triton supports PyTorch fp8 dtypes.
# Related PR: https://github.com/openai/triton/pull/2279/
# Related PR: https://github.com/triton-lang/triton/pull/2279/
if arg.dtype == torch.float8_e4m3fn:
tye = "*fp8e4nv"
elif arg.dtype == torch.float8_e5m2:
@ -184,7 +184,7 @@ def config_of(
def is_aligned(x: KernelArgType, alignment: int, include_tensor: bool) -> bool:
"""
Roughly follow triton code here:
https://github.com/openai/triton/blob/5282ed890d453e10b9ee30076ef89115dd197761/python/triton/runtime/jit.py#L208-L222
https://github.com/triton-lang/triton/blob/5282ed890d453e10b9ee30076ef89115dd197761/python/triton/runtime/jit.py#L208-L222
"""
if isinstance(x, TensorArg):
if include_tensor:

View File

@ -1985,7 +1985,7 @@ class PythonWrapperCodegen(CodeGen):
# TODO(aakhundov): add None args to constants, too. currently, this
# causes CUDA errors in test_aot_inductor.test_triton_kernel_with_none_input.
# https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307
# https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
# https://github.com/triton-lang/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
"constants": {
**constants,
**dict.fromkeys(equal_to_1_args, 1),

View File

@ -1158,7 +1158,7 @@ class triton:
# of registers being benchmarked.
#
# NOTE: triton will always report >0 register spills for kernels using sin/cos.
# (check this issue https://github.com/openai/triton/issues/1756 )
# (check this issue https://github.com/triton-lang/triton/issues/1756 )
# So far we see a fixed 8 spilled registers for kernels using sin/cos.
# Raise the threshold to 16 to be safe.
# We should revisit this once we understand more of the source of register spills.

View File

@ -326,7 +326,7 @@ def should_exclude_padding_time(match: Match, arg_name: str) -> bool:
if not fetch_fake_tensors(match, (arg_name,))[0].is_contiguous():
return False
# TODO - see issue https://githpub.com/pytorch/pytorch/issues/128889
# TODO - see issue https://github.com/pytorch/pytorch/issues/128889
# We would only able to completely plan these out if we were only doing
# first dimension padding. non-first we would still need a copy
# because these outputs are fixed dense.

View File

@ -2185,8 +2185,7 @@ def _register_qlinear_weight_prepack():
# Step 2: register patterns from bmm
# Linear might be decomposed into bmm when input dim exceeds 2 and not contiguous
# refer to:
# https://github.com/pytorch/pytorch/blob/
# 80c07df659362a95da7cd4f3ec367abfdace38c4/torch/_decomp/decompositions.py#L3965-L3968
# https://github.com/pytorch/pytorch/blob/80c07df659362a95da7cd4f3ec367abfdace38c4/torch/_decomp/decompositions.py#L3965-L3968
# in this case, we can convert it back to qlinear
for dtype, with_bias, is_tensor_overload in itertools.product(
[torch.float32, torch.bfloat16], [True, False], [True, False]

View File

@ -620,7 +620,7 @@ def convolution(
PADDING_W=padding[1],
GROUPS=groups,
# TODO(jansel): try unroll for bigger kernels once fixed:
# https://github.com/openai/triton/issues/1254
# https://github.com/triton-lang/triton/issues/1254
UNROLL=is_ones(kernel_shape),
ALLOW_TF32=torch.backends.cudnn.allow_tf32,
num_stages=cfg.num_stages,
@ -643,7 +643,7 @@ def convolution(
PADDING_W=padding[2],
GROUPS=groups,
# TODO(jansel): try unroll for bigger kernels once fixed:
# https://github.com/openai/triton/issues/1254
# https://github.com/triton-lang/triton/issues/1254
UNROLL=is_ones(kernel_shape),
ALLOW_TF32=torch.backends.cudnn.allow_tf32,
num_stages=cfg.num_stages,

View File

@ -134,7 +134,7 @@ def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None):
)
):
# TODO(jansel): support different K values when this is fixed:
# https://github.com/openai/triton/issues/967
# https://github.com/triton-lang/triton/issues/967
return lowerings[aten.add](
lowerings[aten.mm](mat1, mat2), lowerings[aten.mm](mat3, mat4)
)
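
In eager terms, the fallback path above is simply two matmuls followed by an add; a minimal reference sketch (not the Inductor lowering itself):

import torch

def mm_plus_mm_reference(mat1, mat2, mat3, mat4):
    # Equivalent of the lowering fallback: aten.mm twice, then aten.add.
    return torch.mm(mat1, mat2) + torch.mm(mat3, mat4)

a, b = torch.randn(4, 3), torch.randn(3, 5)
c, d = torch.randn(4, 2), torch.randn(2, 5)
print(mm_plus_mm_reference(a, b, c, d).shape)  # torch.Size([4, 5])
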
@ -151,7 +151,7 @@ def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None):
if use_triton_template(layout1):
for config in mm_configs():
# see https://github.com/openai/triton/issues/1298
# see https://github.com/triton-lang/triton/issues/1298
# BLOCK_K = K causes llvm error
if V.graph.sizevars.statically_known_lt(config.kwargs["BLOCK_K"], k1):
mm_plus_mm_template.maybe_append_choice(

View File

@ -6092,7 +6092,7 @@ def div_mode(a, b, rounding_mode=None):
both_boolean = is_boolean_type(a) and is_boolean_type(b)
# floordiv and truncdiv need special handling for integer tensors on Triton,
# see the discussion at https://github.com/openai/triton/issues/605
# see the discussion at https://github.com/triton-lang/triton/issues/605
if rounding_mode == "floor":
assert not both_boolean, "floordiv operands can not be boolean at the same time"
return floordiv(a, b) if both_integer else floor(div(a, b))
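
The distinction handled here is visible at the Python level too; a small example of floor versus trunc division for integer tensors:

import torch

a = torch.tensor([7, -7])
b = torch.tensor([2, 2])

# "floor" rounds toward negative infinity, "trunc" rounds toward zero.
print(torch.div(a, b, rounding_mode="floor"))  # tensor([ 3, -4])
print(torch.div(a, b, rounding_mode="trunc"))  # tensor([ 3, -3])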

View File

@ -707,8 +707,8 @@ def register_onednn_fusion_ops():
assert x_zp.get_numel() == 1, "x_zp is incompatible with oneDNN qlinear"
# When channels less than 8, w_scale/w_zp is Pointwise instead of ConstantBuffer
# Refer to https://github.com/pytorch/pytorch/blob
# /f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577
# Refer to
# https://github.com/pytorch/pytorch/blob/f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577 # noqa: B950
if w_zp is None:
# If w_zp is None, then it's a dummy tensor created to denote the
# absence of a zero point, and thus w is int8 symmetrically quantized.
@ -1018,8 +1018,8 @@ def register_onednn_fusion_ops():
x_zp.realize()
# When channels less than 8, w_scale/w_zp is Pointwise instead of ConstantBuffer
# Refer to https://github.com/pytorch/pytorch/blob
# /f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577
# Refer to
# https://github.com/pytorch/pytorch/blob/f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577 # noqa: B950
w_scale.realize()
w_zp.realize()
if w_zp.get_dtype() != torch.int32 and isinstance(

View File

@ -44,7 +44,7 @@ if triton is not None:
return (backend, arch)
# In the latest triton, math functions were shuffled around into different modules:
# https://github.com/openai/triton/pull/3172
# https://github.com/triton-lang/triton/pull/3172
try:
from triton.language.extra import libdevice

View File

@ -1472,7 +1472,7 @@ class TritonCompileResult(CompileResult[CompiledKernel]):
"metadata",
*call_args,
]
else: # args after CompiledKernel.launch_metadata: https://github.com/openai/triton/pull/3492
else: # args after CompiledKernel.launch_metadata: https://github.com/triton-lang/triton/pull/3492
# Getting the kernel launch args is extremely perf-sensitive. Evaluating
# `bin.launch_metadata` is relatively expensive, and returns None unless a
# `launch_enter_hook` is installed. So if we don't have that hook installed,

View File

@ -4560,7 +4560,7 @@ class Scheduler:
)
return False
except CompilationError as e:
# workaround triton issue: https://github.com/openai/triton/issues/2151
# workaround triton issue: https://github.com/triton-lang/triton/issues/2151
if "Loop-carried variable" in str(e):
fusion_log.debug(
"ComboKernel benchmark: return True because of loop-carried variable"
@ -4574,7 +4574,7 @@ class Scheduler:
try:
ms2, ms2_clone, _path2_list = self.benchmark_combo_kernel(subkernel_nodes)
except CompilationError as e:
# workaround triton issue: https://github.com/openai/triton/issues/2151
# workaround triton issue: https://github.com/triton-lang/triton/issues/2151
if "Loop-carried variable" in str(e):
fusion_log.debug(
"ComboKernel benchmark: return True because of loop-carried variable"

View File

@ -390,7 +390,7 @@ def ceildiv(
def _type_of(key: Optional[torch.dtype]) -> str:
# Use the function here to get rid of dependencies on the Triton during the codegen.
# Refer to Triton implementation here:
# https://github.com/openai/triton/blob/98b5945d2aef679e00ebca8e07c35c3658ec76de/python/triton/runtime/jit.py#L238
# https://github.com/triton-lang/triton/blob/98b5945d2aef679e00ebca8e07c35c3658ec76de/python/triton/runtime/jit.py#L238
# `None` is nullptr. Implicitly convert to *i8.
if key is None:
return "*i8"
@ -1981,7 +1981,7 @@ def get_device_tflops(dtype: torch.dtype) -> int:
assert dtype in (torch.float16, torch.bfloat16, torch.float32)
if inspect.signature(get_max_simd_tflops).parameters.get("clock_rate"):
# Triton API change in https://github.com/openai/triton/pull/2293
# Triton API change in https://github.com/triton-lang/triton/pull/2293
from torch._utils_internal import max_clock_rate
sm_clock = max_clock_rate()

View File

@ -435,7 +435,7 @@ class ndarray:
def item(self, *args):
# Mimic NumPy's implementation with three special cases (no arguments,
# a flat index and a multi-index):
# https://github.com/numpy/numpy/blob/main/numpy/core/src/multiarray/methods.c#L702
# https://github.com/numpy/numpy/blob/main/numpy/_core/src/multiarray/methods.c#L702
if args == ():
return self.tensor.item()
elif len(args) == 1:

View File

@ -1262,7 +1262,7 @@ class Tensor(torch._C.TensorBase):
"""Array view description for cuda tensors.
See:
https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html
https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
"""
if has_torch_function_unary(self):
# TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185
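
A small sketch of reading this property (requires a CUDA device; the dict layout follows the CUDA Array Interface spec linked above):

import torch

if torch.cuda.is_available():
    t = torch.arange(6, device="cuda", dtype=torch.float32).reshape(2, 3)
    desc = t.__cuda_array_interface__
    # Libraries such as Numba read these fields to wrap the memory zero-copy.
    print(desc["shape"], desc["typestr"], desc["version"])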

View File

@ -4163,9 +4163,9 @@ Unlike :meth:`~Tensor.expand`, this function copies the tensor's data.
.. warning::
:meth:`~Tensor.repeat` behaves differently from
`numpy.repeat <https://docs.scipy.org/doc/numpy/reference/generated/numpy.repeat.html>`_,
`numpy.repeat <https://numpy.org/doc/stable/reference/generated/numpy.repeat.html>`_,
but is more similar to
`numpy.tile <https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html>`_.
`numpy.tile <https://numpy.org/doc/stable/reference/generated/numpy.tile.html>`_.
For the operator similar to `numpy.repeat`, see :func:`torch.repeat_interleave`.
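
A quick illustration of the distinction drawn in this warning:

import torch

t = torch.tensor([1, 2, 3])

# Tensor.repeat tiles the whole tensor, like numpy.tile ...
print(t.repeat(2))             # tensor([1, 2, 3, 1, 2, 3])

# ... while repeat_interleave repeats each element, like numpy.repeat.
print(t.repeat_interleave(2))  # tensor([1, 1, 2, 2, 3, 3])
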
Args:

View File

@ -767,7 +767,7 @@ This function checks if :attr:`input` and :attr:`other` satisfy the condition:
"""
+ r"""
elementwise, for all elements of :attr:`input` and :attr:`other`. The behaviour of this function is analogous to
`numpy.allclose <https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html>`_
`numpy.allclose <https://numpy.org/doc/stable/reference/generated/numpy.allclose.html>`_
Args:
input (Tensor): first tensor to compare
@ -13826,7 +13826,7 @@ Returns the indices of the buckets to which each value in the :attr:`input` belo
boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size
as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that
this behavior is opposite the behavior of
`numpy.digitize <https://docs.scipy.org/doc/numpy/reference/generated/numpy.digitize.html>`_.
`numpy.digitize <https://numpy.org/doc/stable/reference/generated/numpy.digitize.html>`_.
More formally, the returned index satisfies the following rules:
.. list-table::

View File

@ -219,7 +219,7 @@ def _vmap(
# The `allow_none_pass_through` argument is a temporary workaround may be removed.
# Currently it enables us to wrap the call in `autograd.grad` to the autograd engine,
# which may return None if any of the inputs are unused. See the issue discussing this:
# https://github.com/facebookresearch/functorch/issues/159.
# https://github.com/pytorch/functorch/issues/159.
@functools.wraps(func)
def wrapped(*args):
_check_out_dims_is_int_or_int_tuple(out_dims, func)

View File

@ -703,8 +703,8 @@ class X86InductorQuantizer(Quantizer):
# Once we've annotated the model with quantization configurations, we also need to annotate
# the output of quantizable operations. For example, if we annotated `maxpool2d` to quantize its inputs,
# we will quantize its output accordingly. This enables us to fuse the dq-operator-q into a quantized op.
# Refer to https://github.com/intel/intel-extension-for-pytorch/blob/
# 90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L487
# Refer to
# https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L487 # noqa: B950
self._annotate_output_for_int8_in_int8_out_pattern_entry(model)
@ -732,8 +732,8 @@ class X86InductorQuantizer(Quantizer):
# Step2: Recipe to propagate annotation for patterns beside conv/linear.
# Go through all the nodes from start to end.
# Recipe refer to https://github.com/intel/intel-extension-for-pytorch/blob/
# 90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L538
# Recipe refer to
# https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L538 # noqa: B950
self._annotate_propagation_quantizable_pattern_entry(
model, quantization_config, filter_fn
@ -1381,9 +1381,9 @@ class X86InductorQuantizer(Quantizer):
) -> None:
r"""
Check and insert observer at output of node in int8_in_int8_out_ops if needed.
Recipe refers to https://github.com/intel/intel-extension-for-pytorch/blob/
90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_utils.py#L495
"""
Recipe refers to
https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_utils.py#L495
""" # noqa: B950
edge_or_node: tuple[Node, Node]
if (node.target in int8_in_int8_out_ops) and (_is_any_annotated([node])):
if node.target == torch.ops.aten.max_pool2d.default:
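A minimal, hedged sketch of configuring this quantizer; the surrounding export/prepare/convert steps are omitted because their entry points have varied across releases.

```python
from torch.ao.quantization.quantizer.x86_inductor_quantizer import (
    X86InductorQuantizer,
    get_default_x86_inductor_quantization_config,
)

quantizer = X86InductorQuantizer()
quantizer.set_global(get_default_x86_inductor_quantization_config())
# prepare_pt2e(...) / convert_pt2e(...) would then drive the annotation recipe above.
```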

View File

@ -635,7 +635,7 @@ static c10::ArrayRef<T> get_set_cached_attr(
// is also to <=5 elements, we don't need to reallocate.
// Note: I tried removing this optimization and tripped ASAN
// in a batchnorm kernel here:
// https://pipelinesghubeus21.actions.githubusercontent.com/mBh68xKhi8LyM7tp3vECvYXNFvuV4gyVGgmYCteuEZP9JH92QN/_apis/pipelines/1/runs/3373307/signedlogcontent/790?urlExpires=2023-09-15T21%3A13%3A51.4327798Z&urlSigningMethod=HMACV1&urlSignature=tDeX7ZqaARVU5NNwyr5yYqqkWq3A2j4z8FFdqYwGr0Q%3D
// https://pipelinesghubeus21.actions.githubusercontent.com/mBh68xKhi8LyM7tp3vECvYXNFvuV4gyVGgmYCteuEZP9JH92QN/_apis/pipelines/1/runs/3373307/signedlogcontent/790?urlExpires=2023-09-15T21%3A13%3A51.4327798Z&urlSigningMethod=HMACV1&urlSignature=tDeX7ZqaARVU5NNwyr5yYqqkWq3A2j4z8FFdqYwGr0Q%3D @lint-ignore
// We should fix this instead.
bool needs_resize = false;
// We need to resize if:

View File

@ -21,7 +21,7 @@ using namespace torch::nn::utils::rnn;
namespace torch::nn {
/// These must line up with the CUDNN mode codes:
/// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t
/// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/api/cudnn-adv-library.html#cudnnrnnmode-t
enum class CuDNNMode { RNN_RELU = 0, RNN_TANH = 1, LSTM = 2, GRU = 3 };
static CuDNNMode get_cudnn_mode_for_rnn(
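For reference, the Python-side modules these mode codes correspond to (a hedged illustration; the enum itself lives in C++ and the hidden sizes are arbitrary):

```python
import torch.nn as nn

nn.RNN(8, 8, nonlinearity="relu")   # RNN_RELU (0)
nn.RNN(8, 8, nonlinearity="tanh")   # RNN_TANH (1), the default nonlinearity
nn.LSTM(8, 8)                       # LSTM (2)
nn.GRU(8, 8)                        # GRU (3)
```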

View File

@ -891,8 +891,8 @@ Tensor logcumsumexp_backward(
return grad;
}
// Reference: https://github.com/tensorflow/tensorflow/blob/
// 2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863
// Reference:
// https://github.com/tensorflow/tensorflow/blob/2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863
auto scalar_min = AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(
at::ScalarType::BFloat16,
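The referenced backward formula can be sanity-checked numerically from Python with `gradcheck` (a hedged spot check, not a substitute for the reference derivation):

```python
import torch

x = torch.randn(5, dtype=torch.double, requires_grad=True)
assert torch.autograd.gradcheck(lambda t: torch.logcumsumexp(t, dim=0), (x,))
```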

View File

@ -15,7 +15,8 @@ namespace torch::jit::fuser::onednn {
// being created for each device. The device handle passed from PyTorch allows
// oneDNN Graph implementation to work on the device specified by PyTorch, which
// is currently CPU, so we only have one engine.
// Ref: https://spec.oneapi.io/onednn-graph/latest/programming_model.html#engine
// Ref:
// https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model#engine
struct Engine {
// CPU engine singleton
static dnnl::engine& getEngine();

View File

@ -1,5 +1,5 @@
# Pytorch - oneDNN Graph API Bridge
This is a PyTorch JIT graph fuser based on [oneDNN Graph API](https://spec.oneapi.io/onednn-graph/latest/programming_model.html), which provides a flexible API for aggressive fusion. Float & BFloat16 inference is supported. However, BFloat16 only performs well on Intel Xeon Cooper Lake platform & beyond, as they have native BFloat16 support. Also, currently, PyTorch has divergent AMP support in JIT & eager modes, so one should disable JIT AMP support & leverage eager mode AMP support to use BFloat16. Please refer to the BFloat16 example below.
This is a PyTorch JIT graph fuser based on [oneDNN Graph API](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model), which provides a flexible API for aggressive fusion. Float & BFloat16 inference is supported. However, BFloat16 only performs well on Intel Xeon Cooper Lake platform & beyond, as they have native BFloat16 support. Also, currently, PyTorch has divergent AMP support in JIT & eager modes, so one should disable JIT AMP support & leverage eager mode AMP support to use BFloat16. Please refer to the BFloat16 example below.
Currently, speedup is achieved only for static shapes, although we'd soon add dynamic-shape support. When oneDNN Graph is enabled, weights are cached, as they're constant during inference.
@ -29,7 +29,7 @@ We have registered optimization passes in the custom pre-passes set of PyTorch:
## Graph Executor
During runtime execution of a (re-written) PyTorch JIT graph, oneDNN graph partitions will be dispatched to the oneDNN graph JIT variadic Operator.
Inside the oneDNN graph JIT Op, input PyTorch tensors of each partition will be mapped to oneDNN graph tensors. The partition will then be [compiled](https://spec.oneapi.io/onednn-graph/latest/programming_model.html#partition) and [executed](https://spec.oneapi.io/onednn-graph/latest/programming_model.html#compiled-partition). The output oneDNN graph tensor will be mapped back to PyTorch tensors to be fed to the next operator on the PyTorch JIT graph.
Inside the oneDNN graph JIT Op, input PyTorch tensors of each partition will be mapped to oneDNN graph tensors. The partition will then be [compiled](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model#partition) and [executed](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model#compiled-partition). The output oneDNN graph tensor will be mapped back to PyTorch tensors to be fed to the next operator on the PyTorch JIT graph.
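As a hedged sketch of exercising this path (the model and shapes are illustrative, and only static shapes benefit, as noted above):

```python
import torch

torch.jit.enable_onednn_fusion(True)  # register the oneDNN Graph fuser pass

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).eval()
example = torch.randn(8, 64)
with torch.no_grad():
    traced = torch.jit.freeze(torch.jit.trace(model, example))
    traced(example)  # warm-up runs trigger partition compilation; weights are then cached
    traced(example)
```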
## Tests

View File

@ -143,7 +143,7 @@ ExprHandle abs(const ExprHandle& v) {
}
// The default tanh is quite slow, use the Eigen version from here:
// https://bitbucket.org/eigen/eigen/src/94875feeeeb9abe5509b314197da1991ba2070f5/Eigen/src/Core/MathFunctionsImpl.h#lines-26
// https://github.com/TUW-VieVS/VieSchedpp/blob/master/Eigen/src/Core/MathFunctionsImpl.h#L26
ExprHandle fast_tanh(const ExprHandle& v) {
// TODO: use a dedicated bind-var to make sure v is not evaluated multiple
// times. Clamp the input expression to [-9, 9]
@ -205,7 +205,7 @@ ExprHandle fast_sigmoid(const ExprHandle& x) {
ExprHandle fast_log(const ExprHandle& v) {
// this implementation is taken from sleef:
// https://github.com/shibatch/sleef/blob/master/src/libm/sleefsp.c#L1131
// https://github.com/shibatch/sleef/blob/master/src/libm/sleefsimdsp.c#L1277
// to generate coefficients, this tool is provided
// https://github.com/shibatch/sleef/blob/master/src/gencoef/gencoef.txt
auto ilogb2kf = [](const ExprHandle& x) {
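The general decomposition such fast-log kernels build on (not the sleef polynomial itself) can be sketched as follows; `decomposed_log` is a hypothetical name used only for this illustration:

```python
import math

def decomposed_log(x: float) -> float:
    # split x = m * 2**e with m in [0.5, 1), then log(x) = log(m) + e * log(2);
    # a production kernel replaces log(m) with a short polynomial on that narrow range
    m, e = math.frexp(x)
    return math.log(m) + e * math.log(2.0)

assert abs(decomposed_log(123.456) - math.log(123.456)) < 1e-12
```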

View File

@ -1,6 +1,6 @@
/**
* Cache utils in this file is adapted from PyTorch/XLA
* https://github.com/pytorch/xla/blob/master/third_party/xla_client/cache.h
* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/cache.h
*/
#pragma once

View File

@ -1,6 +1,6 @@
/**
* This file is adapted from PyTorch/XLA
* https://github.com/pytorch/xla/blob/master/third_party/xla_client/metrics.h
* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/metrics.h
*/
#pragma once

View File

@ -1,6 +1,6 @@
/**
* This file is adapted from PyTorch/XLA
* https://github.com/pytorch/xla/blob/master/third_party/xla_client/multi_wait.h
* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/multi_wait.h
*/
#pragma once

View File

@ -1,6 +1,6 @@
/**
* This file is adapted from PyTorch/XLA
* https://github.com/pytorch/xla/blob/master/third_party/xla_client/metrics.h
* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/metrics.h
*/
#pragma once

View File

@ -1,6 +1,6 @@
/**
* Unique in this file is adapted from PyTorch/XLA
* https://github.com/pytorch/xla/blob/master/third_party/xla_client/unique.h
* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/unique.h
*/
#pragma once

Some files were not shown because too many files have changed in this diff.