Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00

Fix broken URLs (#152237)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152237
Approved by: https://github.com/huydhn, https://github.com/malfet

parent: cbcc03c2ad
commit: e2f9759bd0
.github/scripts/github_utils.py (vendored, 2 changes)
@@ -128,7 +128,7 @@ def gh_fetch_json_dict(

 def gh_graphql(query: str, **kwargs: Any) -> dict[str, Any]:
     rc = gh_fetch_url(
-        "https://api.github.com/graphql",
+        "https://api.github.com/graphql", # @lint-ignore
        data={"query": query, "variables": kwargs},
         reader=json.load,
     )
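For context on the hunk above: `gh_graphql` simply POSTs a JSON body with the query and variables to GitHub's GraphQL endpoint. A minimal standalone sketch of that pattern using only the standard library (the token handling here is an assumption; the repo's `gh_fetch_url` wraps this differently):

```python
import json
import os
import urllib.request


def gh_graphql(query: str, **variables) -> dict:
    # POST {"query": ..., "variables": ...} to the GraphQL endpoint,
    # authenticating with a personal access token from the environment.
    payload = json.dumps({"query": query, "variables": variables}).encode()
    req = urllib.request.Request(
        "https://api.github.com/graphql",
        data=payload,
        headers={
            "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)
```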
@@ -64,7 +64,7 @@ endif()

 # This define is needed to preserve behavior given anticpated changes to
 # cccl/thrust
-# https://nvidia.github.io/libcudacxx/standard_api/numerics_library/complex.html
+# https://nvidia.github.io/cccl/libcudacxx/standard_api/numerics_library/complex.html
 string(APPEND CMAKE_CUDA_FLAGS
   " -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS")

@@ -194,7 +194,7 @@ If you want to compile with CUDA support, [select a supported version of CUDA fr
 - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above
 - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA

-Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware
+Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware

 If you want to disable CUDA support, export the environment variable `USE_CUDA=0`.
 Other potentially useful environment variables may be found in `setup.py`.

@@ -221,7 +221,7 @@ Release candidates are currently stored in the following places:

 * Wheels: https://download.pytorch.org/whl/test/
 * Conda: https://anaconda.org/pytorch-test
-* Libtorch: https://download.pytorch.org/libtorch/test
+* Libtorch: https://download.pytorch.org/libtorch/test <!-- @lint-ignore -->

 Backups are stored in a non-public S3 bucket at [`s3://pytorch-backup`](https://s3.console.aws.amazon.com/s3/buckets/pytorch-backup?region=us-east-1&tab=objects)

@@ -322,7 +322,7 @@ Promotion should occur in two steps:
 * Promote S3 artifacts (wheels, libtorch) and Conda packages
 * Promote S3 wheels to PyPI

-**NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypa/warehouse/issues/726 for a discussion on potential draft releases within PyPI)
+**NOTE**: The promotion of wheels to PyPI can only be done once so take caution when attempting to promote wheels to PyPI, (see https://github.com/pypi/warehouse/issues/726 for a discussion on potential draft releases within PyPI)

 ## Additional Steps to prepare for release day

@@ -28,7 +28,7 @@ void ambiguous_autogradother_kernel(OperatorKernel*, const OperatorHandle& op, D
     "Autograd dispatch key for the backend.\n",
     "If you only want to run inference instead of training, in C++, add `c10::InferenceMode mode;` "
     "before model.forward(); in Python, use `torch.inference_mode()` as a context manager (see "
-    "https://pytorch.org/docs/stable/generated/torch.inference_mode.html).",
+    "https://pytorch.org/docs/stable/generated/torch.autograd.grad_mode.inference_mode.html).",
     "\nCanonical state\n~~~~~~~~~~~\n", op.dumpState(), "\n\n");
 }

@@ -410,7 +410,7 @@ template <typename T>
 __host__ __device__ T safe_max(T a, T b) {
 #if defined(__HIPCC__)
   // TODO: remove this special case for HIP when issue is fixed:
-  // https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+  // https://github.com/ROCm/hip/issues/2209
   T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max<T>(a, b));
 #else
   T max = at::_isnan(b) ? b : std::max<T>(a, b);

@@ -470,7 +470,7 @@ template <typename T>
 __host__ __device__ T safe_min(T a, T b) {
 #if defined(__HIPCC__)
   // TODO: remove this special case for HIP when issue is fixed:
-  // https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+  // https://github.com/ROCm/hip/issues/2209
   T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min<T>(a, b));
 #else
   T min = at::_isnan(b) ? b : std::min<T>(a, b);
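The `safe_max`/`safe_min` helpers touched above exist to keep NaN propagation working on HIP, where `std::max`/`std::min` alone would drop a NaN in the first operand. A rough Python sketch of the intended semantics (illustrative only, not the CUDA/HIP code):

```python
import math


def safe_max(a: float, b: float) -> float:
    # A NaN in either operand wins, matching the propagate-NaN helpers above.
    if math.isnan(a):
        return a
    if math.isnan(b):
        return b
    return max(a, b)


assert math.isnan(safe_max(float("nan"), 1.0))
assert safe_max(1.0, 2.0) == 2.0
```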
@@ -1680,7 +1680,7 @@ inline C10_HOST_DEVICE T calc_ndtri(T y0) {
   return x;
 }

-/* The next function is taken from http://ab-initio.mit.edu/Faddeev */
+/* The next function is taken from http://ab-initio.mit.edu/faddeeva */

 /* Copyright (c) 2012 Massachusetts Institute of Technology
  *

@@ -26,7 +26,7 @@ template <typename scalar_t>
 inline C10_DEVICE scalar_t max_propagate_nan(scalar_t a, scalar_t b) {
 #if defined(__HIPCC__)
   // TODO: remove this special case for HIP when issue is fixed:
-  // https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+  // https://github.com/ROCm/hip/issues/2209
   scalar_t max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b));
 #else
   scalar_t max = at::_isnan(b) ? b : std::max(a, b);

@@ -37,7 +37,7 @@ template <typename scalar_t>
 inline C10_DEVICE scalar_t min_propagate_nan(scalar_t a, scalar_t b) {
 #if defined(__HIPCC__)
   // TODO: remove this special case for HIP when issue is fixed:
-  // https://github.com/ROCm-Developer-Tools/HIP/issues/2209
+  // https://github.com/ROCm/hip/issues/2209
   scalar_t min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b));
 #else
   scalar_t min = at::_isnan(b) ? b : std::min(a, b);

@@ -13,7 +13,7 @@

 namespace {
-// Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE_1g57a3c8313f570282a1a7bcc78743b08e,
+// Thin wrapper around https://docs.nvidia.com/cuda/cuda-math-api/cuda_math_api/group__CUDA__MATH__SINGLE.html,
 // to ensure the Cuda math library's isfinite is actually what gets called in
 // _amp_non_finite_check_and_unscale_cuda_'s gpu_kernel lambda.
 //

@@ -766,7 +766,7 @@ const auto sinc_string = jiterator_stringify(
 ); // sinc_string

 const auto erfcx_string = jiterator_stringify(
-  /* The next function is taken from http://ab-initio.mit.edu/Faddeev */
+  /* The next function is taken from http://ab-initio.mit.edu/faddeeva */

   /* Copyright (c) 2012 Massachusetts Institute of Technology
    *

@@ -1865,8 +1865,6 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
       // We require to perform ?geqrf_gpu again due to this bug in MAGMA:
       // - ?geqrf_gpu allows fast computation of Q via ?orgqr_gpu, but doesn't give R properly.
       // - ?geqrf2_gpu gives correct R, but doesn't allow computation of Q via ?orgqr_gpu
-      // Refer to the below link for more details:
-      // http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=1015&p=2800&hilit=geqrf_gpu#p2800
     case at::LinalgBackend::Magma:
       return geqrf_magma(input, tau);
     case at::LinalgBackend::Cusolver:
@@ -347,7 +347,7 @@ struct BenchmarkCache {

 // @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to
 // be thread safe across all engines see Limitations in
-// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html
+// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html
 thread_local BenchmarkCache<cudnn_frontend::ExecutionPlan, CacheKeyWrapper>
     benchmark_cache;
 thread_local BenchmarkCache<cudnn_frontend::ExecutionPlan, CacheKeyFusedWrapper>

@@ -296,7 +296,7 @@ struct MHAGraphCache {

 // @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to
 // be thread safe across all engines see Limitations in
-// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html
+// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/release-notes.html
 thread_local MHAGraphCache<graph_and_tensors, MHACacheKeyWrapper> mhagraphcache;
 thread_local MHAGraphCache<graph_and_tensors_backward, MHACacheKeyWrapper>
     mhagraphbackwardcache;
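The comment in the two hunks above is the whole design rationale: cuDNN frontend execution plans are not guaranteed to be thread safe, so each thread keeps its own cache instead of sharing one. A hedged Python sketch of the same pattern, with `threading.local` standing in for C++ `thread_local` (names are illustrative):

```python
import threading

_tls = threading.local()


def plan_cache() -> dict:
    # One independent cache per thread, so cached plans are never shared
    # across threads that might race on non-thread-safe objects.
    if not hasattr(_tls, "cache"):
        _tls.cache = {}
    return _tls.cache
```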
@@ -1204,7 +1204,7 @@ cudnnRNNAlgo_t get_algo(
   // Persistent algos typically don't work for packed inputs with sequence
   // lengths that vary across batch elements, and will return
   // CUDNN_STATUS_NOT_SUPPORTED if attempted. See
-  // https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#features-of-rnn-functions
+  // https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-890/developer-guide/index.html#features-of-rnn-functions
   if (!tensors.is_input_packed()) {
     auto cudnnDataType = getCudnnDataType(input);
     if (cudnnDataType != CUDNN_DATA_DOUBLE) {

@@ -1274,7 +1274,7 @@ int64_t _cudnn_rnn_flatten_weight_prologue(
   rnn_desc = rnn.descriptor(handle);

   // Why do we pad to 5 dims here (and elsewhere)?
-  // https://docs.nvidia.com/deeplearning/sdk/cudnn-api/index.html#cudnnRNNForwardTraining
+  // https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-892/api/index.html#cudnnRNNForwardTraining
   // expects descriptors padded to 3 dimensions.
   x_desc.set(flat_buf_datatype, x_geom.sizes(), x_geom.strides(), 5);

@@ -213,7 +213,7 @@ INSTANTIATE_INT4MV(bfloat, 256);
  * 1. Load A and B blocks (32x32 and 64x32 respectively) into shared memory.
  * 2. In 4 simdgroups, calculate the outer product of the loaded blocks. Each simdgroup produces a 2x4 8x8 result.
  *    2.1 For how to use outer product to perform matrix multiplication, refer to
- *        http://mlwiki.org/index.php/Matrix-Matrix_Multiplication#Sum_of_Outer_Products
+ *        https://web.archive.org/web/20230521063455/http://mlwiki.org/index.php/Matrix-Matrix_Multiplication#Sum_of_Outer_Products
  * 3. Repeat 1 & 2 along K axis, with K block size 32, accumulate the result in the 2x4 8x8 block.
  * 4. Dequantize the final result and store it in the output matrix.
  *
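Step 2.1 in the kernel comment above relies on the identity that a matrix product equals the sum of K rank-1 outer products. A small NumPy check of that identity (sizes are arbitrary, not the kernel's block sizes):

```python
import numpy as np

A = np.random.rand(32, 8)   # M x K
B = np.random.rand(8, 64)   # K x N

C = np.zeros((32, 64))
for k in range(A.shape[1]):
    # Accumulate one rank-1 update (outer product) per slice along K.
    C += np.outer(A[:, k], B[k, :])

assert np.allclose(C, A @ B)
```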
@@ -1923,7 +1923,7 @@ namespace {
  * FBGEMM uses vpmaddubsw instruction to multiply activations (uint8_t) and
  * weights (int8_t).
  *
- * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16&expand=3284,3530
+ * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16&expand=3284,3530&ig_expand=4236
  *
  * vpmaddubsw operates on a vector of activations and a vector of
  * weights. If these vectors are
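As a reference for the intrinsic named above, here is a rough NumPy emulation of what `_mm256_maddubs_epi16` computes: adjacent uint8 x int8 products summed pairwise into saturating int16 (illustrative only; it ignores the 256-bit lane structure):

```python
import numpy as np


def maddubs_epi16(a_u8: np.ndarray, b_s8: np.ndarray) -> np.ndarray:
    # Elementwise u8 * s8 products, then adjacent pairs summed and
    # saturated to the int16 range.
    prod = a_u8.astype(np.int32) * b_s8.astype(np.int32)
    pair_sums = prod[0::2] + prod[1::2]
    return np.clip(pair_sums, -32768, 32767).astype(np.int16)


acts = np.array([255, 255, 1, 2], dtype=np.uint8)
wts = np.array([127, 127, -1, -1], dtype=np.int8)
print(maddubs_epi16(acts, wts))  # [32767    -3] -- the first pair saturates
```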
@@ -78,10 +78,10 @@ MAX_JOBS=1 scripts/build_local.sh -DBUILD_BINARY=ON -DBUILD_PYTHON=OFF \
   -DUSE_OBSERVERS=OFF -DUSE_DISTRIBUTED=OFF

 # Download model weights
-wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb
+wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb # @lint-ignore

 # Download model graph
-wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb
+wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb # @lint-ignore

 # Run speed benchmark with 50 warm-up iterations and 10 measurement iterations
 build/bin/speed_benchmark --net predict_net.pb --init_net init_net.pb \

@@ -104,11 +104,11 @@ scripts/build_android.sh -DANDROID_TOOLCHAIN=clang -DBUILD_BINARY=ON
 adb push build_android/bin/speed_benchmark /data/local/tmp/speed_benchmark

 # Download model weights and copy them to Android device
-wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb
+wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb # @lint-ignore
 adb push init_net.pb /data/local/tmp/init_net.pb

 # Download model graph and copy it to Android device
-wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb
+wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb # @lint-ignore
 adb push predict_net.pb /data/local/tmp/predict_net.pb

 # Run speed benchmark with 50 warm-up iterations and 10 measurement iterations

@@ -134,11 +134,11 @@ scripts/build_android.sh -DANDROID_ABI=arm64-v8a -DANDROID_TOOLCHAIN=clang -DBUI
 adb push build_android/bin/speed_benchmark /data/local/tmp/speed_benchmark

 # Download model weights and copy them to Android device
-wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb
+wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/init_net.pb # @lint-ignore
 adb push init_net.pb /data/local/tmp/init_net.pb

 # Download model graph and copy it to Android device
-wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb
+wget https://s3.amazonaws.com/download.caffe2.ai/models/mobilenet_v2_1.0_224_quant/predict_net.pb # @lint-ignore
 adb push predict_net.pb /data/local/tmp/predict_net.pb

 # Run speed benchmark with 50 warm-up iterations and 10 measurement iterations

@@ -53,7 +53,7 @@ std::unordered_map<CacheKey, cudnn_frontend::ExecutionPlan, at::native::ParamsHa
 } // anonymous namespace
 // TODO: we can use cudnn_frontend::ExecutionPlanCache when it supports caching
 // multiple operators
-// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/conv_sample.cpp#L293
+// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/legacy_samples/conv_sample.cpp#L295
 //static cudnn_frontend::ExecutionPlanCache plan_cache("sample_cache");

 // the parameter quantized_output is a quantized tensor

@@ -79,7 +79,7 @@ std::unordered_map<CacheKey, cudnn_frontend::ExecutionPlan, at::native::ParamsHa
 }
 // TODO: we can use cudnn_frontend::ExecutionPlanCache when it supports caching
 // multiple operators
-// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/conv_sample.cpp#L293
+// reference: https://github.com/NVIDIA/cudnn-frontend/blob/main/samples/legacy_samples/conv_sample.cpp#L295
 //static cudnn_frontend::ExecutionPlanCache plan_cache("sample_cache");

 // currently we only support int8 symmetric (zero_point = 0 for inputs and output) quantized linear op

@@ -76,7 +76,7 @@ Tensor adaptive_avg_pool2d_quantized_cuda(
   // any 3D tensors to 4D prior to using cudnn
   // This implementation currently uses the v7 cudnn APIs as v8 cudnn APIs are not yet available for
   // pooling operations.
-  // Consult https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnPoolingForward for
+  // Consult https://docs.nvidia.com/deeplearning/cudnn/backend/latest/api/cudnn-ops-library.html#cudnnpoolingforward for
   // documentation on the APIs
   // Currently, it appears there is no cudnn support for dilated pooling -- we will
   // submit a feature request for this with cudnn

@@ -67,7 +67,7 @@ ORDER BY
     workflowName, jobName
 """
 ARTIFACTS_QUERY_URL = (
-    "https://console-api.clickhouse.cloud/.api/query-endpoints/"
+    "https://console-api.clickhouse.cloud/.api/query-endpoints/" # @lint-ignore
     "c1cdfadc-6bb2-4a91-bbf9-3d19e1981cd4/run?format=JSON"
 )
 CSV_LINTER = str(

@@ -473,7 +473,7 @@ class TransformerModel(nn.Module):
         return F.log_softmax(output, dim=-1)


-# From https://github.com/pytorch/text/blob/master/torchtext/modules
+# From https://github.com/pytorch/text/tree/master/torchtext/nn/modules
 class MultiheadAttentionContainer(torch.nn.Module):
     def __init__(self, nhead, in_proj_container, attention_layer, out_proj):
         r"""A multi-head attention container
@@ -37,7 +37,7 @@ def numpy_random(dtype, *shapes):
     Args:
         shapes: int or a sequence of ints to defining the shapes of the tensor
         dtype: use the dtypes from numpy
-            (https://docs.scipy.org/doc/numpy/user/basics.types.html)
+            (https://numpy.org/doc/stable/user/basics.types.html)
     Return:
         numpy tensor of dtype
     """
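For the docstring above, a plausible sketch of what such a helper does (the real benchmark utility may differ; this version is an assumption):

```python
import numpy as np


def numpy_random(dtype, *shapes):
    # One random array per requested shape, cast to the given numpy dtype.
    return [
        np.random.rand(*np.atleast_1d(shape)).astype(dtype) for shape in shapes
    ]


a, b = numpy_random(np.float32, 3, (2, 4))
print(a.shape, b.shape, a.dtype)  # (3,) (2, 4) float32
```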
@@ -2,7 +2,7 @@

 DATASET_ROOT_DIR=$HOME/datasets/

-# wget https://storage.googleapis.com/sgk-sc2020/dlmc.tar.gz -P $DATASET_ROOT_DIR
+# wget https://storage.googleapis.com/sgk-sc2020/dlmc.tar.gz -P $DATASET_ROOT_DIR # @lint-ignore
 # tar -xvf $DATASET_ROOT_DIR/dlmc.tar.gz

 echo "!! SPARSE SPMS TIME BENCHMARK!! "

@@ -1,6 +1,6 @@
 # This is a copy of rnn_attention from MLPerf, with some common sizes hardcoded
 # for benchmarking and some control flow stripped out.
-# https://github.com/mlperf/training/blob/master/rnn_translator/pytorch/seq2seq/models/attention.py
+# https://github.com/mlcommons/training/blob/master/retired_benchmarks/gnmt/pytorch/seq2seq/models/attention.py

 import torch

@@ -48,7 +48,7 @@ class ScribeUploader:
         access_token = os.environ.get("SCRIBE_GRAPHQL_ACCESS_TOKEN")
         if not access_token:
             raise ValueError("Can't find access token from environment variable")
-        url = "https://graph.facebook.com/scribe_logs"
+        url = "https://graph.facebook.com/scribe_logs" # @lint-ignore
         r = requests.post(
             url,
             data={

@@ -297,7 +297,7 @@ def get_pt_preprocessor_flags():
         PT_PREPROCESSOR_FLAGS.append("-DENABLE_PYTORCH_NON_PRODUCTION_BUILDS")
     return PT_PREPROCESSOR_FLAGS

-# This needs to be kept in sync with https://github.com/pytorch/pytorch/blob/release/1.9/torchgen/gen.py#L892
+# This needs to be kept in sync with https://github.com/pytorch/pytorch/blob/release/1.9/torchgen/gen.py#L892 @lint-ignore
 PT_BACKEND_HEADERS = [
     "CPU",
     "CUDA",

@@ -241,7 +241,7 @@ using namespace c10::xpu;
 #ifdef __HIPCC__
 // Unlike CUDA, HIP requires a HIP header to be included for __host__ to work.
 // We do this #include here so that C10_HOST_DEVICE and friends will Just Work.
-// See https://github.com/ROCm-Developer-Tools/HIP/issues/441
+// See https://github.com/ROCm/hip/issues/441
 #include <hip/hip_runtime.h>
 #endif
cmake/External/aotriton.cmake (vendored, 2 changes)
@@ -98,7 +98,7 @@ if(NOT __AOTRITON_INCLUDED)
                   "${__AOTRITON_VER}-${__AOTRITON_MANYLINUX}"
                   "_${__AOTRITON_ARCH}-rocm${__AOTRITON_ROCM}"
                   "-shared.tar.${__AOTRITON_Z}")
-    string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/"
+    string(CONCAT __AOTRITON_URL "https://github.com/ROCm/aotriton/releases/download/" # @lint-ignore
                   "${__AOTRITON_VER}/${__AOTRITON_FILE}")
     ExternalProject_Add(aotriton_external
       URL "${__AOTRITON_URL}"

@@ -137,7 +137,7 @@ about this by following `this
 API concerns itself with scenarios in which you would like to extend
 TorchScript with custom operators, which can similarly be serialized and
 invoked from C++ during inference. Lastly, the `torch::jit::compile
-<https://pytorch.org/cppdocs/api/function_namespacetorch_1_1jit_1a176d99fd5bf0233119a5f49c07a1d01d.html#exhale-function-namespacetorch-1-1jit-1a176d99fd5bf0233119a5f49c07a1d01d>`_
+<https://pytorch.org/cppdocs/api/function_namespacetorch_1_1jit_1a8660dc13a6b82336aadac667e6dccba1.html>`_
 function may be used to access the TorchScript compiler directly from C++.

 C++ Extensions

@@ -325,13 +325,13 @@ Python Docs
 PyTorch documentation is generated from python source using
 `Sphinx <https://www.sphinx-doc.org/en/master/>`__. Generated HTML is
 copied to the docs folder in the main branch of
-`pytorch.github.io <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__,
+`pytorch.org/docs <https://pytorch.org/docs/main>`__,
 and is served via GitHub pages.

 - Site: https://pytorch.org/docs
 - GitHub: https://github.com/pytorch/pytorch/tree/main/docs
 - Served from:
-  `https://github.com/pytorch/pytorch.github.io/tree/master/docs <https://github.com/pytorch/pytorch.github.io/tree/master/docs>`__
+  `https://pytorch.org/docs/main <https://pytorch.org/docs/main>`__

 C++ Docs
 ~~~~~~~~

@@ -3708,7 +3708,6 @@ def process_docstring(app, what_, name, obj, options, lines):
         lines (List[str]): the lines of the docstring, see above

     References:
-        https://www.sphinx-doc.org/en/1.5.1/_modules/sphinx/ext/autodoc.html
         https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html
     """
     import re

@@ -13,4 +13,4 @@

 See the `cuDNN 8 Release Notes`_ for more information.

-.. _cuDNN 8 Release Notes: https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_8.html
+.. _cuDNN 8 Release Notes: https://docs.nvidia.com/deeplearning/cudnn/archives/cudnn-880/release-notes/rel_8.html

@@ -141,7 +141,7 @@ network bandwidth. These two environment variables have been pre-tuned by NCCL
 for some cloud providers, such as AWS or GCP.

 For a full list of NCCL environment variables, please refer to
-`NVIDIA NCCL's official documentation <https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html>`_
+`NVIDIA NCCL's official documentation <https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html>`_

 You can tune NCCL communicators even further using `torch.distributed.ProcessGroupNCCL.NCCLConfig`
 and `torch.distributed.ProcessGroupNCCL.Options`. Learn more about them using `help`

@@ -141,7 +141,7 @@ Currently, only the "nccl" and "gloo" backends for torch.distributed are support
 CUDA API to HIP API mappings in C++
 -----------------------------------

-Please refer: https://rocmdocs.amd.com/en/latest/Programming_Guides/HIP_API_Guide.html
+Please refer: https://rocm.docs.amd.com/projects/HIP/en/latest/reference/api_syntax.html

 NOTE: The CUDA_VERSION macro, cudaRuntimeGetVersion and cudaDriverGetVersion APIs do not
 semantically map to the same values as HIP_VERSION macro, hipRuntimeGetVersion and
@@ -24,9 +24,10 @@ MKL and MAGMA. Here are the steps to build with them.
     REM 2.5.3 (CUDA 10.1 10.2 11.0) x (Debug Release)
     REM 2.5.2 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release)
     REM 2.5.1 (CUDA 9.2 10.0 10.1 10.2) x (Debug Release)
-    set CUDA_PREFIX=cuda102
-    set CONFIG=release
-    curl -k https://s3.amazonaws.com/ossci-windows/magma_2.5.4_%CUDA_PREFIX%_%CONFIG%.7z -o magma.7z
+    set "CUDA_PREFIX=cuda102"
+    set "CONFIG=release"
+    set "HOST=https://s3.amazonaws.com/ossci-windows"
+    curl -k "%HOST%/magma_2.5.4_%CUDA_PREFIX%_%CONFIG%.7z" -o magma.7z
     7z x -aoa magma.7z -omagma

     REM Setting essential environment variables

@@ -27,7 +27,7 @@ Dependencies
 The ONNX exporter depends on extra Python packages:

 - `ONNX <https://onnx.ai>`_
-- `ONNX Script <https://onnxscript.ai>`_
+- `ONNX Script <https://microsoft.github.io/onnxscript>`_

 They can be installed through `pip <https://pypi.org/project/pip/>`_:

@@ -452,7 +452,7 @@ ONNX operators that represent the function's behavior in ONNX. For example::
 ..    ``torch::jit::Value::setType``). This is not required, but it can help the exporter's
 ..    shape and type inference for down-stream nodes. For a non-trivial example of ``setType``, see
 ..    ``test_aten_embedding_2`` in
-..    `test_operators.py <https://github.com/pytorch/pytorch/blob/main/test/onnx/test_operators.py>`_.
+..    `test_operators.py <https://github.com/pytorch/pytorch/blob/release/2.5/test/onnx/test_operators.py#L1179>`_.

 .. The example below shows how you can access ``requires_grad`` via the ``Node`` object:

@@ -204,7 +204,7 @@ will create the ``OwnerRRef``, and returns an ACK to acknowledge ``{100, 1}``
 **G2**, the ``OwnerRRef`` is a child of the ``UserRRef``, and the ``UserRRef``
 is not deleted until it receives the ACK from the owner.

-.. image:: https://user-images\.githubusercontent\.com/16999635/69164772-98181300-0abe-11ea-93a7-9ad9f757cd94.png
+.. image:: https://user-images.githubusercontent.com/16999635/69164772-98181300-0abe-11ea-93a7-9ad9f757cd94.png
     :alt: user_to_owner_ret.png
     :width: 500 px

@@ -209,7 +209,7 @@ Diagnosing TorchInductor Errors

 If the error does not occur with the ``"eager"`` backend, then the
 backend compiler is the source of the error (`example
-error <https://gist.github.com/mlazos/2f13681e3cc6c43b3911f336327032de%5D>`__).
+error <https://gist.github.com/mlazos/2f13681e3cc6c43b3911f336327032de>`__).
 There are `different choices <./torch.compiler.rst>`__
 for backend compilers for TorchDynamo, with TorchInductor
 fitting the needs of most users. This section focuses on TorchInductor
@@ -15,7 +15,7 @@ torch.finfo
 .. class:: torch.finfo

 A :class:`torch.finfo` is an object that represents the numerical properties of a floating point
-:class:`torch.dtype`, (i.e. ``torch.float32``, ``torch.float64``, ``torch.float16``, and ``torch.bfloat16``). This is similar to `numpy.finfo <https://docs.scipy.org/doc/numpy/reference/generated/numpy.finfo.html>`_.
+:class:`torch.dtype`, (i.e. ``torch.float32``, ``torch.float64``, ``torch.float16``, and ``torch.bfloat16``). This is similar to `numpy.finfo <https://numpy.org/doc/stable/reference/generated/numpy.finfo.html>`_.

 A :class:`torch.finfo` provides the following attributes:

@@ -49,7 +49,7 @@ torch.iinfo


 A :class:`torch.iinfo` is an object that represents the numerical properties of a integer
-:class:`torch.dtype` (i.e. ``torch.uint8``, ``torch.int8``, ``torch.int16``, ``torch.int32``, and ``torch.int64``). This is similar to `numpy.iinfo <https://docs.scipy.org/doc/numpy/reference/generated/numpy.iinfo.html>`_.
+:class:`torch.dtype` (i.e. ``torch.uint8``, ``torch.int8``, ``torch.int16``, ``torch.int32``, and ``torch.int64``). This is similar to `numpy.iinfo <https://numpy.org/doc/stable/reference/generated/numpy.iinfo.html>`_.

 A :class:`torch.iinfo` provides the following attributes:
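A quick usage example of the two classes documented in the hunks above (the values in comments are for the noted dtypes):

```python
import torch

fi = torch.finfo(torch.float16)
print(fi.bits, fi.eps, fi.max, fi.tiny)  # 16 0.0009765625 65504.0 6.103515625e-05

ii = torch.iinfo(torch.int8)
print(ii.bits, ii.min, ii.max)  # 8 -128 127
```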
@@ -9,7 +9,7 @@

 setup_environment(){
     # The rootfs image for a Tizen target (RPi3)is located at the below webpage:
-    # http://download.tizen.org/releases/milestone/tizen/4.0.m1/tizen-unified_20170529.1/images/
+    # https://cdn.download.tizen.org/archive/releases/milestone/tizen/4.0.m1/tizen-unified_20170529.1/images/
     # If you do not have a Tizen device, Please, run qemu-arm-static and chroot command.
     # $ sudo chroot ~/tizen-rootfs qemu-arm-static /usr/bin/bash

@@ -25,7 +25,7 @@ caffe2_lite_dep_packages(){
     # You can set-up a rpm repository with zypper, yum, and dnf because Tizen
     # software platform officially support rpm format such as Fedora, OpenSUSE.
     # The official Tizen repository is as following:
-    # http://download.tizen.org/releases/milestone/tizen/4.0.m1/
+    # https://cdn.download.tizen.org/archive/releases/milestone/tizen/4.0.m1/
     echo "Installing dependencies."
     sudo zypper install \
         make \

@@ -69,7 +69,7 @@ caffe2_full_dep_packages(){
     # You can set-up a rpm repository with zypper, yum, and dnf because Tizen
     # software platform officially support rpm format such as Fedora, OpenSUSE.
     # The official Tizen repository is as following:
-    # http://download.tizen.org/releases/milestone/tizen/4.0.m1/
+    # https://cdn.download.tizen.org/archive/releases/milestone/tizen/4.0.m1/
     echo "Installing dependencies."
     sudo zypper install \
         cmake \

@@ -212,7 +212,9 @@ headers = {"Authorization": f"token {token}"}

 def run_query(query):
     request = requests.post(
-        "https://api.github.com/graphql", json={"query": query}, headers=headers
+        "https://api.github.com/graphql", # @lint-ignore
+        json={"query": query},
+        headers=headers,
     )
     if request.status_code == 200:
         return request.json()

@@ -441,7 +441,7 @@ lstm_output_to_device(
 }

 // This test is a port of python code introduced here:
-// https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
+// https://medium.com/data-science/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66
 // Reverse forward of bidirectional GRU should act
 // as regular forward of unidirectional GRU
 void BidirectionalGRUReverseForward(bool cuda) {

@@ -594,7 +594,7 @@ class TestDTensorOps(DTensorOpTestBase):
             dtensor_rs = func(*dtensor_args, **dtensor_kwargs)

             # we need to skip tests containing tensors of zero elements for now.
-            # see issue: https://github.com/pytorch/tau/issues/470
+            # see issue: https://github.com/pytorch/PiPPy/issues/470
             # TODO remove this once issue above fixed.
             flat_args = pytree.tree_leaves(dtensor_rs)
             if any(

@@ -192,7 +192,9 @@ class DistElementwiseOpsTest(DTensorOpTestBase):
             op=torch.sigmoid,
         )

-    @skip("testing RNG based ops is broken: https://github.com/pytorch/tau/issues/494")
+    @skip(
+        "testing RNG based ops is broken: https://github.com/pytorch/PiPPy/issues/494"
+    )
     def test_dropout(self):
         device_mesh = self.build_device_mesh()
@@ -5169,7 +5169,6 @@ class TestVmapOperatorsOpInfo(TestCase):
         xfail("linalg.vecdot"),
         # throws in vmap on CUDA
         # IndexError: Dimension out of range (expected to be in range of [-1, 0], but got -2)
-        # https://github.com/pytorch/pytorch/runs/8110653462?check_suite_focus=true
         # but it passes locally
         xfail("linalg.diagonal"),
         skip("linalg.matrix_norm", ""),

@@ -581,7 +581,7 @@ class CudaReproTests(TestCase):
         """
        This UT tests autotune on an inplace kernel. The autotune should not contaminate
         the input buffers when tuning with multiple configs. For more details, refer to
-        https://github.com/openai/triton/issues/781
+        https://github.com/triton-lang/triton/issues/781
         https://github.com/pytorch/torchdynamo/issues/1670
         """
         from torch._C import _cuda_getCurrentRawStream as get_cuda_stream

@@ -1587,7 +1587,7 @@ class CudaReproTests(TestCase):

     @config.patch("triton.use_block_ptr", True)
     def test_selecsls42b_misaligned_address(self):
-        # https://github.com/openai/triton/issues/2836
+        # https://github.com/triton-lang/triton/issues/2836

         @torch.compile(fullgraph=True)
         def fn(arg207_1, arg208_1, convert_element_type_40, expand, full, mul_3):

@@ -431,7 +431,7 @@ class ForeachTests(TestCase):
     @requires_cuda
     @scalar_bin_ops
     @unittest.skip(
-        "Triton recursion depth exceeded: https://github.com/openai/triton/issues/1763"
+        "Triton recursion depth exceeded: https://github.com/triton-lang/triton/issues/1763"
     )
     def test_kernel_split_arg_limit_scalar(self, op):
         def fn(a):

@@ -95,7 +95,7 @@ class TestIndexingSimplification(InductorTestCase):
             ModularIndexing(i0 + i1 * i2 * r3, i2, r3), ModularIndexing(i0, i2, r3)
         )

-        # if there are negative terms, we cannot optimize away zero terms due to https://github.com/openai/triton/issues/619
+        # if there are negative terms, we cannot optimize away zero terms due to https://github.com/triton-lang/triton/issues/619
         self.assertEqual(
             ModularIndexing(-i0 + i1 * 20, 2, 10), ModularIndexing(-i0 + i1 * 20, 2, 10)
         )

@@ -166,7 +166,7 @@ class TestMaxAutotune(TestCase):
     @parametrize("autotune_multi_device", (True, False))
     def test_max_autotune_mm_plus_mm(self, autotune_in_subproc, autotune_multi_device):
         """
-        This crash previously due to a triton issue: https://github.com/openai/triton/issues/1298 .
+        This crash previously due to a triton issue: https://github.com/triton-lang/triton/issues/1298 .
         With autotuning in subprocess, we don't crash anymore.
         """
         m, n, k = 2048, 1536, 64

@@ -2969,7 +2969,7 @@ class CommonTemplate:
             return torch.round(a), torch.round(b + 1), torch.round(a, decimals=2)

         # without manual_seed, there is some chance this test fails due to:
-        # https://github.com/openai/triton/issues/530
+        # https://github.com/triton-lang/triton/issues/530
         torch.manual_seed(0)

         # with *100 we are always getting a number exactly at .5 which we don't do right in half

@@ -7957,7 +7957,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
                 torch.randint(0, 100, size=[600], dtype=torch.int64),
                 torch.randn([600, 256, 7, 7]),
             ],
-            # workaround for https://github.com/openai/triton/issues/558
+            # workaround for https://github.com/triton-lang/triton/issues/558
             check_lowp=False,
         )
@@ -52,7 +52,7 @@ class TestQuantizedFunctionalOps(QuantizationTestCase):
         # Make sure the results match
         # assert_array_almost_equal compares using the following formula:
         #     abs(desired-actual) < 1.5 * 10**(-decimal)
-        # (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html)
+        # (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html)
         # We use decimal = 0 to ignore off-by-1 differences between reference
         # and test. Off-by-1 differences arise due to the order of round and
         # zero_point addition operation, i.e., if addition followed by round is
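The tolerance formula quoted in the comment above is easy to check directly; with `decimal=0` the allowed gap is 1.5, so an off-by-one quantization difference still passes:

```python
import numpy as np
from numpy.testing import assert_array_almost_equal

# abs(desired - actual) < 1.5 * 10**(-0) == 1.5, so an off-by-one difference passes ...
assert_array_almost_equal(np.array([10, 20]), np.array([11, 20]), decimal=0)

# ... while a difference of 2 raises an AssertionError.
try:
    assert_array_almost_equal(np.array([10]), np.array([12]), decimal=0)
except AssertionError:
    print("difference of 2 exceeds the decimal=0 tolerance")
```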
@@ -320,7 +320,7 @@ class TestStaticQuantizedModule(QuantizationTestCase):
         # Make sure the results match
         # assert_array_almost_equal compares using the following formula:
         #     abs(desired-actual) < 1.5 * 10**(-decimal)
-        # (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html)
+        # (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html)
         # We use decimal = 0 to ignore off-by-1 differences between reference
         # and test. Off-by-1 differences arise due to the order of round and
         # zero_point addition operation, i.e., if addition followed by round is

@@ -5200,7 +5200,7 @@ class TestQuantizedConv(TestCase):
         # Make sure the results match
         # assert_array_almost_equal compares using the following formula:
         #     abs(desired-actual) < 1.5 * 10**(-decimal)
-        # (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html)
+        # (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html)
         # We use decimal = 0 to ignore off-by-1 differences between
         # reference and test. Off-by-1 differences arise due to the order of
         # round and zero_point addition operation, i.e., if addition

@@ -6935,7 +6935,7 @@ class TestQuantizedConv(TestCase):
         # Make sure the results match
         # assert_array_almost_equal compares using the following formula:
         #     abs(desired-actual) < 1.5 * 10**(-decimal)
-        # (https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_almost_equal.html)
+        # (https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html)
         # We use decimal = 0 to ignore off-by-1 differences between
         # reference and test. Off-by-1 differences arise due to the order of
         # round and zero_point addition operation, i.e., if addition

@@ -11752,7 +11752,7 @@ class TestNNDeviceType(NNTestCase):
         with self.assertRaisesRegex(RuntimeError, msg):
             F.nll_loss(x, t, weight=weight)

-    # Ref: https://github.com/pytorch/pytorch/issue/85005
+    # Ref: https://github.com/pytorch/pytorch/issues/85005
     @onlyCUDA
     @largeTensorTest("120GB", "cpu")
     @largeTensorTest("45GB", "cuda")

@@ -11785,7 +11785,7 @@ class TestNNDeviceType(NNTestCase):
             with torch.no_grad():
                 self.assertTrue(torch.allclose(input.grad.cpu(), input_cpu.grad, rtol=rtol, atol=atol))

-    # Ref: https://github.com/pytorch/pytorch/issue/108345
+    # Ref: https://github.com/pytorch/pytorch/issues/108345
     @onlyCUDA
     @largeTensorTest("20GB", "cpu")
     @largeTensorTest("20GB", "cuda")
@@ -36,7 +36,7 @@ class TestNumbaIntegration(common.TestCase):
             version: (int) Version 0

         See:
-        https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html
+        https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
         """

         types = [

@@ -250,7 +250,7 @@ class TestNumbaIntegration(common.TestCase):
         will use the exposed device memory.

         See:
-        https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html
+        https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
         """

         dtypes = [

@@ -1759,7 +1759,6 @@ class TestReductions(TestCase):
         # On Windows CI, the current version of `numpy` promotes all lower integers
         # dtypes to int32 while `torch` promotes them to int64. Hence we skip on checking
         # the exact dtype.
-        # Reference : https://dr.pytorch.org/api/view-log-full?build_id=122051580
         # PR : https://github.com/pytorch/pytorch/pull/38628#issuecomment-655905370
         if IS_WINDOWS and is_integral(dtype):
             exact_dtype = False

@@ -547,7 +547,7 @@ class TestUnaryUfuncs(TestCase):
         # sqrt Test Reference: https://github.com/pytorch/pytorch/pull/47424
         x = torch.tensor(0.0 - 1.0e20j, dtype=dtype, device=device)
         self.compare_with_numpy(torch.sqrt, np.sqrt, x)
-        # acos test reference: https://github.com/pytorch/pytorch/issue/42952
+        # acos test reference: https://github.com/pytorch/pytorch/issues/42952
         if not (dtype == torch.cdouble and "cuda" in device):
             self.compare_with_numpy(torch.acos, np.arccos, x)

@@ -4328,7 +4328,7 @@ class TestFromBuffer(TestCase):
     @skipif(
         IS_PYPY,
         reason="PyPy's memoryview currently does not track exports. See: "
-        "https://foss.heptapod.net/pypy/pypy/-/issues/3724",
+        "https://github.com/pypy/pypy/issues/3723",
     )
     def test_mmap_close(self):
         # The old buffer protocol was not safe for some things that the new
@@ -8,7 +8,7 @@ from urllib.request import urlretrieve

 MIRRORS = [
     "http://yann.lecun.com/exdb/mnist/",
-    "https://ossci-datasets.s3.amazonaws.com/mnist/",
+    "https://ossci-datasets.s3.amazonaws.com/mnist/", # @lint-ignore
 ]

 RESOURCES = [
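The MIRRORS list above is meant to be tried in order, falling back when a host is unreachable. A hedged sketch of that pattern (the helper name and error handling are assumptions, not the repo's downloader):

```python
from urllib.error import URLError
from urllib.request import urlretrieve

MIRRORS = [
    "http://yann.lecun.com/exdb/mnist/",
    "https://ossci-datasets.s3.amazonaws.com/mnist/",
]


def download(filename: str) -> None:
    # Try each mirror in order; fall back to the next one on network errors.
    for mirror in MIRRORS:
        try:
            urlretrieve(mirror + filename, filename)
            return
        except (URLError, OSError):
            continue
    raise RuntimeError(f"all mirrors failed for {filename}")
```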
@@ -81,7 +81,7 @@ def get_external_pr_data(
         response = cast(
             dict[str, Any],
             fetch_json(
-                "https://api.github.com/search/issues",
+                "https://api.github.com/search/issues", # @lint-ignore
                 params={
                     "q": f'repo:pytorch/pytorch is:pr is:closed \
                         label:"open source" label:Merged -label:Reverted closed:{period_begin_date}..{period_end_date}',

@@ -87,7 +87,7 @@ class OssCiUtilizationMetadataV1:


 # this data model is for the time series data:
-# https://github.com/pytorch/test-infra/blob/main/clickhouse_db_schema/oss_ci_utilization/oss_ci_utilization_time_series_schema.sql
+# https://github.com/pytorch/test-infra/blob/main/clickhouse_db_schema/oss_ci_utilization/oss_ci_time_series_schema.sql
 @dataclass
 class OssCiUtilizationTimeSeriesV1:
     created_at: int

@@ -12,7 +12,7 @@ MOCK_TEST_DATA = [
         "sha": "f02f3046571d21b48af3067e308a1e0f29b43af9",
         "id": 7819529276,
         "conclusion": "failure",
-        "htmlUrl": "https://github.com/pytorch/pytorch/runs/7819529276?check_suite_focus=true",
+        "htmlUrl": "https://github.com/pytorch/pytorch/runs/7819529276?check_suite_focus=true", # @lint-ignore
         "logUrl": "https://ossci-raw-job-status.s3.amazonaws.com/log/7819529276",
         "durationS": 14876,
         "failureLine": "##[error]The action has timed out.",

@@ -25,7 +25,7 @@ MOCK_TEST_DATA = [
         "sha": "d0d6b1f2222bf90f478796d84a525869898f55b6",
         "id": 7818399623,
         "conclusion": "failure",
-        "htmlUrl": "https://github.com/pytorch/pytorch/runs/7818399623?check_suite_focus=true",
+        "htmlUrl": "https://github.com/pytorch/pytorch/runs/7818399623?check_suite_focus=true", # @lint-ignore
         "logUrl": "https://ossci-raw-job-status.s3.amazonaws.com/log/7818399623",
         "durationS": 14882,
         "failureLine": "##[error]The action has timed out.",

@@ -94,7 +94,7 @@ def trigger_upload_test_stats_intermediate_workflow() -> None:
     # The GITHUB_TOKEN cannot trigger workflow so this isn't used for now
     print("Triggering upload_test_stats_intermediate workflow")
     x = requests.post(
-        "https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches",
+        "https://api.github.com/repos/pytorch/pytorch/actions/workflows/upload_test_stats_intermediate.yml/dispatches", # noqa: B950 @lint-ignore
         headers={
             "Accept": "application/vnd.github.v3+json",
             "Authorization": f"Bearer {os.environ.get('GITHUB_TOKEN')}",

@@ -1267,7 +1267,7 @@ def _should_allow_numbers_as_tensors(func_name: str) -> _bool: ...
 def _group_tensors_by_device_and_dtype(nested_tensorlists: List[List[Optional[Tensor]]], with_indices: _bool = False) -> Dict[Tuple[torch.device, torch.dtype], Tuple[List[List[Optional[Tensor]]], List[_int]]]: ...

 # NB: There is no Capsule type in typing, see
-# https://code.activestate.com/lists/python-dev/139675/
+# https://github.com/python/cpython/issues/109562
 def _to_dlpack(data: Tensor) -> Any: ...  # THPModule_toDLPack
 def _from_dlpack(data: Any) -> Tensor: ...  # THPModule_fromDLPack
 def _get_cpp_backtrace(

@@ -41,9 +41,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 See <https://github.com/ActiveState/appdirs> for details and usage.
 """
 # Dev Notes:
-# - MSDN on where to store app data files:
-#   http://support.microsoft.com/default.aspx?scid=kb;en-us;310294#XSLTH3194121123120121120120
-# - Mac OS X: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/index.html
+# - Windows "Known Folders": https://learn.microsoft.com/en-us/windows/win32/shell/csidl
+# - macOS File System Programming Guide: https://developer.apple.com/library/archive/documentation/FileManagement/Conceptual/FileSystemProgrammingGuide/Introduction/Introduction.html
 # - XDG spec for Un*x: https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html

 __version__ = "1.4.4"
@@ -4389,8 +4389,7 @@ def should_fold(tensor1: torch.Tensor, tensor2: torch.Tensor, is_out: bool) -> b
     t1_stride = t1.stride()

     # Check the contiguous, we can skip the dim with size of 1
-    # as aten: https://github.com/pytorch/pytorch/blob/
-    # e201460f8aa1510b4c4686627d57b69756c4b916/aten/src/ATen/TensorGeometry.cpp#L17
+    # as aten: https://github.com/pytorch/pytorch/blob/e201460f8aa1510b4c4686627d57b69756c4b916/aten/src/ATen/TensorGeometry.cpp#L17
     expected_stride = [1]
     for size in reversed(t1_shape[1:]):
         expected_stride.append(size * expected_stride[-1])
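The contiguity check sketched in the hunk above builds the strides a contiguous tensor would have and compares them dim by dim, skipping dims of size 1. A standalone Python sketch of that idea (an assumed simplification of the aten check it references):

```python
def is_contiguous(shape, stride) -> bool:
    # Expected strides of a contiguous tensor: running product of trailing sizes.
    expected = []
    acc = 1
    for size in reversed(shape):
        expected.append(acc)
        acc *= size
    expected.reverse()
    # Dims of size 1 may have any stride, as the aten check allows.
    return all(d == 1 or s == e for d, s, e in zip(shape, stride, expected))


assert is_contiguous((2, 3, 4), (12, 4, 1))
assert not is_contiguous((2, 3, 4), (12, 4, 2))
```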
@@ -4881,9 +4881,8 @@ class CppScheduling(BaseScheduling):
                 len(get_call_ranges(_node)) == node.outer_loop_fusion_depth + 1
                 for _node in node.get_outer_nodes()
             ):
-                # Ref to the typical case of local buffer
-                # in https://github.com/pytorch/pytorch/blob/
-                # 1115a25c36340554442f28f9570abd42f0aface2/aten/src/ATen/native/cpu/SoftMaxKernel.cpp#L159
+                # Ref to the typical case of local buffer in
+                # https://github.com/pytorch/pytorch/blob/1115a25c36340554442f28f9570abd42f0aface2/aten/src/ATen/native/cpu/SoftMaxKernel.cpp#L159 # noqa: B950
                 # where the buffer is with size of last dim and contiguous.
                 # Only support this typical case at first.
                 visited_scheduler_nodes = OrderedSet[str]()

@@ -504,7 +504,7 @@ class BlockPtrOptions:
 def triton_reshape(
     value: str, old_shape: Sequence[sympy.Expr], new_shape: Sequence[sympy.Expr]
 ) -> str:
-    """Workaround https://github.com/openai/triton/issues/2836"""
+    """Workaround https://github.com/triton-lang/triton/issues/2836"""
     assert isinstance(old_shape, list) and isinstance(new_shape, list)

     old_shape_str = [V.kernel.index_to_str(shape) for shape in old_shape]

@@ -841,7 +841,7 @@ class TritonOverrides(OpOverrides):

         # fp8 data type conversions has min_elem_per_thread requirements.
         # Refer to Triton implementations here:
-        # https://github.com/openai/triton/blob/10f59d8ce04052521c1bc0cb3a3f8b98918fc7e3/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#L10.
+        # https://github.com/triton-lang/triton/blob/10f59d8ce04052521c1bc0cb3a3f8b98918fc7e3/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp#L10.
         fp8_dtypes = (
             torch.float8_e4m3fn,
             torch.float8_e5m2,

@@ -1828,7 +1828,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
             and len(mask_vars - dense_mask_vars) == 0
             and not self.is_indirect_indexing(index)
             and have_loop_vars
-            # workaround https://github.com/openai/triton/issues/2821
+            # workaround https://github.com/triton-lang/triton/issues/2821
             and self.index_dtype == "tl.int32"
         ):

@@ -2053,7 +2053,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
     ) -> tuple[str, str]:
         check = indexing.boundary_check()
         if not check:
-            # workaround https://github.com/openai/triton/issues/2813
+            # workaround https://github.com/triton-lang/triton/issues/2813
             other = ""
         elif other:
             assert other == ", other=0.0"

@@ -2114,7 +2114,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
                 value, indexing.final_shape, indexing.block_shape, False
             )

-            # workaround https://github.com/openai/triton/issues/2814
+            # workaround https://github.com/triton-lang/triton/issues/2814
             value = f"{value}.to({triton_store_type(V.graph.get_dtype(name))})"
             return f"tl.store({block_ptr}, {value}{other})"

@@ -2260,7 +2260,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
                 line += ".to(tl.float32)"
                 dtype = torch.float32
             if dtype == torch.bool and torch.version.hip is None:
-                # Workaround for https://github.com/openai/triton/issues/2151
+                # Workaround for https://github.com/triton-lang/triton/issues/2151
                 # tl.load returns int8 when loading from pointer to int1
                 # NOTE: Currently causes hangs on bool UTs for ROCm
                 line += ".to(tl.int1)"

@@ -2302,7 +2302,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
         indexing = self.indexing(index, dense_indexing=True, block_ptr=mode is None)

         # Guard against write-after-read corruption in triton.
-        # See # https://github.com/openai/triton/issues/1615
+        # See # https://github.com/triton-lang/triton/issues/1615
         # This triton bug means that a load which is broadcasted over multiple
         # warps may see the result of a store that happens later in the triton
         # program. The workaround is to add a barrier before storing, which

@@ -3655,7 +3655,7 @@ class TritonKernel(SIMDKernel[TritonCSEVariable]):
             # when they are not constexpr. otherwise there may be a segfault
             # during launching the Inductor-compiled Triton kernel.
             # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307
-            # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
+            # https://github.com/triton-lang/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
             for arg_num in equal_1_arg_indices(signature):  # type: ignore[index]
                 triton_meta["constants"][signature[arg_num].name] = 1  # type: ignore[index,union-attr]
@@ -34,7 +34,7 @@ def should_unwrap_unspec_arg(name: str):
 def signature_of(arg: KernelArgType, *, size_dtype: Optional[str]) -> str:
     if isinstance(arg, TensorArg):
         # TODO: Remove fp8 special handling when Triton supports PyTorch fp8 dtypes.
-        # Related PR: https://github.com/openai/triton/pull/2279/
+        # Related PR: https://github.com/triton-lang/triton/pull/2279/
         if arg.dtype == torch.float8_e4m3fn:
             tye = "*fp8e4nv"
         elif arg.dtype == torch.float8_e5m2:

@@ -184,7 +184,7 @@ def config_of(
     def is_aligned(x: KernelArgType, alignment: int, include_tensor: bool) -> bool:
         """
         Roughly follow triton code here:
-        https://github.com/openai/triton/blob/5282ed890d453e10b9ee30076ef89115dd197761/python/triton/runtime/jit.py#L208-L222
+        https://github.com/triton-lang/triton/blob/5282ed890d453e10b9ee30076ef89115dd197761/python/triton/runtime/jit.py#L208-L222
         """
         if isinstance(x, TensorArg):
             if include_tensor:

@@ -1985,7 +1985,7 @@ class PythonWrapperCodegen(CodeGen):
             # TODO(aakhundov): add None args to constants, too. currently, this
             # causes CUDA errors in test_aot_inductor.test_triton_kernel_with_none_input.
             # https://github.com/pytorch/pytorch/issues/120478#issuecomment-1962822307
-            # https://github.com/openai/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
+            # https://github.com/triton-lang/triton/blob/231efe9ed2d200be0f69a07c298e4342b08efe3d/python/triton/runtime/jit.py#L384
             "constants": {
                 **constants,
                 **dict.fromkeys(equal_to_1_args, 1),

@@ -1158,7 +1158,7 @@ class triton:
     # of registers being benchmarked.
     #
     # NOTE: triton will always report >0 register spills for kernels using sin/cos.
-    # (check this issue https://github.com/openai/triton/issues/1756 )
+    # (check this issue https://github.com/triton-lang/triton/issues/1756 )
     # So far we see a fixed 8 spilled registers for kernels using sin/cos.
     # Raise the threshold to 16 to be safe.
    # We should revisit this once we understand more of the source of register spills.

@@ -326,7 +326,7 @@ def should_exclude_padding_time(match: Match, arg_name: str) -> bool:
     if not fetch_fake_tensors(match, (arg_name,))[0].is_contiguous():
         return False

-    # TODO - see issue https://githpub.com/pytorch/pytorch/issues/128889
+    # TODO - see issue https://github.com/pytorch/pytorch/issues/128889
     # We would only able to completely plan these out if we were only doing
     # first dimension padding. non-first we would still need a copy
     # because these outputs are fixed dense.

@@ -2185,8 +2185,7 @@ def _register_qlinear_weight_prepack():
     # Step 2: register patterns from bmm
     # Linear might be decomposed into bmm when input dim exceeds 2 and not contiguous
     # refer to:
-    # https://github.com/pytorch/pytorch/blob/
-    # 80c07df659362a95da7cd4f3ec367abfdace38c4/torch/_decomp/decompositions.py#L3965-L3968
+    # https://github.com/pytorch/pytorch/blob/80c07df659362a95da7cd4f3ec367abfdace38c4/torch/_decomp/decompositions.py#L3965-L3968
     # in this case, we can convert it back to qlinear
     for dtype, with_bias, is_tensor_overload in itertools.product(
         [torch.float32, torch.bfloat16], [True, False], [True, False]

@@ -620,7 +620,7 @@ def convolution(
             PADDING_W=padding[1],
             GROUPS=groups,
             # TODO(jansel): try unroll for bigger kernels once fixed:
-            # https://github.com/openai/triton/issues/1254
+            # https://github.com/triton-lang/triton/issues/1254
             UNROLL=is_ones(kernel_shape),
             ALLOW_TF32=torch.backends.cudnn.allow_tf32,
             num_stages=cfg.num_stages,

@@ -643,7 +643,7 @@ def convolution(
             PADDING_W=padding[2],
             GROUPS=groups,
             # TODO(jansel): try unroll for bigger kernels once fixed:
-            # https://github.com/openai/triton/issues/1254
+            # https://github.com/triton-lang/triton/issues/1254
             UNROLL=is_ones(kernel_shape),
             ALLOW_TF32=torch.backends.cudnn.allow_tf32,
             num_stages=cfg.num_stages,

@@ -134,7 +134,7 @@ def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None):
         )
     ):
         # TODO(jansel): support different K values when this is fixed:
-        # https://github.com/openai/triton/issues/967
+        # https://github.com/triton-lang/triton/issues/967
         return lowerings[aten.add](
             lowerings[aten.mm](mat1, mat2), lowerings[aten.mm](mat3, mat4)
         )

@@ -151,7 +151,7 @@ def tuned_mm_plus_mm(mat1, mat2, mat3, mat4, *, layout=None):

     if use_triton_template(layout1):
         for config in mm_configs():
-            # see https://github.com/openai/triton/issues/1298
+            # see https://github.com/triton-lang/triton/issues/1298
             # BLOCK_K = K causes llvm error
             if V.graph.sizevars.statically_known_lt(config.kwargs["BLOCK_K"], k1):
                 mm_plus_mm_template.maybe_append_choice(
@@ -6092,7 +6092,7 @@ def div_mode(a, b, rounding_mode=None):
     both_boolean = is_boolean_type(a) and is_boolean_type(b)

     # floordiv and truncdiv need special handling for integer tensors on Triton,
-    # see the discussion at https://github.com/openai/triton/issues/605
+    # see the discussion at https://github.com/triton-lang/triton/issues/605
     if rounding_mode == "floor":
         assert not both_boolean, "floordiv operands can not be boolean at the same time"
         return floordiv(a, b) if both_integer else floor(div(a, b))
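The distinction the comment above cares about is floor versus truncating division for negative integers, which is why the two rounding modes need separate lowerings. A one-liner illustration:

```python
import math

a, b = -7, 2
print(a // b)             # -4: floor division rounds toward -inf
print(math.trunc(a / b))  # -3: "truncdiv" rounds toward zero
```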
@ -707,8 +707,8 @@ def register_onednn_fusion_ops():
|
|||
assert x_zp.get_numel() == 1, "x_zp is incompatible with oneDNN qlinear"
|
||||
|
||||
# When channels less than 8, w_scale/w_zp is Pointwise instead of ConstantBuffer
|
||||
# Refer to https://github.com/pytorch/pytorch/blob
|
||||
# /f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577
|
||||
# Refer to
|
||||
# https://github.com/pytorch/pytorch/blob/f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577 # noqa: B950
|
||||
if w_zp is None:
|
||||
# If w_zp is None, then it's a dummy tensor created to denote the
|
||||
# absence of a zero point, and thus w is int8 symmetrically quantized.
|
||||
|
|
@ -1018,8 +1018,8 @@ def register_onednn_fusion_ops():
|
|||
x_zp.realize()
|
||||
|
||||
# When channels less than 8, w_scale/w_zp is Pointwise instead of ConstantBuffer
|
||||
# Refer to https://github.com/pytorch/pytorch/blob
|
||||
# /f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577
|
||||
# Refer to
|
||||
# https://github.com/pytorch/pytorch/blob/f353d17755ed23b02924c962a86ff99a3405fe10/torch/_inductor/graph.py#L570-L577 # noqa: B950
|
||||
w_scale.realize()
|
||||
w_zp.realize()
|
||||
if w_zp.get_dtype() != torch.int32 and isinstance(
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ if triton is not None:
|
|||
return (backend, arch)
|
||||
|
||||
# In the latest triton, math functions were shuffled around into different modules:
|
||||
# https://github.com/openai/triton/pull/3172
|
||||
# https://github.com/triton-lang/triton/pull/3172
|
||||
try:
|
||||
from triton.language.extra import libdevice
|
||||
|
||||
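The hunk above points at Triton's reorganization of its math helpers. A hedged sketch of the kind of compatibility import this implies; the fallback module name in the `except` branch is an assumption and varies across Triton releases:

```python
try:
    # Newer Triton layouts (after the pull request referenced above)
    from triton.language.extra import libdevice
except ImportError:
    # Older releases exposed these helpers elsewhere, e.g. triton.language.math;
    # adjust this branch to whatever the installed Triton actually provides.
    from triton.language import math as libdevice
```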
@@ -1472,7 +1472,7 @@ class TritonCompileResult(CompileResult[CompiledKernel]):
 "metadata",
 *call_args,
 ]
-else: # args after CompiledKernel.launch_metadata: https://github.com/openai/triton/pull/3492
+else: # args after CompiledKernel.launch_metadata: https://github.com/triton-lang/triton/pull/3492
 # Getting the kernel launch args is extremely perf-sensitive. Evaluating
 # `bin.launch_metadata` is relatively expensive, and returns None unless a
 # `launch_enter_hook` is installed. So if we don't have that hook installed,

@@ -4560,7 +4560,7 @@ class Scheduler:
 )
 return False
 except CompilationError as e:
-# workaround triton issue: https://github.com/openai/triton/issues/2151
+# workaround triton issue: https://github.com/triton-lang/triton/issues/2151
 if "Loop-carried variable" in str(e):
 fusion_log.debug(
 "ComboKernel benchmark: return True because of loop-carried variable"

@@ -4574,7 +4574,7 @@ class Scheduler:
 try:
 ms2, ms2_clone, _path2_list = self.benchmark_combo_kernel(subkernel_nodes)
 except CompilationError as e:
-# workaround triton issue: https://github.com/openai/triton/issues/2151
+# workaround triton issue: https://github.com/triton-lang/triton/issues/2151
 if "Loop-carried variable" in str(e):
 fusion_log.debug(
 "ComboKernel benchmark: return True because of loop-carried variable"

@@ -390,7 +390,7 @@ def ceildiv(
 def _type_of(key: Optional[torch.dtype]) -> str:
 # Use the function here to get rid of dependencies on the Triton during the codegen.
 # Refer to Triton implementation here:
-# https://github.com/openai/triton/blob/98b5945d2aef679e00ebca8e07c35c3658ec76de/python/triton/runtime/jit.py#L238
+# https://github.com/triton-lang/triton/blob/98b5945d2aef679e00ebca8e07c35c3658ec76de/python/triton/runtime/jit.py#L238
 # `None` is nullptr. Implicitly convert to *i8.
 if key is None:
 return "*i8"
@@ -1981,7 +1981,7 @@ def get_device_tflops(dtype: torch.dtype) -> int:
 assert dtype in (torch.float16, torch.bfloat16, torch.float32)

 if inspect.signature(get_max_simd_tflops).parameters.get("clock_rate"):
-# Triton API change in https://github.com/openai/triton/pull/2293
+# Triton API change in https://github.com/triton-lang/triton/pull/2293
 from torch._utils_internal import max_clock_rate

 sm_clock = max_clock_rate()

@@ -435,7 +435,7 @@ class ndarray:
 def item(self, *args):
 # Mimic NumPy's implementation with three special cases (no arguments,
 # a flat index and a multi-index):
-# https://github.com/numpy/numpy/blob/main/numpy/core/src/multiarray/methods.c#L702
+# https://github.com/numpy/numpy/blob/main/numpy/_core/src/multiarray/methods.c#L702
 if args == ():
 return self.tensor.item()
 elif len(args) == 1:
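For reference, the three `item` call forms the comment above says are being mimicked behave like this in NumPy itself:

```python
import numpy as np

a = np.arange(6).reshape(2, 3)   # [[0, 1, 2], [3, 4, 5]]
a.item(4)                        # flat index   -> 4
a.item(1, 1)                     # multi-index  -> 4
np.array([7]).item()             # no arguments: only valid for single-element arrays -> 7
```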
@@ -1262,7 +1262,7 @@ class Tensor(torch._C.TensorBase):
 """Array view description for cuda tensors.

 See:
-https://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html
+https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
 """
 if has_torch_function_unary(self):
 # TODO mypy doesn't support @property, see: https://github.com/python/mypy/issues/6185
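The docstring above describes the CUDA Array Interface protocol, which lets other CUDA libraries view a tensor's device memory without copying. A hedged sketch (it assumes a CUDA-enabled build and an available device):

```python
import torch

t = torch.arange(4, device="cuda")
iface = t.__cuda_array_interface__
print(iface["shape"], iface["typestr"])   # e.g. (4,) '<i8'

# Consumers of the protocol, such as Numba, can wrap the same memory zero-copy:
#   from numba import cuda
#   d_arr = cuda.as_cuda_array(t)
```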
@@ -4163,9 +4163,9 @@ Unlike :meth:`~Tensor.expand`, this function copies the tensor's data.
 .. warning::

 :meth:`~Tensor.repeat` behaves differently from
-`numpy.repeat <https://docs.scipy.org/doc/numpy/reference/generated/numpy.repeat.html>`_,
+`numpy.repeat <https://numpy.org/doc/stable/reference/generated/numpy.repeat.html>`_,
 but is more similar to
-`numpy.tile <https://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html>`_.
+`numpy.tile <https://numpy.org/doc/stable/reference/generated/numpy.tile.html>`_.
 For the operator similar to `numpy.repeat`, see :func:`torch.repeat_interleave`.

 Args:
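The difference this warning describes is easy to see on a small tensor: `Tensor.repeat` tiles the whole tensor (like `numpy.tile`), while `torch.repeat_interleave` repeats each element (like `numpy.repeat`):

```python
import torch

x = torch.tensor([1, 2, 3])
x.repeat(2)                      # tensor([1, 2, 3, 1, 2, 3]), tiles like numpy.tile
torch.repeat_interleave(x, 2)    # tensor([1, 1, 2, 2, 3, 3]), like numpy.repeat
```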
@@ -767,7 +767,7 @@ This function checks if :attr:`input` and :attr:`other` satisfy the condition:
 """
 + r"""
 elementwise, for all elements of :attr:`input` and :attr:`other`. The behaviour of this function is analogous to
-`numpy.allclose <https://docs.scipy.org/doc/numpy/reference/generated/numpy.allclose.html>`_
+`numpy.allclose <https://numpy.org/doc/stable/reference/generated/numpy.allclose.html>`_

 Args:
 input (Tensor): first tensor to compare
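With the default tolerances (`rtol=1e-05`, `atol=1e-08`), the elementwise condition `|input - other| <= atol + rtol * |other|` behaves as follows:

```python
import torch

torch.allclose(torch.tensor([10000.0, 1e-07]), torch.tensor([10000.1, 1e-08]))  # False
torch.allclose(torch.tensor([10000.0, 1e-08]), torch.tensor([10000.1, 1e-09]))  # True
```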
@@ -13826,7 +13826,7 @@ Returns the indices of the buckets to which each value in the :attr:`input` belo
 boundaries of the buckets are set by :attr:`boundaries`. Return a new tensor with the same size
 as :attr:`input`. If :attr:`right` is False (default), then the left boundary is open. Note that
 this behavior is opposite the behavior of
-`numpy.digitize <https://docs.scipy.org/doc/numpy/reference/generated/numpy.digitize.html>`_.
+`numpy.digitize <https://numpy.org/doc/stable/reference/generated/numpy.digitize.html>`_.
 More formally, the returned index satisfies the following rules:

 .. list-table::
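A short illustration of the open left boundary described above; note that `right=True` in `torch.bucketize` matches what `numpy.digitize` does by default:

```python
import torch

boundaries = torch.tensor([1, 3, 5, 7, 9])
v = torch.tensor([3, 6, 9])
torch.bucketize(v, boundaries)              # tensor([1, 3, 4]): boundaries[i-1] < v <= boundaries[i]
torch.bucketize(v, boundaries, right=True)  # tensor([2, 3, 5]): boundaries[i-1] <= v < boundaries[i]
```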
@@ -219,7 +219,7 @@ def _vmap(
 # The `allow_none_pass_through` argument is a temporary workaround may be removed.
 # Currently it enables us to wrap the call in `autograd.grad` to the autograd engine,
 # which may return None if any of the inputs are unused. See the issue discussing this:
-# https://github.com/facebookresearch/functorch/issues/159.
+# https://github.com/pytorch/functorch/issues/159.
 @functools.wraps(func)
 def wrapped(*args):
 _check_out_dims_is_int_or_int_tuple(out_dims, func)
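The `None` results that this workaround passes through come from `torch.autograd.grad` when an input does not participate in the computation, for example:

```python
import torch

x = torch.randn(3, requires_grad=True)
y = torch.randn(3, requires_grad=True)
out = (x * 2).sum()                                   # y is unused
torch.autograd.grad(out, (x, y), allow_unused=True)   # (tensor([2., 2., 2.]), None)
```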
@@ -703,8 +703,8 @@ class X86InductorQuantizer(Quantizer):
 # Once we've annotated the model with quantization configurations, we also need to annotate
 # the output of quantizable operations. For example, if we annotated `maxpool2d` to quantize its inputs,
 # we will quantize its output accordingly. This enables us to fuse the dq-operator-q into a quantized op.
-# Refer to https://github.com/intel/intel-extension-for-pytorch/blob/
-# 90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L487
+# Refer to
+# https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L487 # noqa: B950

 self._annotate_output_for_int8_in_int8_out_pattern_entry(model)

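For context, these annotation recipes run when the quantizer is used in the PT2E export quantization flow. A hedged sketch of that flow; the capture entry point has changed across PyTorch releases, so `torch.export.export_for_training` here is an assumption about a recent release, and the toy model is made up:

```python
import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.MaxPool2d(2)).eval()
example_inputs = (torch.randn(1, 3, 32, 32),)
exported = torch.export.export_for_training(model, example_inputs).module()

quantizer = xiq.X86InductorQuantizer()
quantizer.set_global(xiq.get_default_x86_inductor_quantization_config())

prepared = prepare_pt2e(exported, quantizer)   # annotation recipes (including the one above) run here
prepared(*example_inputs)                      # calibration
quantized = convert_pt2e(prepared)
```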
@@ -732,8 +732,8 @@ class X86InductorQuantizer(Quantizer):

 # Step2: Recipe to propagate annotation for patterns beside conv/linear.
 # Go through all the nodes from start to end.
-# Recipe refer to https://github.com/intel/intel-extension-for-pytorch/blob/
-# 90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L538
+# Recipe refer to
+# https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_recipe.py#L538 # noqa: B950

 self._annotate_propagation_quantizable_pattern_entry(
 model, quantization_config, filter_fn

@@ -1381,9 +1381,9 @@ class X86InductorQuantizer(Quantizer):
 ) -> None:
 r"""
 Check and insert observer at output of node in int8_in_int8_out_ops if needed.
-Recipe refers to https://github.com/intel/intel-extension-for-pytorch/blob/
-90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_utils.py#L495
-"""
+Recipe refers to
+https://github.com/intel/intel-extension-for-pytorch/blob/90d19323d96afc53fcc22ba5a7bb3fb07fdd6c1c/intel_extension_for_pytorch/quantization/_utils.py#L495
+""" # noqa: B950
 edge_or_node: tuple[Node, Node]
 if (node.target in int8_in_int8_out_ops) and (_is_any_annotated([node])):
 if node.target == torch.ops.aten.max_pool2d.default:

@@ -635,7 +635,7 @@ static c10::ArrayRef<T> get_set_cached_attr(
 // is also to <=5 elements, we don't need to reallocate.
 // Note: I tried removing this optimization and tripped ASAN
 // in a batchnorm kernel here:
-// https://pipelinesghubeus21.actions.githubusercontent.com/mBh68xKhi8LyM7tp3vECvYXNFvuV4gyVGgmYCteuEZP9JH92QN/_apis/pipelines/1/runs/3373307/signedlogcontent/790?urlExpires=2023-09-15T21%3A13%3A51.4327798Z&urlSigningMethod=HMACV1&urlSignature=tDeX7ZqaARVU5NNwyr5yYqqkWq3A2j4z8FFdqYwGr0Q%3D
+// https://pipelinesghubeus21.actions.githubusercontent.com/mBh68xKhi8LyM7tp3vECvYXNFvuV4gyVGgmYCteuEZP9JH92QN/_apis/pipelines/1/runs/3373307/signedlogcontent/790?urlExpires=2023-09-15T21%3A13%3A51.4327798Z&urlSigningMethod=HMACV1&urlSignature=tDeX7ZqaARVU5NNwyr5yYqqkWq3A2j4z8FFdqYwGr0Q%3D@lint-ignore
 // We should fix this instead.
 bool needs_resize = false;
 // We need to resize if:

@@ -21,7 +21,7 @@ using namespace torch::nn::utils::rnn;
 namespace torch::nn {

 /// These must line up with the CUDNN mode codes:
-/// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t
+/// https://docs.nvidia.com/deeplearning/cudnn/backend/latest/api/cudnn-adv-library.html#cudnnrnnmode-t
 enum class CuDNNMode { RNN_RELU = 0, RNN_TANH = 1, LSTM = 2, GRU = 3 };

 static CuDNNMode get_cudnn_mode_for_rnn(

@@ -891,8 +891,8 @@ Tensor logcumsumexp_backward(
 return grad;
 }

-// Reference: https://github.com/tensorflow/tensorflow/blob/
-// 2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863
+// Reference:
+// https://github.com/tensorflow/tensorflow/blob/2a5910906a0e0f3dbc186ff9db6386d81a63448c/tensorflow/python/ops/math_grad.py#L1832-L1863

 auto scalar_min = AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(
 at::ScalarType::BFloat16,
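The referenced backward formula belongs to `torch.logcumsumexp`, the numerically stable log-of-cumulative-sum-of-exponentials; a quick eager-mode check of the forward definition the gradient is derived from:

```python
import torch

x = torch.randn(5, requires_grad=True)
y = torch.logcumsumexp(x, dim=0)
ref = torch.log(torch.cumsum(torch.exp(x), dim=0))   # same value, less numerically stable
assert torch.allclose(y, ref, atol=1e-6)
y.sum().backward()                                   # exercises the backward referenced above
```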
@@ -15,7 +15,8 @@ namespace torch::jit::fuser::onednn {
 // being created for each device. The device handle passed from PyTorch allows
 // oneDNN Graph implementation to work on the device specified by PyTorch, which
 // is currently CPU, so we only have one engine.
-// Ref: https://spec.oneapi.io/onednn-graph/latest/programming_model.html#engine
+// Ref:
+// https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model#engine
 struct Engine {
 // CPU engine singleton
 static dnnl::engine& getEngine();

@@ -1,5 +1,5 @@
 # Pytorch - oneDNN Graph API Bridge
-This is a PyTorch JIT graph fuser based on [oneDNN Graph API](https://spec.oneapi.io/onednn-graph/latest/programming_model.html), which provides a flexible API for aggressive fusion. Float & BFloat16 inference is supported. However, BFloat16 only performs well on Intel Xeon Cooper Lake platform & beyond, as they have native BFloat16 support. Also, currently, PyTorch has divergent AMP support in JIT & eager modes, so one should disable JIT AMP support & leverage eager mode AMP support to use BFloat16. Please refer to the BFloat16 example below.
+This is a PyTorch JIT graph fuser based on [oneDNN Graph API](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model), which provides a flexible API for aggressive fusion. Float & BFloat16 inference is supported. However, BFloat16 only performs well on Intel Xeon Cooper Lake platform & beyond, as they have native BFloat16 support. Also, currently, PyTorch has divergent AMP support in JIT & eager modes, so one should disable JIT AMP support & leverage eager mode AMP support to use BFloat16. Please refer to the BFloat16 example below.

 Currently, speedup is achieved only for static shapes, although we'd soon add dynamic-shape support. When oneDNN Graph is enabled, weights are cached, as they're constant during inference.

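For readers who have not used the bridge this README describes, the fuser is toggled through the TorchScript API. A minimal float32 sketch (this is not the BFloat16 example the README refers to, and the toy model is made up):

```python
import torch

torch.jit.enable_onednn_fusion(True)   # turn on the oneDNN Graph fuser

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).eval()
with torch.no_grad():
    fused = torch.jit.freeze(torch.jit.script(model))
    fused(torch.randn(32, 64))         # warm-up runs trigger fusion for this static shape
    out = fused(torch.randn(32, 64))
```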
@@ -29,7 +29,7 @@ We have registered optimization passes in the custom pre-passes set of PyTorch:

 ## Graph Executor
 During runtime execution of a (re-written) PyTorch JIT graph, oneDNN graph partitions will be dispatched to the oneDNN graph JIT variadic Operator.
-Inside the oneDNN graph JIT Op, input PyTorch tensors of each partition will be mapped to oneDNN graph tensors. The partition will then be [compiled](https://spec.oneapi.io/onednn-graph/latest/programming_model.html#partition) and [executed](https://spec.oneapi.io/onednn-graph/latest/programming_model.html#compiled-partition). The output oneDNN graph tensor will be mapped back to PyTorch tensors to be fed to the next operator on the PyTorch JIT graph.
+Inside the oneDNN graph JIT Op, input PyTorch tensors of each partition will be mapped to oneDNN graph tensors. The partition will then be [compiled](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model#partition) and [executed](https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/graph/programming_model#compiled-partition). The output oneDNN graph tensor will be mapped back to PyTorch tensors to be fed to the next operator on the PyTorch JIT graph.


 ## Tests

@@ -143,7 +143,7 @@ ExprHandle abs(const ExprHandle& v) {
 }

 // The default tanh is quite slow, use the Eigen version from here:
-// https://bitbucket.org/eigen/eigen/src/94875feeeeb9abe5509b314197da1991ba2070f5/Eigen/src/Core/MathFunctionsImpl.h#lines-26
+// https://github.com/TUW-VieVS/VieSchedpp/blob/master/Eigen/src/Core/MathFunctionsImpl.h#L26
 ExprHandle fast_tanh(const ExprHandle& v) {
 // TODO: use a dedicated bind-var to make sure v is not evaluated multiple
 // times. Clamp the input expression to [-9, 9]

@@ -205,7 +205,7 @@ ExprHandle fast_sigmoid(const ExprHandle& x) {

 ExprHandle fast_log(const ExprHandle& v) {
 // this implementation is taken from sleef:
-// https://github.com/shibatch/sleef/blob/master/src/libm/sleefsp.c#L1131
+// https://github.com/shibatch/sleef/blob/master/src/libm/sleefsimdsp.c#L1277
 // to generate coefficients, this tool is provided
 // https://github.com/shibatch/sleef/blob/master/src/gencoef/gencoef.txt
 auto ilogb2kf = [](const ExprHandle& x) {

@@ -1,6 +1,6 @@
 /**
 * Cache utils in this file is adapted from PyTorch/XLA
-* https://github.com/pytorch/xla/blob/master/third_party/xla_client/cache.h
+* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/cache.h
 */

 #pragma once

@@ -1,6 +1,6 @@
 /**
 * This file is adapted from PyTorch/XLA
-* https://github.com/pytorch/xla/blob/master/third_party/xla_client/metrics.h
+* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/metrics.h
 */

 #pragma once

@@ -1,6 +1,6 @@
 /**
 * This file is adapted from PyTorch/XLA
-* https://github.com/pytorch/xla/blob/master/third_party/xla_client/multi_wait.h
+* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/multi_wait.h
 */

 #pragma once

@@ -1,6 +1,6 @@
 /**
 * This file is adapted from PyTorch/XLA
-* https://github.com/pytorch/xla/blob/master/third_party/xla_client/metrics.h
+* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/metrics.h
 */

 #pragma once

@@ -1,6 +1,6 @@
 /**
 * Unique in this file is adapted from PyTorch/XLA
-* https://github.com/pytorch/xla/blob/master/third_party/xla_client/unique.h
+* https://github.com/pytorch/xla/blob/e0e5f937a0ba8d904f9608137dc8c51ba439df2d/third_party/xla_client/unique.h
 */

 #pragma once
Some files were not shown because too many files have changed in this diff.