From 64faa043f7dbf148a1e7935726c0fcc98735e017 Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Sun, 13 Feb 2022 19:32:11 -0800 Subject: [PATCH 001/199] Mobile upgrader: add GELU test modules Test Plan: This is test code Reviewed By: tugsbayasgalan Differential Revision: D33858429 fbshipit-source-id: f5bc687ad81125f89cb281f9aa51dc5ac74d6954 (cherry picked from commit 202dff607e72d538c8b3fcd44382d11a401e52c5) --- test/jit/fixtures/test_versioned_gelu_out_v9.ptl | Bin 0 -> 3367 bytes test/jit/fixtures/test_versioned_gelu_v9.ptl | Bin 0 -> 3083 bytes test/jit/fixtures_srcs/fixtures_src.py | 15 +++++++++++++++ test/jit/fixtures_srcs/generate_models.py | 2 ++ 4 files changed, 17 insertions(+) create mode 100644 test/jit/fixtures/test_versioned_gelu_out_v9.ptl create mode 100644 test/jit/fixtures/test_versioned_gelu_v9.ptl diff --git a/test/jit/fixtures/test_versioned_gelu_out_v9.ptl b/test/jit/fixtures/test_versioned_gelu_out_v9.ptl new file mode 100644 index 0000000000000000000000000000000000000000..208ae5100757245d5935bef5b5b3c69ef5faef35 GIT binary patch literal 3367 zcmb_e2~ZPP7+%5=2p)hJwT?PrTENBx0WF8x0)lpeBt?WBk6Aa#CRwt{W-l%WK`lY8 zwWHS7YU@Fz%AnHLQf8|50OTM z8EIuPN855TUqY4)f2@?7X>f-Pw-WL+xq&Dtu_h)bDsj%7pfuuoLPygUrJlCR6>^Qq)*M^c1Ju$x%i0E~i6Cc>8HA&CBf7gHQ?hRq|^PT{Qsa zy#+TB7)wwD&OyDrh;GOF65XEgh~`NvfpI+R;u7p&O5j!?u!MwBagn`Ol?pjLBTt(# zD>*qiDJ3OkmO?Jr$TCde2q>ml$LY-!&B0ZrRxJ%J-xA}j4nPq2X`hIP82@-M;5@Fh zTd1duf3-x9VPXIBz|>e z913v^a7o%on~4cnhmHqpjN~w*Kv9?u1fvvZ4VW2#ijm|AIwnvF9k6dl;_20Z6MoPE zV?jbZnh{zg!(T!`u&6w?sGuz>V2jGf7N3>UUY?UV=V`L#(Y&&1J}qv0b;LR6xQW&U z$L~#Dp1w`-f;z5`y0N9PY5&x7uXi^!bs5485eq6bxB3_U*w`9NrfxQ0&UL2OR~)=@ z_e|yOWn%_x(&{p2U)6!=@W@g22WP&}kGzO4KJon|XYizroei7obO+)uUfX}=kB*H+ zSKDu2I<$SS=Fc3eB`G*m@m6~Z1w*5+R4&Yy43Z`}~D>^XGbA$xz-S8Wrz|N6Z$Jrx}|S_Ol#sC(x0 zE)a-U+-Q7!FqRU)D)#`cXaIMuMTs>HG{xp?_C)s_qgz&tnmp^m8YXeY=*|N>l>c5X?h#byQ$ASRn8hSDT+}t}uB(Wy zi@6n>LciC4ykkX7(Ql{!&bqZW!^{dxXHid7w@cGodQB~PdF>TEmO|5$hJ8woDwj7#x|LyGkM~>~S*=CWEwg$Q* z{p+I->fTt@^6@)!mwx%#l}l6Fe>i(h3d5AY>s#{x2vfG-q7jG8^dBZ0$Ky7hb5)KR z!*se=8dOW-#T2%x!iqp6L*V!18WxMh`cp?&&J!?=UE0J9N1JMiT8id^OFfxCoGPY# zd=*oI9s&y#gT$dDfJ_p2hZXcZsPh~X5Ql~U+M)tYes&%sQ3t3cwP-M)%*)qk^O-U zgUZY?p=bnH<2>kuX%z%Za~MTh2qp{#RD_YK18Py3hXKA3m~a$|1TO_<1klpq5~v4O zh{0w?=G8bv)nTvZm{IW#bR^(s2@}cjpzm^I1;LC~OB}qY3C@}XD*;f#M2Pmm7Rf}4 zQ;wjFOq4^MLt)GqbRLlOEP}y|b!lZFjJQC7HxEt;1T!wrQEQGw0qBu3X1v2ZMhr>; zVdH3)ncy(PePFLtGzauB5V&Y_q*(?y&F<|Q2VfCFtLhE9lL{gNzbAuMj=by-wC-0# zPG-_@@amO1;xtJHllj+Oro|8$@YAXT;Lr0W^nM*A!401Q+uT#W5Qkpi-F;(xghv2H z{9G{jyTX082jt|5pnidb@XFT;g;4E|8qh`u2xCYp7ejD`K3F9@(Id z_|?O!C=ZqD4cS;9sXhLsde_#WLcQT9_Yt}tR3bmBy0NcShlD8H=OzbE_eg+Q2*=?6 khzM*1iGt+{3~)}|YKO%Z44BngbucWm;EWW3{4U=A0L3aNx&QzG literal 0 HcmV?d00001 diff --git a/test/jit/fixtures/test_versioned_gelu_v9.ptl b/test/jit/fixtures/test_versioned_gelu_v9.ptl new file mode 100644 index 0000000000000000000000000000000000000000..5e4ffb20f823518a3730844f81fc515198e0b536 GIT binary patch literal 3083 zcmb_eeNa?Y6n`wRyMQ90sErD;z$g>!0|9|0G)+)hg_W;f<;?5t?%mx7yZiQitso;} z>LR8w+N3s3)F_tHSY~P2#Ps1ujfF<0;y)6^d}=9oHSS?9i8a2H)i%{z1F-E;4` zzjN=ozjMwr86%?zLaQZiYb23JNJx}uFA_wSb0aHlM^2gc7L%w+5*X4Wa8}uZ1e$f* zIDLc2xy$F0#54ETT67K$Y`7U0(iwDCGEq9aFdV=TzbagBNl=h9{A%rfh= z>I^OZ0qx?fcnjx|hSq37b|PA8_TYI>v*h61w3&4?LUV`)EZWQ#*ln;dHXE8jN}OPE zkTx^v;>@fQk+R1wFjlx2RA#4xnPx)3c&f?rGazz{Jm0Q9o>Rs7Sz(uyn{-lH?*&dkrx z&nqe_nwh1O-E0#riwMIsm_>`j$%$B6VKQp8&70FMw?z;H?xwJOND0peDy^U>>WC4k^C|JZ0DYh1L>W3IP9|&1r9Lp1#t5!^>Fo}Vf ztebV)`G|GYc#vsUq;0a(Nt@v%mN9~rb^xPzRi$j^BlY<9d}>BVD`?`5Z$A#A;r4J? 
zL}q*HMyJxnf?QeXmC(bBP=S|OVXJXqIR%6*ZO;G0r0 z;lRzv&UpHuzuqA zsjHq}c-VF1%%YbDOJXN)dSkEQ$CIaz9bDZzc-ZoB*ZGq1e?M1nRr8a#`+Dle(^kz5 zX4ac|dAmDBq}dyH>KIMrf>dtrIXZ` zE8oG6jha@l_yjnXxNA736Di>x>lP)(Es6e^oOHLG+@{J@qQFJQJOzb-z&(5pNFyE# zXVly*Ash*Qo73)QGq^&usf-$GAs8JFpnJ&z`a5I+JtSZp;5;#%8U=tXI6;st5+q5H zkC;xWVXmgOs=BO_R~aLWs&*}9IeEG#y^<{jNQH5m{$W* zn8(FR?gQ z;}fVkfb_q;$v@&hk`>t)*$F=Zj8jH@Vx_O$5l;azi6(xm&yk>bwHCQWPT&lR^AQH z$csY2;(@ziz#gc2jkFw`vg$)DmqScFOv|_O|43H`XR!2;GrSIg9`+1Dsd6M@FhSEo z5XV6>34<7%2S)-1(=0gzupRz5!T<(XZZfLXS%E#tk=et$ajfAs^xdkE7s@iuPS9YV Yva~zSFXPxX8KZGFggqJp>-%;811cG#$N&HU literal 0 HcmV?d00001 diff --git a/test/jit/fixtures_srcs/fixtures_src.py b/test/jit/fixtures_srcs/fixtures_src.py index 545152b6a3a..dff23702311 100644 --- a/test/jit/fixtures_srcs/fixtures_src.py +++ b/test/jit/fixtures_srcs/fixtures_src.py @@ -42,3 +42,18 @@ class TestVersionedLogspaceOutV8(torch.nn.Module): def forward(self, a: Union[int, float, complex], b: Union[int, float, complex], out: torch.Tensor): return torch.logspace(a, b, out=out) + +class TestVersionedGeluV9(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + return torch._C._nn.gelu(x) + +class TestVersionedGeluOutV9(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + out = torch.zeros_like(x) + return torch._C._nn.gelu(x, out=out) diff --git a/test/jit/fixtures_srcs/generate_models.py b/test/jit/fixtures_srcs/generate_models.py index 36b6b5ffe68..980e7dd0324 100644 --- a/test/jit/fixtures_srcs/generate_models.py +++ b/test/jit/fixtures_srcs/generate_models.py @@ -94,6 +94,8 @@ ALL_MODULES = { TestVersionedLinspaceOutV7(): "aten::linspace.out", TestVersionedLogspaceV8(): "aten::logspace", TestVersionedLogspaceOutV8(): "aten::logspace.out", + TestVersionedGeluV9(): "aten::gelu", + TestVersionedGeluOutV9(): "aten::gelu.out", } """ From 4f8b986e28736b59bc46cd0873a0f36fdaa6f5b8 Mon Sep 17 00:00:00 2001 From: Ryan Spring Date: Sun, 13 Feb 2022 19:32:11 -0800 Subject: [PATCH 002/199] Implement Tanh Gelu Approximation (#61439) Summary: 1. Implements https://github.com/pytorch/pytorch/issues/39853 2. Adds approximate boolean flag to Gelu 3. Enables Tanh Gelu approximation 4. Adds double backward support for Gelu 5. 
Enable Tanh Gelu in NvFuser ``` def gelu(x, approximate : str = 'none'): if approximate == 'tanh': # sqrt(2/pi) = 0.7978845608028654 return 0.5 * x * (1.0 + torch.tanh(0.7978845608028654 * (x + 0.044715 * torch.pow(x, 3.0)))) else: return x * normcdf(x) ``` Linking XLA PR - https://github.com/pytorch/xla/pull/3039 Pull Request resolved: https://github.com/pytorch/pytorch/pull/61439 Reviewed By: VitalyFedyunin Differential Revision: D33894937 Pulled By: jbschlosser fbshipit-source-id: b65e8fb6ea66168af8f34f45ed50e92737a33851 (cherry picked from commit 6e986f91a958dd73514b4e64984c0b149157dc6f) --- aten/src/ATen/autocast_mode.cpp | 2 +- aten/src/ATen/native/Activation.cpp | 22 ++- aten/src/ATen/native/Activation.h | 23 ++- aten/src/ATen/native/cpu/Activation.cpp | 173 +++++++++++++----- aten/src/ATen/native/cuda/Activation.cpp | 12 +- aten/src/ATen/native/cuda/Activation.cu | 79 ++++++-- aten/src/ATen/native/cuda/Activation.h | 5 +- aten/src/ATen/native/mkldnn/Gelu.cpp | 14 +- aten/src/ATen/native/native_functions.yaml | 8 +- .../cpu/kernels/QuantizedOpKernels.cpp | 106 +++++++---- aten/src/ATen/native/quantized/cpu/qgelu.cpp | 5 +- .../ATen/native/quantized/cpu/quantized_ops.h | 3 +- caffe2/serialize/versions.h | 8 +- test/cpp/api/functional.cpp | 11 +- test/cpp/api/modules.cpp | 12 +- .../check_forward_backward_compatibility.py | 8 +- test/jit/test_autodiff_subgraph_slicing.py | 4 +- test/onnx/test_custom_ops.py | 6 +- test/onnx/test_pytorch_onnx_caffe2.py | 12 +- test/onnx/test_pytorch_onnx_onnxruntime.py | 11 +- test/onnx/test_utility_funs.py | 8 +- test/quantization/core/test_quantized_op.py | 7 +- test/test_jit_cuda_fuser.py | 9 +- test/test_jit_fuser_te.py | 33 +++- test/test_nn.py | 42 ----- tools/autograd/derivatives.yaml | 9 +- .../include/torch/nn/functional/activation.h | 12 +- .../api/include/torch/nn/modules/activation.h | 5 + .../api/include/torch/nn/options/activation.h | 28 +++ torch/csrc/api/src/nn/modules/activation.cpp | 4 +- torch/csrc/autograd/FunctionsManual.cpp | 42 +++++ torch/csrc/autograd/FunctionsManual.h | 5 + torch/csrc/jit/codegen/cuda/graph_fuser.cpp | 4 + torch/csrc/jit/codegen/cuda/parser.cpp | 107 ++++++++++- torch/csrc/jit/mobile/upgrader_mobile.cpp | 55 +++++- .../operator_upgraders/upgraders_entry.cpp | 35 ++-- .../jit/operator_upgraders/version_map.cpp | 7 +- torch/csrc/jit/passes/shape_analysis.cpp | 2 +- torch/csrc/jit/runtime/symbolic_script.cpp | 12 +- .../runtime/symbolic_shape_registry_util.cpp | 2 +- torch/csrc/jit/tensorexpr/kernel.cpp | 2 + torch/csrc/jit/tensorexpr/lowerings.cpp | 48 +++-- torch/csrc/jit/tensorexpr/lowerings.h | 1 + torch/nn/functional.py | 9 +- torch/nn/functional.pyi.in | 2 +- torch/nn/modules/activation.py | 19 +- torch/onnx/symbolic_opset9.py | 25 ++- torch/overrides.py | 2 +- .../testing/_internal/autocast_test_lists.py | 3 +- .../_internal/common_methods_invocations.py | 28 ++- torch/testing/_internal/common_nn.py | 4 + 51 files changed, 825 insertions(+), 270 deletions(-) diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 7bdab046419..bd9da6a4593 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -485,7 +485,7 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(ADD_NS(avg_pool1d), "avg_pool1d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool), fp32) KERNEL_CPU(ADD_NS(avg_pool2d), "avg_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32) KERNEL_CPU(ADD_NS(avg_pool3d), 
"avg_pool3d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32) - KERNEL_CPU(ADD_NS(gelu), "gelu", Tensor (const Tensor &), fp32) + KERNEL_CPU(ADD_NS(gelu), "gelu", Tensor (const Tensor &, c10::string_view), fp32) KERNEL_CPU(ADD_NS(upsample_nearest1d), "upsample_nearest1d", Tensor (const Tensor &, IntArrayRef, c10::optional), fp32) KERNEL_CPU(ADD_NS(upsample_nearest1d), "upsample_nearest1d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32) KERNEL_CPU(ADD_NS(_upsample_nearest_exact1d), "_upsample_nearest_exact1d", Tensor (const Tensor &, IntArrayRef, c10::optional), fp32) diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index ff79939830c..424dacf124e 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -164,12 +164,12 @@ TORCH_META_FUNC(softshrink_backward) ( build_borrowing_binary_op(maybe_get_output(), grad, self); } -TORCH_META_FUNC(gelu) (const Tensor & self) { +TORCH_META_FUNC(gelu) (const Tensor & self, c10::string_view approximate) { build_unary_op(maybe_get_output(), self); } TORCH_META_FUNC(gelu_backward) ( - const Tensor& grad, const Tensor& self + const Tensor& grad, const Tensor& self, c10::string_view approximate ) { build_borrowing_binary_op(maybe_get_output(), grad, self); } @@ -324,37 +324,39 @@ bool use_mkldnn(const Tensor& input) { } TORCH_IMPL_FUNC(gelu_out_cpu) ( - const Tensor& self, const Tensor& result + const Tensor& self, c10::string_view approximate, const Tensor& result ) { +auto approximate_type = get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() - if (use_mkldnn(self)) { + if (use_mkldnn(self) && (approximate_type == GeluType::None)) { const ideep::tensor& x = itensor_from_tensor(self); ideep::tensor y = itensor_from_tensor(result); ideep::eltwise_forward::compute( x, y, ideep::algorithm::eltwise_gelu_erf, ideep::prop_kind::forward_training, /*alpha*/ 0.0); } else { - GeluKernel(kCPU, *this); + GeluKernel(kCPU, *this, approximate_type); } #else - GeluKernel(kCPU, *this); + GeluKernel(kCPU, *this, approximate_type); #endif } TORCH_IMPL_FUNC(gelu_backward_out_cpu) ( - const Tensor& grad, const Tensor& self, const Tensor& grad_input + const Tensor& grad, const Tensor& self, c10::string_view approximate, const Tensor& grad_input ) { +auto approximate_type = get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() - if (use_mkldnn(self)) { + if (use_mkldnn(self) && (approximate_type == GeluType::None)) { const ideep::tensor& x = itensor_from_tensor(self); ideep::tensor grady = itensor_from_tensor(grad); ideep::tensor gradx = itensor_from_tensor(grad_input); ideep::eltwise_backward::compute(x, grady, gradx, ideep::algorithm::eltwise_gelu_erf, /*alpha*/ 0.0); } else { - GeluBackwardKernel(kCPU, *this); + GeluBackwardKernel(kCPU, *this, approximate_type); } #else - GeluBackwardKernel(kCPU, *this); + GeluBackwardKernel(kCPU, *this, approximate_type); #endif } diff --git a/aten/src/ATen/native/Activation.h b/aten/src/ATen/native/Activation.h index 963dc4665fd..6eb8182737b 100644 --- a/aten/src/ATen/native/Activation.h +++ b/aten/src/ATen/native/Activation.h @@ -14,6 +14,23 @@ class TensorBase; namespace at { namespace native { +// These constants control the approximation behavior of gelu function. 
+enum GeluType {
+  None,             // Baseline Gelu
+  Tanh,             // Tanh Gelu Approximation
+  END
+};
+
+static GeluType get_gelutype_enum(const c10::string_view approximate) {
+  if (approximate == "none") {
+    return GeluType::None;
+  } else if (approximate == "tanh") {
+    return GeluType::Tanh;
+  } else {
+    TORCH_CHECK(false, "approximate argument must be either none or tanh.");
+  }
+}
+
 using structured_activation_fn = void (*)(TensorIteratorBase&);
 using structured_activation_backward_fn = void (*)(TensorIteratorBase&);
@@ -35,6 +52,8 @@ using elu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const
 using leaky_relu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
 using leaky_relu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
 using log_sigmoid_cpu_fn = void (*)(TensorBase&, TensorBase&, const TensorBase&);
+using gelu_fn = void (*)(TensorIteratorBase&, GeluType);
+using gelu_backward_fn = void (*)(TensorIteratorBase&, GeluType);
 
 DECLARE_DISPATCH(elu_fn, elu_stub);
 DECLARE_DISPATCH(elu_backward_fn, elu_backward_stub);
@@ -43,8 +62,8 @@ DECLARE_DISPATCH(softplus_backward_fn, softplus_backward_stub);
 DECLARE_DISPATCH(log_sigmoid_cpu_fn, log_sigmoid_cpu_stub);
 DECLARE_DISPATCH(activation_backward_fn, log_sigmoid_backward_stub);
 DECLARE_DISPATCH(threshold_fn, threshold_stub);
-DECLARE_DISPATCH(structured_activation_fn, GeluKernel);
-DECLARE_DISPATCH(structured_activation_backward_fn, GeluBackwardKernel);
+DECLARE_DISPATCH(gelu_fn, GeluKernel);
+DECLARE_DISPATCH(gelu_backward_fn, GeluBackwardKernel);
 DECLARE_DISPATCH(hardtanh_backward_fn, hardtanh_backward_stub);
 DECLARE_DISPATCH(hardsigmoid_fn, hardsigmoid_stub);
 DECLARE_DISPATCH(hardsigmoid_backward_fn, hardsigmoid_backward_stub);
diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp
index b192d0c4d70..1eebcde30c9 100644
--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@@ -166,7 +166,7 @@ void elu_backward_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scal
 // TODO(yangxm): Add another fast kernel using formula
 // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
 // and the fast tanh impl from Eigen.
-void GeluKernelImpl(TensorIteratorBase& it) {
+void GeluKernelImpl(TensorIteratorBase& it, GeluType approximate) {
   auto grain_size = at::internal::GRAIN_SIZE;
   // Numbers based on benchmarking.
// Benchmark: benchmarks/operator_benchmarks/pt/gelu_test.py @@ -187,53 +187,134 @@ void GeluKernelImpl(TensorIteratorBase& it) { if (it.numel() > GELU_MIN_ELEMENTS_FOR_MULTI_THREADING) { grain_size = it.numel() / at::get_num_threads(); } - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, it.dtype(), "GeluKernelImpl", [&]() { - using Vec = vec::Vectorized; - const Vec kAlphaVec(scalar_t(M_SQRT1_2)); - const Vec kOneVec(scalar_t(1)); - const Vec kPointFiveVec(scalar_t(0.5)); - cpu_kernel_vec( - it, - [](scalar_t x) { - const scalar_t kAlpha = scalar_t(M_SQRT1_2); - return x * scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); - }, - [&](Vec x_vec) { - return x_vec * kPointFiveVec * - (kOneVec + (x_vec * kAlphaVec).erf()); - }, - grain_size); - }); + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kBetaVec(scalar_t(M_SQRT2 * M_2_SQRTPI * 0.5)); + const Vec kKappaVec(scalar_t(0.044715)); + const Vec kOneVec(scalar_t(1)); + const Vec kPointFiveVec(scalar_t(0.5)); + cpu_kernel_vec( + it, + [](scalar_t x) { + const scalar_t kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + const scalar_t kKappa = 0.044715; + auto x_cube = x * x * x; + auto inner = kBeta * (x + kKappa * x_cube); + return scalar_t(0.5) * x * (scalar_t(1) + std::tanh(inner)); + }, + [&](Vec x_vec) { + auto x_cube = x_vec * x_vec * x_vec; + auto inner_vec = kBetaVec * (x_vec + kKappaVec * x_cube); + return kPointFiveVec * x_vec * (kOneVec + inner_vec.tanh()); + }, + grain_size); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kAlphaVec(scalar_t(M_SQRT1_2)); + const Vec kOneVec(scalar_t(1)); + const Vec kPointFiveVec(scalar_t(0.5)); + cpu_kernel_vec( + it, + [](scalar_t x) { + const scalar_t kAlpha = scalar_t(M_SQRT1_2); + return x * scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); + }, + [&](Vec x_vec) { + return x_vec * kPointFiveVec * + (kOneVec + (x_vec * kAlphaVec).erf()); + }, + grain_size); + }); + } } -void GeluBackwardKernelImpl(TensorIteratorBase& it) { - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, it.dtype(), "GeluBackwardKernelImpl", [&]() { - using Vec = vec::Vectorized; - const Vec kAlphaVec(scalar_t(M_SQRT1_2)); - const Vec kBetaVec(scalar_t(M_2_SQRTPI * M_SQRT1_2 * 0.5)); - const Vec kOneVec(scalar_t(1)); - const Vec kPointFiveVec(scalar_t(0.5)); - const Vec kMinusPointFiveVec(scalar_t(-0.5)); - cpu_kernel_vec( - it, - [](scalar_t dy, scalar_t x) { - const scalar_t kAlpha = scalar_t(M_SQRT1_2); - const scalar_t kBeta = M_2_SQRTPI * M_SQRT1_2 * scalar_t(0.5); - const scalar_t cdf = - scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); - const scalar_t pdf = kBeta * std::exp(x * x * scalar_t(-0.5)); - return dy * (cdf + x * pdf); - }, - [&](Vec dy_vec, Vec x_vec) { - const Vec cdf_vec = - kPointFiveVec * (kOneVec + (x_vec * kAlphaVec).erf()); - const Vec pdf_vec = - kBetaVec * (x_vec * x_vec * kMinusPointFiveVec).exp(); - return dy_vec * (cdf_vec + x_vec * pdf_vec); - }); - }); +void GeluBackwardKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluBackwardKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kBetaVec(scalar_t(M_SQRT2 * M_2_SQRTPI * 0.5)); + const Vec kKappaVec(scalar_t(0.044715)); + const Vec kOneVec(scalar_t(1)); + const Vec 
kThreeVec(scalar_t(3)); + const Vec kPointFiveVec(scalar_t(0.5)); + cpu_kernel_vec( + it, + [](scalar_t dy, scalar_t x) { + const scalar_t kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + const scalar_t kKappa = 0.044715; + auto x_sq = x * x; + auto x_cube = x_sq * x; + auto inner = kBeta * (x + kKappa * x_cube); + auto tanh_inner = std::tanh(inner); + + auto left = scalar_t(0.5) * x; + auto right = scalar_t(1) + tanh_inner; + + auto left_derivative = scalar_t(0.5) * right; + + auto tanh_derivative = scalar_t(1) - tanh_inner * tanh_inner; + auto inner_derivative = + kBeta * (scalar_t(1) + scalar_t(3) * kKappa * x_sq); + auto right_derivative = left * tanh_derivative * inner_derivative; + + return dy * (left_derivative + right_derivative); + }, + [&](Vec dy_vec, Vec x_vec) { + auto x_sq = x_vec * x_vec; + auto x_cube = x_vec * x_vec * x_vec; + auto inner_vec = + kBetaVec * (x_vec + kKappaVec * x_cube); + auto tanh_inner_vec = inner_vec.tanh(); + + auto left_vec = kPointFiveVec * x_vec; + auto right_vec = kOneVec + tanh_inner_vec; + + auto left_derivative_vec = kPointFiveVec * right_vec; + + auto tanh_derivative_vec = + kOneVec - tanh_inner_vec * tanh_inner_vec; + auto inner_derivative_vec = + kBetaVec * (kOneVec + kThreeVec * kKappaVec * x_sq); + auto right_derivative_vec = + left_vec * tanh_derivative_vec * inner_derivative_vec; + + return dy_vec * (left_derivative_vec + right_derivative_vec); + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND( + ScalarType::BFloat16, it.dtype(), "GeluBackwardKernelImpl", [&]() { + using Vec = vec::Vectorized; + const Vec kAlphaVec(scalar_t(M_SQRT1_2)); + const Vec kBetaVec(scalar_t(M_2_SQRTPI * M_SQRT1_2 * 0.5)); + const Vec kOneVec(scalar_t(1)); + const Vec kPointFiveVec(scalar_t(0.5)); + const Vec kMinusPointFiveVec(scalar_t(-0.5)); + cpu_kernel_vec( + it, + [](scalar_t dy, scalar_t x) { + const scalar_t kAlpha = scalar_t(M_SQRT1_2); + const scalar_t kBeta = M_2_SQRTPI * M_SQRT1_2 * scalar_t(0.5); + const scalar_t cdf = + scalar_t(0.5) * (scalar_t(1) + std::erf(x * kAlpha)); + const scalar_t pdf = kBeta * std::exp(x * x * scalar_t(-0.5)); + return dy * (cdf + x * pdf); + }, + [&](Vec dy_vec, Vec x_vec) { + const Vec cdf_vec = + kPointFiveVec * (kOneVec + (x_vec * kAlphaVec).erf()); + const Vec pdf_vec = + kBetaVec * (x_vec * x_vec * kMinusPointFiveVec).exp(); + return dy_vec * (cdf_vec + x_vec * pdf_vec); + }); + }); + } } void hardsigmoid_kernel(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/Activation.cpp b/aten/src/ATen/native/cuda/Activation.cpp index 2dfe0a862ea..23e8bc697f7 100644 --- a/aten/src/ATen/native/cuda/Activation.cpp +++ b/aten/src/ATen/native/cuda/Activation.cpp @@ -156,15 +156,15 @@ std::tuple prelu_backward_cuda(const Tensor& grad_out_, const Te } TORCH_IMPL_FUNC(gelu_out_cuda) ( - const Tensor& /*self*/, const Tensor& /*result*/ - ) { - GeluCUDAKernelImpl(*this); + const Tensor& /*self*/, c10::string_view approximate, const Tensor& /*result*/ +) { + GeluCUDAKernelImpl(*this, get_gelutype_enum(approximate)); } TORCH_IMPL_FUNC(gelu_backward_out_cuda) ( - const Tensor& /*grad*/, const Tensor& /*self*/, const Tensor& /*grad_input*/ - ) { - GeluBackwardCUDAKernelImpl(*this); + const Tensor& /*grad*/, const Tensor& /*self*/, c10::string_view approximate, const Tensor& /*grad_input*/ +) { + GeluBackwardCUDAKernelImpl(*this, get_gelutype_enum(approximate)); } }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Activation.cu b/aten/src/ATen/native/cuda/Activation.cu index 168e142dd29..e3acad92f90 100644 --- 
a/aten/src/ATen/native/cuda/Activation.cu +++ b/aten/src/ATen/native/cuda/Activation.cu @@ -392,30 +392,71 @@ void elu_backward_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Sc }); } -void GeluCUDAKernelImpl(TensorIteratorBase& it) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { - using T_ACC = acc_type; - gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { - return static_cast(x) * - c10::cuda::compat::normcdf(static_cast(x)); +void GeluCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5); + constexpr opmath_t kKappa = 0.044715; + auto x_cube = static_cast(x) * static_cast(x) * static_cast(x); + auto inner = kBeta * (static_cast(x) + kKappa * x_cube); + return opmath_t(0.5) * static_cast(x) * (opmath_t(1) + c10::cuda::compat::tanh(inner)); + }); }); - }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, it.dtype(), "GeluCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kAlpha = M_SQRT1_2; + return static_cast(x) * opmath_t(0.5) * (opmath_t(1) + ::erf(static_cast(x) * kAlpha)); + }); + }); + } } -void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, - it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { - using T_ACC = acc_type; - gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { - constexpr T_ACC kBeta = M_2_SQRTPI * M_SQRT1_2 * T_ACC(0.5); - const T_ACC cdf = c10::cuda::compat::normcdf(static_cast(x)); - const T_ACC pdf = - c10::cuda::compat::exp( - T_ACC(-0.5) * static_cast(x) * static_cast(x)) * - kBeta; - return static_cast(dy) * (cdf + static_cast(x) * pdf); +void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate) { + if (approximate == GeluType::Tanh) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_SQRT2 * M_2_SQRTPI * opmath_t(0.5); + constexpr opmath_t kKappa = 0.044715; + auto x_sq = static_cast(x) * static_cast(x); + auto x_cube = x_sq * static_cast(x); + auto inner = kBeta * (static_cast(x) + kKappa * x_cube); + auto tanh_inner = c10::cuda::compat::tanh(inner); + + auto left = opmath_t(0.5) * static_cast(x); + auto right = opmath_t(1) + tanh_inner; + + auto left_derivative = 0.5 * right; + + auto tanh_derivative = opmath_t(1) - tanh_inner * tanh_inner; + auto inner_derivative = kBeta * (opmath_t(1) + opmath_t(3) * kKappa * x_sq); + auto right_derivative = left * tanh_derivative * inner_derivative; + + return static_cast(dy) * (left_derivative + right_derivative); }); }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, + it.dtype(), "GeluBackwardCUDAKernelImpl", [&]() { + gpu_kernel(it, [] GPU_LAMBDA(scalar_t dy, scalar_t x) -> scalar_t { + using opmath_t = at::opmath_type; + constexpr opmath_t kBeta = M_2_SQRTPI * M_SQRT1_2 * opmath_t(0.5); + 
constexpr opmath_t kAlpha = M_SQRT1_2; + const opmath_t cdf = + opmath_t(0.5) * (opmath_t(1) + ::erf(static_cast(x) * kAlpha)); + const opmath_t pdf = + c10::cuda::compat::exp( + opmath_t(-0.5) * static_cast(x) * static_cast(x)) * + kBeta; + return static_cast(dy) * (cdf + static_cast(x) * pdf); + }); + }); + } } namespace { diff --git a/aten/src/ATen/native/cuda/Activation.h b/aten/src/ATen/native/cuda/Activation.h index 5e798316c9b..ca0ad3828da 100644 --- a/aten/src/ATen/native/cuda/Activation.h +++ b/aten/src/ATen/native/cuda/Activation.h @@ -1,4 +1,5 @@ +#include #include namespace at { @@ -24,7 +25,7 @@ void launch_prelu_cuda_backward_kernel_multi_weights( const TensorBase &input, const TensorBase &weight, const TensorBase &grad_out, const TensorBase &input_grad, const TensorBase &weight_grad_collector); -void GeluCUDAKernelImpl(TensorIteratorBase& it); -void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it); +void GeluCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate); +void GeluBackwardCUDAKernelImpl(TensorIteratorBase& it, GeluType approximate); }} // namespace at::native diff --git a/aten/src/ATen/native/mkldnn/Gelu.cpp b/aten/src/ATen/native/mkldnn/Gelu.cpp index fa78cd1c3a9..1d2a6725151 100644 --- a/aten/src/ATen/native/mkldnn/Gelu.cpp +++ b/aten/src/ATen/native/mkldnn/Gelu.cpp @@ -1,17 +1,17 @@ #include #include #include - +#include #if !AT_MKLDNN_ENABLED() namespace at { namespace native { -Tensor mkldnn_gelu(const Tensor& input) { +Tensor mkldnn_gelu(const Tensor& input, c10::string_view approximate) { TORCH_CHECK(false, "mkldnn_gelu: ATen not compiled with MKLDNN support"); } -Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input) { +Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input, c10::string_view approximate) { TORCH_CHECK(false, "mkldnn_gelu_backward: ATen not compiled with MKLDNN support"); } @@ -24,11 +24,13 @@ Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input) { namespace at { namespace native { -Tensor mkldnn_gelu(const Tensor& input) { +Tensor mkldnn_gelu(const Tensor& input, c10::string_view approximate) { if (input.scalar_type() == ScalarType::BFloat16) { TORCH_CHECK(mkldnn_bf16_device_check(), "mkldnn_gelu: bf16 path needs the cpu support avx512bw, avx512vl and avx512dq"); } + TORCH_CHECK(get_gelutype_enum(approximate) == GeluType::None, + "mkldnn_gelu: fast, approximate gelu is not supported"); const ideep::tensor& x = itensor_from_tensor(input); ideep::tensor y; ideep::eltwise_forward::compute( @@ -37,7 +39,9 @@ Tensor mkldnn_gelu(const Tensor& input) { input.options().device_opt()); } -Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input) { +Tensor mkldnn_gelu_backward(const Tensor& grad_output, const Tensor& input, c10::string_view approximate) { + TORCH_CHECK(get_gelutype_enum(approximate) == GeluType::None, + "mkldnn_gelu_backward: fast, approximate gelu is not supported"); const ideep::tensor& x = itensor_from_tensor(input); ideep::tensor grady = itensor_from_tensor(grad_output); ideep::tensor gradx; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8c333efd3bf..93c9ab24c79 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3736,7 +3736,7 @@ CPU: prelu_backward_cpu CUDA: prelu_backward_cuda -- func: gelu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +- func: gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!) 
structured: True structured_inherits: TensorIteratorBase device_check: NoCheck # TensorIterator @@ -3745,7 +3745,7 @@ CPU: gelu_out_cpu CUDA: gelu_out_cuda -- func: gelu(Tensor self) -> Tensor +- func: gelu(Tensor self, *, str approximate='none') -> Tensor structured_delegate: gelu.out device_check: NoCheck # TensorIterator python_module: nn @@ -3753,7 +3753,7 @@ MkldnnCPU: mkldnn_gelu QuantizedCPU: gelu_quantized_cpu -- func: gelu_backward.grad_input(Tensor grad, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +- func: gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) structured: True structured_inherits: TensorIteratorBase python_module: nn @@ -3761,7 +3761,7 @@ CPU: gelu_backward_out_cpu CUDA: gelu_backward_out_cuda -- func: gelu_backward(Tensor grad, Tensor self) -> Tensor +- func: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor structured_delegate: gelu_backward.grad_input python_module: nn dispatch: diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 23afea3e52c..77c9756e366 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -615,7 +616,7 @@ static void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx, }); } -void qgelu_kernel(const Tensor& qx, Tensor& qy) { +void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) { int64_t zero_point = qx.q_zero_point(); // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) float scale = qx.q_scale(); @@ -626,40 +627,83 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy) { float output_scale = scale; float inv_output_scale = 1.0 / output_scale; const auto kAlphaVec = Vectorized(M_SQRT1_2); + const auto kBetaVec = Vectorized(M_SQRT2 * M_2_SQRTPI * 0.5); + const auto kKappaVec = Vectorized(0.044715); const auto kOneVec = Vectorized(1); const auto kPointFiveVec = Vectorized(0.5); - AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qgelu", [&]() { - qy = at::_empty_affine_quantized( - qx.sizes(), - // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) - at::device(kCPU).dtype(SCALAR_TYPE).memory_format(qx.suggest_memory_format()), - output_scale, - output_zero_point, - c10::nullopt); - auto iter = TensorIterator::unary_op(qy, qx); + if (approximate == GeluType::Tanh) { + AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qgelu", [&]() { + qy = at::_empty_affine_quantized( + qx.sizes(), + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + at::device(kCPU).dtype(SCALAR_TYPE).memory_format(qx.suggest_memory_format()), + output_scale, + output_zero_point, + c10::nullopt); + auto iter = TensorIterator::unary_op(qy, qx); - using Vec = Vectorized; - cpu_kernel_vec( - iter, - [&](scalar_t value_qx) -> scalar_t { - const auto value_dx = - at::native::dequantize_val(scale, zero_point, value_qx); - const auto value_dy = - value_dx * 0.5 * (1 + std::erf(value_dx * M_SQRT1_2)); - return at::native::quantize_val( - output_scale, output_zero_point, value_dy); - }, - [&](Vec value_qx) -> Vec { - auto value_dx = value_qx.dequantize( - scale_vec, zero_point_vec, scale_neg_zp_premul_vec); - for (auto & value : value_dx) { - value = value * kPointFiveVec * (kOneVec + (value * kAlphaVec).erf()); - } - return Vec::quantize( - value_dx, 
output_scale, output_zero_point, inv_output_scale); - }); - }); + using Vec = Vectorized; + cpu_kernel_vec( + iter, + [&](scalar_t value_qx) -> scalar_t { + const auto value_dx = + at::native::dequantize_val(scale, zero_point, value_qx); + + const auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + const auto kKappa = 0.044715; + const auto x_cube = value_dx * value_dx * value_dx; + const auto inner = kBeta * (value_dx + kKappa * x_cube); + const auto value_dy = 0.5 * value_dx * (1.0 + std::tanh(inner)); + + return at::native::quantize_val( + output_scale, output_zero_point, value_dy); + }, + [&](Vec value_qx) -> Vec { + auto value_dx = value_qx.dequantize( + scale_vec, zero_point_vec, scale_neg_zp_premul_vec); + for (auto & value : value_dx) { + auto value_cube = value * value * value; + auto inner = kBetaVec * (value + kKappaVec * value_cube); + value = kPointFiveVec * value * (kOneVec + inner.tanh()); + } + return Vec::quantize( + value_dx, output_scale, output_zero_point, inv_output_scale); + }); + }); + } else { + AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qgelu", [&]() { + qy = at::_empty_affine_quantized( + qx.sizes(), + // NOLINTNEXTLINE(clang-analyzer-core.CallAndMessage) + at::device(kCPU).dtype(SCALAR_TYPE).memory_format(qx.suggest_memory_format()), + output_scale, + output_zero_point, + c10::nullopt); + auto iter = TensorIterator::unary_op(qy, qx); + + using Vec = Vectorized; + cpu_kernel_vec( + iter, + [&](scalar_t value_qx) -> scalar_t { + const auto value_dx = + at::native::dequantize_val(scale, zero_point, value_qx); + const auto value_dy = + value_dx * 0.5 * (1 + std::erf(value_dx * M_SQRT1_2)); + return at::native::quantize_val( + output_scale, output_zero_point, value_dy); + }, + [&](Vec value_qx) -> Vec { + auto value_dx = value_qx.dequantize( + scale_vec, zero_point_vec, scale_neg_zp_premul_vec); + for (auto & value : value_dx) { + value = value * kPointFiveVec * (kOneVec + (value * kAlphaVec).erf()); + } + return Vec::quantize( + value_dx, output_scale, output_zero_point, inv_output_scale); + }); + }); + } } diff --git a/aten/src/ATen/native/quantized/cpu/qgelu.cpp b/aten/src/ATen/native/quantized/cpu/qgelu.cpp index 7c0ee3cd784..c07796f608d 100644 --- a/aten/src/ATen/native/quantized/cpu/qgelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qgelu.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -15,9 +16,9 @@ namespace native { DEFINE_DISPATCH(qgelu_stub); -Tensor gelu_quantized_cpu(const Tensor& qx) { +Tensor gelu_quantized_cpu(const Tensor& qx, c10::string_view approximate) { Tensor qy; - qgelu_stub(qx.device().type(), qx, qy); + qgelu_stub(qx.device().type(), qx, qy, get_gelutype_enum(approximate)); return qy; } }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/quantized_ops.h b/aten/src/ATen/native/quantized/cpu/quantized_ops.h index a1766380fe5..bfa1f1f7756 100644 --- a/aten/src/ATen/native/quantized/cpu/quantized_ops.h +++ b/aten/src/ATen/native/quantized/cpu/quantized_ops.h @@ -1,4 +1,5 @@ #include +#include #include #include @@ -8,7 +9,7 @@ namespace native { using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, const Scalar& /*negval_*/); -using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */); using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t 
output_zero_point); using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); using qclamp_fn = void (*)( diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h index 9e89fe9acd6..fa18e46b2c6 100644 --- a/caffe2/serialize/versions.h +++ b/caffe2/serialize/versions.h @@ -12,7 +12,7 @@ namespace serialize { constexpr uint64_t kMinSupportedFileFormatVersion = 0x1L; #if ENABLE_UPGRADERS -constexpr uint64_t kMaxSupportedFileFormatVersion = 0x9L; +constexpr uint64_t kMaxSupportedFileFormatVersion = 0xAL; #else constexpr uint64_t kMaxSupportedFileFormatVersion = 0x6L; #endif @@ -79,7 +79,11 @@ constexpr uint64_t kMaxSupportedFileFormatVersion = 0x6L; // Bump the version number to 9 to update aten::logspace and // and aten::logspace.out to error out when steps is not // provided. (see: https://github.com/pytorch/pytorch/issues/55951) -constexpr uint64_t kProducedFileFormatVersion = 0x9L; +// 3) [02/11/2022] +// Bump the version number to 10 to update aten::gelu and +// and aten::gelu.out to support the new approximate kwarg. +// (see: https://github.com/pytorch/pytorch/pull/61439) +constexpr uint64_t kProducedFileFormatVersion = 0xAL; #else constexpr uint64_t kProducedFileFormatVersion = 0x3L; #endif diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index db0f4d25168..1c2a042a471 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -973,10 +973,17 @@ TEST_F(FunctionalTest, GLU) { } TEST_F(FunctionalTest, GELU) { - GELU model; const auto x = torch::linspace(-3.0, 3.0, 100); const auto y_exp = x * 0.5 * (1.0 + torch::erf(x / std::sqrt(2.0))); - const auto y = F::gelu(x); + const auto y = F::gelu(x, F::GELUFuncOptions().approximate("none")); + ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); +} + +TEST_F(FunctionalTest, TanhGELU) { + const auto x = torch::linspace(-3.0, 3.0, 100); + const auto inner = std::sqrt(2 / M_PI) * (x + 0.044715 * x.pow(3.0)); + const auto y_exp = 0.5 * x * (1.0 + inner.tanh()); + const auto y = F::gelu(x, F::GELUFuncOptions().approximate("tanh")); ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); } diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 8632f3e195c..cdf4f0ea0de 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -2860,13 +2860,23 @@ TEST_F(ModulesTest, GLU) { } TEST_F(ModulesTest, GELU) { - GELU model; + GELU model(GELUOptions().approximate("none")); const auto x = torch::linspace(-3.0, 3.0, 100); const auto y_exp = x * 0.5 * (1.0 + torch::erf(x / std::sqrt(2.0))); const auto y = model(x); ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); } +TEST_F(ModulesTest, TanhGELU) { + GELU model(GELUOptions().approximate("tanh")); + const auto x = torch::linspace(-3.0, 3.0, 100); + const auto inner = std::sqrt(2 / M_PI) * (x + 0.044715 * x.pow(3.0)); + const auto y_exp = 0.5 * x * (1.0 + inner.tanh()); + const auto y = model(x); + ASSERT_TRUE(torch::allclose(y, y_exp, 1.4e-06, 1e-05)); +} + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) TEST_F(ModulesTest, Mish) { Mish model; auto x = torch::randn(100) * 10; diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index 56070e62cfa..e15ac0f29bc 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -50,12 +50,8 @@ ALLOW_LIST = [ 
("aten::adaptive_avg_pool3d_backward", datetime.date(9999, 1, 1)), ("aten::_embedding_bag_dense_backward", datetime.date(9999, 1, 1)), ("aten::randperm", datetime.date(9999, 1, 1)), - ("aten::_conv_depthwise2d_backward", datetime.date(2022, 1, 31)), - ("aten::conv_depthwise3d_backward", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution.deprecated", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution.deprecated2", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution_transpose.deprecated", datetime.date(2022, 1, 31)), - ("aten::cudnn_convolution_transpose.deprecated2", datetime.date(2022, 1, 31)), + ("aten::gelu", datetime.date(2022, 3, 1)), + ("aten::gelu_backward", datetime.date(2022, 3, 1)), ("aten::cudnn_convolution_backward", datetime.date(2022, 1, 31)), ("aten::cudnn_convolution_backward_input", datetime.date(2022, 1, 31)), ("aten::cudnn_convolution_backward_weight", datetime.date(2022, 1, 31)), diff --git a/test/jit/test_autodiff_subgraph_slicing.py b/test/jit/test_autodiff_subgraph_slicing.py index 8454f786edb..4b72fc6f456 100644 --- a/test/jit/test_autodiff_subgraph_slicing.py +++ b/test/jit/test_autodiff_subgraph_slicing.py @@ -447,7 +447,7 @@ class TestAutodiffSubgraphSlicing(JitTestCase): %0 : int[] = prim::Constant[value=[2, 2, 1]]() %1 : int = prim::Constant[value=0]() %2 : Tensor = aten::t(%b) - %3 : Tensor = aten::gelu(%2) + %3 : Tensor = aten::relu(%2) %4 : (Tensor, Tensor, Tensor[]) = prim::TupleConstruct(%b, %3, %2) return (%4) """ @@ -471,7 +471,7 @@ class TestAutodiffSubgraphSlicing(JitTestCase): %1 : int = prim::Constant[value=0]() %d : Tensor = aten::t(%c) %2 : Tensor = aten::t(%b) - %3 : Tensor = aten::gelu(%2) + %3 : Tensor = aten::relu(%2) %4 : (Tensor, Tensor, Tensor[]) = prim::TupleConstruct(%3, %2, %d, %b, %c, %b) return (%4) """ diff --git a/test/onnx/test_custom_ops.py b/test/onnx/test_custom_ops.py index bed480fc2d8..c2d1ec27eed 100644 --- a/test/onnx/test_custom_ops.py +++ b/test/onnx/test_custom_ops.py @@ -137,7 +137,7 @@ class TestExportAsContribOps(unittest.TestCase): class M(torch.nn.Module): def __init__(self): super().__init__() - self.gelu = torch.nn.GELU() + self.gelu = torch.nn.GELU(approximate='none') def forward(self, x): res = [] @@ -150,7 +150,7 @@ class TestExportAsContribOps(unittest.TestCase): res.append(x[0]) return torch.stack(res), torch.stack(res2) - def symbolic_custom_gelu(g, input): + def symbolic_custom_gelu(g, input, approximate): return g.op("com.microsoft::Gelu", input).setType(input.type()) from torch.onnx import register_custom_op_symbolic @@ -158,7 +158,7 @@ class TestExportAsContribOps(unittest.TestCase): x = torch.randn(3, 3, 4, requires_grad=True) model = torch.jit.script(M()) - run_model_test(self, model, input=(x, )) + run_model_test(self, model, input=(x,)) if __name__ == "__main__": unittest.main() diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 77c2b85f27f..72ff9392254 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -2383,7 +2383,17 @@ class TestCaffe2Backend_opset9(unittest.TestCase): def test_gelu(self): class GeluModel(torch.nn.Module): def forward(self, x): - return torch.nn.functional.gelu(x) + return torch.nn.functional.gelu(x, approximate='none') + + model = GeluModel() + inputs = torch.randn(2, 4, 5, 6, requires_grad=True) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_tanh_gelu(self): + class 
GeluModel(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.gelu(x, approximate='tanh') model = GeluModel() inputs = torch.randn(2, 4, 5, 6, requires_grad=True) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 2ae439d7705..c71a9756408 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -6256,7 +6256,16 @@ class TestONNXRuntime(unittest.TestCase): def test_gelu(self): class GeluModel(torch.nn.Module): def forward(self, x): - return torch.nn.functional.gelu(x) + return torch.nn.functional.gelu(x, approximate='none') + + x = torch.randn(2, 4, 5, 6, requires_grad=True) + self.run_test(GeluModel(), x) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_tanh_gelu(self): + class GeluModel(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.gelu(x, approximate='tanh') x = torch.randn(2, 4, 5, 6, requires_grad=True) self.run_test(GeluModel(), x) diff --git a/test/onnx/test_utility_funs.py b/test/onnx/test_utility_funs.py index 01393915380..433a9d2cc75 100644 --- a/test/onnx/test_utility_funs.py +++ b/test/onnx/test_utility_funs.py @@ -822,11 +822,11 @@ class TestUtilityFuns_opset9(_BaseTestCase): def test_custom_opsets_gelu(self): self.addCleanup(unregister_custom_op_symbolic, "::gelu", 1) - def gelu(g, self): + def gelu(g, self, approximate): return g.op("com.microsoft::Gelu", self).setType(self.type()) register_custom_op_symbolic("::gelu", gelu, 1) - model = torch.nn.GELU() + model = torch.nn.GELU(approximate='none') x = torch.randn(3, 3) f = io.BytesIO() torch.onnx.export(model, (x, ), f, @@ -842,11 +842,11 @@ class TestUtilityFuns_opset9(_BaseTestCase): def test_register_aten_custom_op_symbolic(self): self.addCleanup(unregister_custom_op_symbolic, "aten::gelu", 1) - def gelu(g, self): + def gelu(g, self, approximate): return g.op("com.microsoft::Gelu", self).setType(self.type()) register_custom_op_symbolic("aten::gelu", gelu, 1) - model = torch.nn.GELU() + model = torch.nn.GELU(approximate='none') x = torch.randn(3, 3) f = io.BytesIO() torch.onnx.export(model, (x, ), f, opset_version=self.opset_version) diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py index be84e7bd4e8..2097a710d89 100644 --- a/test/quantization/core/test_quantized_op.py +++ b/test/quantization/core/test_quantized_op.py @@ -441,8 +441,9 @@ class TestQuantizedOps(TestCase): shapes = ((4,), (4, 4), (4, 4, 4), (4, 4, 4, 4)) dtypes = (torch.quint8, torch.qint8) memory_formats = (torch.channels_last, torch.contiguous_format) - test_cases = itertools.product(shapes, dtypes, memory_formats) - for shape, dtype, memory_format in test_cases: + approximation = ['none', 'tanh'] + test_cases = itertools.product(shapes, dtypes, memory_formats, approximation) + for shape, dtype, memory_format, approximate in test_cases: if memory_format == torch.channels_last and len(shape) != 4: continue X, scale, zero_point, torch_type = \ @@ -454,7 +455,7 @@ class TestQuantizedOps(TestCase): dqX = qX.dequantize() op = torch.nn.functional.gelu - dqY = op(dqX) + dqY = op(dqX, approximate=approximate) qY = torch.quantize_per_tensor(dqY, scale=scale, zero_point=zero_point, dtype=torch_type) qY_hat = op(qX) diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index c03ff0b3119..1d78a19c5ad 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -2181,18 +2181,21 @@ class TestCudaFuser(JitTestCase): 
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") def test_gelu(self): + old_guard = torch._C._jit_set_nvfuser_guard_mode(True) dtype = torch.float device = "cuda" x = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=True) grads = torch.randn([1024, 1024], dtype=dtype, device=device, requires_grad=False) - def t(x: torch.Tensor): - o = torch.nn.functional.gelu(x) + def t(x: torch.Tensor, mode : str): + o = torch.nn.functional.gelu(x, approximate=mode) o = o * 2.0 return o t_jit = torch.jit.script(t) - self._run_training_helper(t_jit, t, grads, x) + self._run_training_helper(t_jit, t, grads, x, 'none') + self._run_training_helper(t_jit, t, grads, x, 'tanh') + torch._C._jit_set_nvfuser_guard_mode(old_guard) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, diff --git a/test/test_jit_fuser_te.py b/test/test_jit_fuser_te.py index a548a8df4c8..ab2b85c6bb3 100644 --- a/test/test_jit_fuser_te.py +++ b/test/test_jit_fuser_te.py @@ -1321,6 +1321,37 @@ class TestTEFuser(JitTestCase): " ".join(["Failed:", str(dtype), 'isnan', device]) ) + def test_gelu(self): + def apply(fn): + return lambda x, approximate: fn(x, approximate) + + unary_ops = [ + F.gelu, + ] + sizes = [(1,), (2,), (4, 4)] + for dtype, op, device, size in product(self.dtypes, unary_ops, self.devices, sizes): + # TODO: Add back when https://github.com/pytorch/pytorch/issues/55905 is closed + if dtype in [torch.float16, torch.bfloat16] and device == "cpu": + continue + try: + x = self.data_for(dtype, device, size=size) + cond = self.data_for(torch.bool, device) + fn = apply(op) + ref = fn(x, cond) + except Exception: + # If eager mode doesn't support a dtype/op/device combo, + # neither does the fuser. Catch everything to avoid needing to + # guess what errors might be thrown by eager. 
+ continue + try: + t = torch.jit.trace(fn, (x, cond)) + torch.testing.assert_close(ref, t(x, cond)) + self.assertAllFused(t.graph_for(x, cond)) + except Exception as e: + raise RuntimeError( + " ".join(["Failed:", str(dtype), op.__name__, device, str(size)]) + ) + def test_unary_ops(self): def apply(fn): return lambda x: fn(x) @@ -1355,7 +1386,6 @@ class TestTEFuser(JitTestCase): F.softplus, torch.sqrt, torch.rsqrt, - F.gelu, torch.abs, torch.ceil, torch.floor, @@ -2367,7 +2397,6 @@ works_list = [ 'mul', 'ne', 'neg', - 'nn.functional.gelu', 'nn.functional.hardshrink', 'nn.functional.hardsigmoid', 'nn.functional.hardswish', diff --git a/test/test_nn.py b/test/test_nn.py index fb7a172161e..c6a2e24e6cf 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -9202,48 +9202,6 @@ class TestNN(NNTestCase): y.mean().backward() self.assertEqual(x.grad, None) - @unittest.skipIf( - not TEST_NUMPY or not TEST_SCIPY, "Numpy or Scipy not found") - def test_gelu(self): - def _test_gelu(n, m, dtype, contiguous, atol=None, rtol=None): - numpy_dtype = { - torch.bfloat16: torch.float, torch.float: torch.float, torch.double: torch.double - }[dtype] - devices = ['cpu'] - devices += ['cuda'] if TEST_CUDA else [] - - def _gelu_ref(X): - return X * stats.norm.cdf(X) - - for d in devices: - if contiguous: - X = torch.rand(n, m, dtype=dtype, requires_grad=True, device=d) - else: - X = torch.rand(n, m, dtype=dtype, requires_grad=True, device=d)[:, ::2] - res = F.gelu(X) - ref = _gelu_ref(X.to(numpy_dtype).cpu().detach().numpy()) - self.assertEqual(res, ref, rtol=rtol, atol=atol, exact_dtype=False) - if dtype == torch.float64: - gradcheck(F.gelu, [X], eps=1e-4) - - for n in range(1, 10): - for m in range(1, 10): - _test_gelu(n, m, torch.bfloat16, True, 1e-2, 0) - _test_gelu(n, m, torch.bfloat16, False, 1e-2, 0) - _test_gelu(n, m, torch.float32, True) - _test_gelu(n, m, torch.float32, False) - _test_gelu(n, m, torch.float64, True) - _test_gelu(n, m, torch.float64, False) - - # Test multi threaded - num_threads = torch.get_num_threads() - torch.set_num_threads(4) - try: - _test_gelu(32, 32, torch.float32, False) - finally: - torch.set_num_threads(num_threads) - - def test_bce_loss_always_nonnegative(self): target = torch.ones(5) input = torch.ones(5) diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 582ba69c362..7f7c13f01aa 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -1799,10 +1799,15 @@ - name: celu_(Tensor(a!) self, Scalar alpha=1.0) -> Tensor(a!) self: elu_backward(grad, alpha, 1, 1.0/alpha.toFloat(), /* is_result */ true, result) -- name: gelu(Tensor self) -> Tensor - self: "GradMode::is_enabled() ? 
infinitely_differentiable_gelu_backward(grad, self) : gelu_backward(grad, self)" +- name: gelu(Tensor self, *, str approximate='none') -> Tensor + self: gelu_backward(grad, self, approximate) result: auto_element_wise +- name: gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor + grad_output: gelu_backward(grad, self, approximate) + self: gelu_double_backward(grad, grad_output, self, approximate) + result: gelu_backward(grad_output_t, self_p, approximate) + gelu_double_backward(self_t, grad_output_p, self_p, approximate) + - name: glu(Tensor self, int dim=-1) -> Tensor self: glu_backward(grad, self, dim) diff --git a/torch/csrc/api/include/torch/nn/functional/activation.h b/torch/csrc/api/include/torch/nn/functional/activation.h index b038f1bce6b..2258dd0c431 100644 --- a/torch/csrc/api/include/torch/nn/functional/activation.h +++ b/torch/csrc/api/include/torch/nn/functional/activation.h @@ -336,8 +336,16 @@ inline Tensor glu(const Tensor& input, const GLUFuncOptions& options = {}) { // ============================================================================ -inline Tensor gelu(const Tensor& input) { - return torch::gelu(input); +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace detail { +inline Tensor gelu(const Tensor& input, string approximate) { + return torch::gelu(input, approximate); +} +} // namespace detail +#endif /* DOXYGEN_SHOULD_SKIP_THIS */ + +inline Tensor gelu(const Tensor& input, const GELUFuncOptions& options = {}) { + return detail::gelu(input, options.approximate()); } // ============================================================================ diff --git a/torch/csrc/api/include/torch/nn/modules/activation.h b/torch/csrc/api/include/torch/nn/modules/activation.h index 28225ee0f68..e4fc02f310d 100644 --- a/torch/csrc/api/include/torch/nn/modules/activation.h +++ b/torch/csrc/api/include/torch/nn/modules/activation.h @@ -570,12 +570,17 @@ TORCH_MODULE(GLU); // NOLINTNEXTLINE(bugprone-exception-escape) class TORCH_API GELUImpl : public torch::nn::Cloneable { public: + explicit GELUImpl(GELUOptions options_ = {}); + Tensor forward(const Tensor& input); void reset() override; /// Pretty prints the `GELU` module into the given `stream`. void pretty_print(std::ostream& stream) const override; + + /// The options with which this `Module` was constructed. + GELUOptions options; }; /// A `ModuleHolder` subclass for `GELUImpl`. diff --git a/torch/csrc/api/include/torch/nn/options/activation.h b/torch/csrc/api/include/torch/nn/options/activation.h index 651c800a84c..16ab0245fbb 100644 --- a/torch/csrc/api/include/torch/nn/options/activation.h +++ b/torch/csrc/api/include/torch/nn/options/activation.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -95,6 +96,33 @@ using GLUFuncOptions = GLUOptions; // ============================================================================ +/// Options for the `GELU` module. +/// +/// Example: +/// ``` +/// GELU model(GELUOptions().approximate("none")); +/// ``` +struct TORCH_API GELUOptions { + /// Specifies the approximation to apply to the output. + TORCH_ARG(std::string, approximate) = "none"; +}; + +namespace functional { +/// Options for `torch::nn::functional::gelu`. +/// +/// See the documentation for `torch::nn::GELUOptions` class to learn what +/// arguments are supported. 
+/// +/// Example: +/// ``` +/// namespace F = torch::nn::functional; +/// F::gelu(input, F::GELUFuncOptions().approximate("none")); +/// ``` +using GELUFuncOptions = GELUOptions; +} // namespace functional + +// ============================================================================ + /// Options for the `Hardshrink` module. /// /// Example: diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp index 677c9e1cc83..001199e98ed 100644 --- a/torch/csrc/api/src/nn/modules/activation.cpp +++ b/torch/csrc/api/src/nn/modules/activation.cpp @@ -284,8 +284,10 @@ void GLUImpl::pretty_print(std::ostream& stream) const { // ============================================================================ +GELUImpl::GELUImpl(GELUOptions options_) : options(std::move(options_)) {} + Tensor GELUImpl::forward(const Tensor& input) { - return F::gelu(input); + return F::detail::gelu(input, options.approximate()); } void GELUImpl::reset() {} diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index b4bcc4e4316..951b5eeca96 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2338,6 +2339,47 @@ std::tuple prelu_double_backward( } } +Tensor gelu_double_backward( + const Tensor & ggI, + const Tensor & gO, + const Tensor & input, + c10::string_view approximate) { + //if (at::native::get_gelutype_enum(approximate) == at::native::GeluType::Tanh) { + if (approximate == "tanh") { + constexpr auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + constexpr auto kKappa = 0.044715; + + auto inner = kBeta * (input + kKappa * pow(input, 3)); + auto tanh_inner = tanh(inner); + auto sech_inner = 1 / cosh(inner); + + auto f = 0.5 * input; + auto g = 1 - tanh_inner * tanh_inner; + auto h = kBeta * (1 + 3 * kKappa * input * input); + + auto f_prime_gh = 0.5 * g * h; + + auto g_prime = (2 * sech_inner) * (-sech_inner * tanh_inner) * h; + auto g_prime_fh = f * h * g_prime; + + auto h_prime = 6 * kKappa * input * kBeta; + auto h_prime_fg = f * g * h_prime; + + // left_derivative = f_prime_gh + // right_derivative = f_prime_gh + g_prime_fh + h_prime_fg + // dgrad_dX = left_derivative + right_derivative + auto gI = ggI * gO * (2 * f_prime_gh + g_prime_fh + h_prime_fg); + return gI; + } else { + constexpr auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + auto input_sq = input * input; + auto pdf = kBeta * at::exp(-0.5 * input_sq); + auto dgrad_dInput = 2 * pdf - input_sq * pdf; + auto gI = ggI * gO * dgrad_dInput; + return gI; + } +} + Tensor elu_double_backward( const Tensor& grad, const Tensor& grad_output, diff --git a/torch/csrc/autograd/FunctionsManual.h b/torch/csrc/autograd/FunctionsManual.h index 739b44b4d62..9451f5f49d2 100644 --- a/torch/csrc/autograd/FunctionsManual.h +++ b/torch/csrc/autograd/FunctionsManual.h @@ -303,6 +303,11 @@ std::tuple prelu_double_backward( const Tensor & grad_out, const Tensor & input_, const Tensor & weight_); +Tensor gelu_double_backward( + const Tensor & ggI, + const Tensor & gO, + const Tensor & input, + c10::string_view approximate); Tensor as_strided_backward(Tensor grad, TensorGeometry input_geometry, IntArrayRef sizes, IntArrayRef strides, optional storage_offset_); std::tuple atan2_backward(const Tensor& grad, const Tensor& self, const Tensor& other, std::array output_mask); std::tuple layer_norm_double_backward( diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp 
b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp index 08d3e89d21c..47c0316abda 100644 --- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp +++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp @@ -77,6 +77,10 @@ Value* createConditionalConstant(Node* profile_ivalue) { // int val = IValue( static_cast(profile_ivalue->i(Symbol::attr("profiled_int")))); + } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_str"))) { + // str + val = IValue(static_cast( + profile_ivalue->s(Symbol::attr("profiled_str")))); } else { GRAPH_DEBUG("profile_ivalue: ", *profile_ivalue); TORCH_WARN( diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/torch/csrc/jit/codegen/cuda/parser.cpp index a33b33895c5..11c27cffec2 100644 --- a/torch/csrc/jit/codegen/cuda/parser.cpp +++ b/torch/csrc/jit/codegen/cuda/parser.cpp @@ -12,6 +12,8 @@ #include #include +#include + #include #include @@ -62,6 +64,7 @@ const auto& intListAttr = Symbol::attr("profiled_int_list"); const auto& intAttr = Symbol::attr("profiled_int"); const auto& boolListAttr = Symbol::attr("profiled_bool_list"); const auto& boolAttr = Symbol::attr("profiled_bool"); +const auto& strAttr = Symbol::attr("profiled_str"); typedef Val* CgValue; typedef Expr* CgOp; @@ -2273,7 +2276,8 @@ class IrParser { } { - auto ptr_op = getOperatorForLiteral("aten::gelu(Tensor self) -> Tensor"); + auto ptr_op = getOperatorForLiteral( + "aten::gelu(Tensor self, *, str approximate='none') -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { @@ -2283,7 +2287,21 @@ class IrParser { c10::nullopt, value_map[node->inputs()[0]->unique()]); auto self = list_val.front(); list_val.pop_front(); - auto out = gelu(self); + + auto approximate = constant_as(node->input(1)); + TORCH_INTERNAL_ASSERT( + approximate.has_value(), + "The approximate parameter is required."); + const auto kApproximate = approximate.value(); + + Val* out = nullptr; + if (at::native::get_gelutype_enum(kApproximate) == + at::native::GeluType::Tanh) { + out = fast_gelu(self); + } else { + out = unaryOp(UnaryOpType::Gelu, self); + } + value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, @@ -2293,7 +2311,7 @@ class IrParser { { auto ptr_op = getOperatorForLiteral( - "aten::gelu_backward(Tensor grad, Tensor self) -> Tensor"); + "aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor"); REGISTER_PARSE_RULE( ptr_op, { @@ -2308,7 +2326,20 @@ class IrParser { auto self = list_val.front(); list_val.pop_front(); - auto grad_in = gelu_backward(grad_out, self); + auto approximate = constant_as(node->input(2)); + TORCH_INTERNAL_ASSERT( + approximate.has_value(), + "The approximate parameter is required."); + const auto kApproximate = approximate.value(); + + Val* grad_in = nullptr; + if (at::native::get_gelutype_enum(kApproximate) == + at::native::GeluType::Tanh) { + grad_in = fast_gelu_backward(grad_out, self); + } else { + grad_in = gelu_backward(grad_out, self); + } + value_map.emplace( node->output()->unique(), ValueHolder(grad_in, format)); }, @@ -2453,9 +2484,13 @@ class IrParser { } value_map_.emplace(val->unique(), cg_val); return true; - } else if (val->type()->isSubtypeOf( - static_cast(NoneType::get()))) { + } else if ( + val->type()->isSubtypeOf( + static_cast(StringType::get())) || + val->type()->isSubtypeOf(static_cast(NoneType::get()))) { // TODO: should we consider adding support for NoneType; + // String scalars are only used in parsing rules; + // Do not register string with codegen IR. 
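As a quick illustrative aside (a minimal sketch assuming a build that contains this change; `gelu_tanh` is just a made-up example name): at the TorchScript level the new approximate argument simply appears as an extra string input on the `aten::gelu` node, which is the value the gelu parse rules in this file read via `constant_as<std::string>`.

    import torch

    @torch.jit.script
    def gelu_tanh(x):
        # the 'tanh' literal becomes the str input of the aten::gelu node in the graph
        return torch.nn.functional.gelu(x, approximate='tanh')

    print(gelu_tanh.graph)  # the printed graph shows aten::gelu taking the approximate constant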
return true; } else if (val->type()->cast()) { // TODO: we don't support list type in codegen yet; @@ -2646,6 +2681,34 @@ void profileIntList(ProfilingRecord* pr, Node* node, size_t offset) { pn->setCallback(ivalue_profiler); } +void profileString(ProfilingRecord* pr, Node* node, size_t offset) { + auto pn = insertProfileIValueOp(node, offset, pr); + + const auto ivalue_profiler = [pr, pn](Stack& stack) { + std::lock_guard lock(pr->mutex_); + + // TODO: we don't care about merging multiple profiling runs as we don't + // support it at all; + int64_t frame_id = 0; + pop(stack, frame_id); + IValue value; + pop(stack, value); + TORCH_INTERNAL_ASSERT( + value.isString(), "profiling seeing the wrong data type"); + if (!pn->hasAttribute(strAttr)) { + pn->s_(strAttr, value.toStringRef()); + } else { + const auto& profiled_str = pn->s(strAttr); + const auto& input_str = value.toStringRef(); + TORCH_INTERNAL_ASSERT( + input_str == profiled_str, "profiling ivalue doesn't support merge"); + } + push(stack, value); + }; + + pn->setCallback(ivalue_profiler); +} + void profileBool(ProfilingRecord* pr, Node* node, size_t offset) { auto pn = insertProfileIValueOp(node, offset, pr); @@ -3015,6 +3078,38 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { } } + static auto gelu_schema = + getOperatorForLiteral( + "aten::gelu(Tensor self, *, str approximate='none') -> Tensor") + ->schema(); + if (node->matches(gelu_schema)) { + switch (offset) { + // argument 1: approximate; + case 1: + profileString(pr, node, offset); + break; + default: + return false; + } + return true; + } + + static auto gelu_backward_schema = + getOperatorForLiteral( + "aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor") + ->schema(); + if (node->matches(gelu_backward_schema)) { + switch (offset) { + // argument 2: approximate; + case 2: + profileString(pr, node, offset); + break; + default: + return false; + } + return true; + } + static auto softmax_backward_data_schema = getOperatorForLiteral( "aten::_softmax_backward_data(Tensor grad_output, Tensor output, int dim, ScalarType input_dtype) -> Tensor") diff --git a/torch/csrc/jit/mobile/upgrader_mobile.cpp b/torch/csrc/jit/mobile/upgrader_mobile.cpp index 28192504558..83e23342d5c 100644 --- a/torch/csrc/jit/mobile/upgrader_mobile.cpp +++ b/torch/csrc/jit/mobile/upgrader_mobile.cpp @@ -43,21 +43,29 @@ getOperatorVersionMapForMobile() { std::vector({ Upgrader({0, 3, "div__Tensor_0_3", 3}) })}, + {std::string("aten::gelu"), + std::vector({ + Upgrader({0, 9, "gelu_0_9", 5}) + })}, + {std::string("aten::gelu.out"), + std::vector({ + Upgrader({0, 9, "gelu_out_0_9", 6}) + })}, {std::string("aten::linspace"), std::vector({ - Upgrader({0, 7, "linspace_0_7", 5}) + Upgrader({0, 7, "linspace_0_7", 7}) })}, {std::string("aten::linspace.out"), std::vector({ - Upgrader({0, 7, "linspace_out_0_7", 6}) + Upgrader({0, 7, "linspace_out_0_7", 8}) })}, {std::string("aten::logspace"), std::vector({ - Upgrader({0, 8, "logspace_0_8", 7}) + Upgrader({0, 8, "logspace_0_8", 9}) })}, {std::string("aten::logspace.out"), std::vector({ - Upgrader({0, 8, "logspace_out_0_8", 8}) + Upgrader({0, 8, "logspace_out_0_8", 10}) })}, }); return operatorVersionMapForMobile; @@ -292,6 +300,45 @@ const std::vector& getUpgraderBytecodeList() { OperatorString({"aten::div", "out_mode", 4}), }), // operators list }), + ByteCodeFunctionWithOperator({ + mobile::Function::registerFunc( + "gelu_0_9", + std::vector({ + Instruction{OpCode::STORE, 1, 0}, + 
Instruction{OpCode::MOVE, 1, 0}, + Instruction{OpCode::OP, 0, 0}, + Instruction{OpCode::RET, 0, 0}, + }), // instructions list, + std::vector({ + c10::IValue("none"), + }), // constants list, + std::vector(), // types list, + 1 + ), + std::vector({ + OperatorString({"aten::gelu", "", 1}), + }), // operators list + }), + ByteCodeFunctionWithOperator({ + mobile::Function::registerFunc( + "gelu_out_0_9", + std::vector({ + Instruction{OpCode::STOREN, 1, 2}, + Instruction{OpCode::MOVE, 1, 0}, + Instruction{OpCode::MOVE, 2, 0}, + Instruction{OpCode::OP, 0, 0}, + Instruction{OpCode::RET, 0, 0}, + }), // instructions list, + std::vector({ + c10::IValue("none"), + }), // constants list, + std::vector(), // types list, + 2 + ), + std::vector({ + OperatorString({"aten::gelu", "out", 2}), + }), // operators list + }), ByteCodeFunctionWithOperator({ mobile::Function::registerFunc( "linspace_0_7", diff --git a/torch/csrc/jit/operator_upgraders/upgraders_entry.cpp b/torch/csrc/jit/operator_upgraders/upgraders_entry.cpp index 75201cf5d67..7b09cc409a4 100644 --- a/torch/csrc/jit/operator_upgraders/upgraders_entry.cpp +++ b/torch/csrc/jit/operator_upgraders/upgraders_entry.cpp @@ -14,64 +14,64 @@ namespace torch { namespace jit { -static std::unordered_map kUpgradersEntryMap( - {{"logspace_0_8", R"SCRIPT( +static std::unordered_map kUpgradersEntryMap({ + {"logspace_0_8", R"SCRIPT( def logspace_0_8(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], base: float, *, dtype: Optional[int], layout: Optional[int], device: Optional[Device], pin_memory: Optional[bool]): if (steps is None): return torch.logspace(start=start, end=end, steps=100, base=base, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) return torch.logspace(start=start, end=end, steps=steps, base=base, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) )SCRIPT"}, - {"logspace_out_0_8", R"SCRIPT( + {"logspace_out_0_8", R"SCRIPT( def logspace_out_0_8(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], base: float, *, out: Tensor): if (steps is None): return torch.logspace(start=start, end=end, steps=100, base=base, out=out) return torch.logspace(start=start, end=end, steps=steps, base=base, out=out) )SCRIPT"}, - {"linspace_0_7", R"SCRIPT( + {"linspace_0_7", R"SCRIPT( def linspace_0_7(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], *, dtype: Optional[int], layout: Optional[int], device: Optional[Device], pin_memory: Optional[bool]): if (steps is None): return torch.linspace(start=start, end=end, steps=100, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) return torch.linspace(start=start, end=end, steps=steps, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) )SCRIPT"}, - {"linspace_out_0_7", R"SCRIPT( + {"linspace_out_0_7", R"SCRIPT( def linspace_out_0_7(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], *, out: Tensor): if (steps is None): return torch.linspace(start=start, end=end, steps=100, out=out) return torch.linspace(start=start, end=end, steps=steps, out=out) )SCRIPT"}, - {"div_Tensor_0_3", R"SCRIPT( + {"div_Tensor_0_3", R"SCRIPT( def div_Tensor_0_3(self: Tensor, other: Tensor) -> Tensor: if (self.is_floating_point() or other.is_floating_point()): return self.true_divide(other) return self.divide(other, rounding_mode='trunc') )SCRIPT"}, - {"div_Scalar_0_3", R"SCRIPT( + {"div_Scalar_0_3", R"SCRIPT( def div_Scalar_0_3(self: 
Tensor, other: number) -> Tensor: if (self.is_floating_point() or isinstance(other, float)): return self.true_divide(other) return self.divide(other, rounding_mode='trunc') )SCRIPT"}, - {"div_out_0_3", R"SCRIPT( + {"div_out_0_3", R"SCRIPT( def div_out_0_3(self: Tensor, other: Tensor, *, out: Tensor) -> Tensor: if (self.is_floating_point() or other.is_floating_point() or out.is_floating_point()): return self.true_divide(other, out=out) return self.divide(other, rounding_mode='trunc', out=out) )SCRIPT"}, - {"div__Tensor_0_3", R"SCRIPT( + {"div__Tensor_0_3", R"SCRIPT( def div__Tensor_0_3(self: Tensor, other: Tensor) -> Tensor: if (self.is_floating_point() or other.is_floating_point()): return self.true_divide_(other) return self.divide_(other, rounding_mode='trunc') )SCRIPT"}, - {"div__Scalar_0_3", R"SCRIPT( + {"div__Scalar_0_3", R"SCRIPT( def div__Scalar_0_3(self: Tensor, other: number) -> Tensor: if (self.is_floating_point() or isinstance(other, float)): return self.true_divide_(other) return self.divide_(other, rounding_mode='trunc') )SCRIPT"}, - {"full_0_4", R"SCRIPT( + {"full_0_4", R"SCRIPT( def full_0_4(size:List[int], fill_value:number, *, dtype:Optional[int]=None, layout:Optional[int]=None, device:Optional[Device]=None, pin_memory:Optional[bool]=None) -> Tensor: @@ -79,10 +79,19 @@ def full_0_4(size:List[int], fill_value:number, *, dtype:Optional[int]=None, fill_value = float(fill_value) return torch.full(size, fill_value, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) )SCRIPT"}, - {"full_out_0_4", R"SCRIPT( + {"full_out_0_4", R"SCRIPT( def full_out_0_4(size:List[int], fill_value:number, *, out:Tensor) -> Tensor: return torch.full(size, fill_value, out=out) -)SCRIPT"}}); +)SCRIPT"}, + {"gelu_0_9", R"SCRIPT( +def gelu_0_9(self: Tensor) -> Tensor: + return torch.gelu(self, approximate='none') +)SCRIPT"}, + {"gelu_out_0_9", R"SCRIPT( +def gelu_out_0_9(self: Tensor, *, out: Tensor) -> Tensor: + return torch.gelu(self, approximate='none', out=out) +)SCRIPT"}, +}); std::shared_ptr create_upgrader_graph( const std::string& upgrader_name, diff --git a/torch/csrc/jit/operator_upgraders/version_map.cpp b/torch/csrc/jit/operator_upgraders/version_map.cpp index e6860e318ce..1e19f4cc39d 100644 --- a/torch/csrc/jit/operator_upgraders/version_map.cpp +++ b/torch/csrc/jit/operator_upgraders/version_map.cpp @@ -59,7 +59,12 @@ static std::unordered_map> operatorVersi {"aten::full.out", {{5, "full_out_0_4", - "aten::full.out(int[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)"}}}}); + "aten::full.out(int[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!)"}}}, + {"aten::gelu", {{10, "gelu_0_9", "aten::gelu(Tensor self) -> Tensor"}}}, + {"aten::gelu.out", + {{10, + "gelu_out_0_9", + "aten::gelu.out(Tensor self, *, Tensor(a!) out) -> Tensor"}}}}); const std::unordered_map>& get_operator_version_map() { diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 262e9b35110..0f79d01104a 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -872,7 +872,7 @@ class ShapePropagator : public PropertyPropBase { "aten::rrelu(Tensor self, Scalar lower, Scalar upper, bool training, Generator? 
generator) -> Tensor", "aten::rsqrt(Tensor self) -> Tensor", "aten::selu(Tensor self) -> Tensor", - "aten::gelu(Tensor self) -> Tensor", + "aten::gelu(Tensor self, *, str approximate='none') -> Tensor", "aten::sigmoid(Tensor self) -> Tensor", "aten::sign(Tensor self) -> Tensor", "aten::sin(Tensor self) -> Tensor", diff --git a/torch/csrc/jit/runtime/symbolic_script.cpp b/torch/csrc/jit/runtime/symbolic_script.cpp index f385ba3875b..64f8dd5b4c5 100644 --- a/torch/csrc/jit/runtime/symbolic_script.cpp +++ b/torch/csrc/jit/runtime/symbolic_script.cpp @@ -913,16 +913,10 @@ const std::vector functions = { return grad_output * torch.where(self > 0, 1.0, negative_slope).type_as(result), None return result, backward - def gelu(self): - result = torch.gelu(self) + def gelu(self : Tensor, *, approximate : str): + result = torch.gelu(self, approximate=approximate) def backward(grad_output): - m_2_sqrtpi = 1.12837916709551257390 - m_sqrt1_2 = 0.707106781186547524401 - alpha = m_sqrt1_2 - beta = m_2_sqrtpi * m_sqrt1_2 * 0.5 - cdf = (torch.erf(self * m_sqrt1_2) + 1.0) * 0.5 - pdf = beta * torch.exp(self * self * -0.5) - return grad_output * (cdf + self * pdf) + return torch.gelu_backward(grad_output, self, approximate=approximate), None return result, backward def hardswish(self): diff --git a/torch/csrc/jit/runtime/symbolic_shape_registry_util.cpp b/torch/csrc/jit/runtime/symbolic_shape_registry_util.cpp index fbe3a0c36c7..71c9730f5ba 100644 --- a/torch/csrc/jit/runtime/symbolic_shape_registry_util.cpp +++ b/torch/csrc/jit/runtime/symbolic_shape_registry_util.cpp @@ -76,7 +76,7 @@ const OperatorMap& get_tensorexpr_elementwise_set() { {"aten::leaky_relu(Tensor self, Scalar negative_slope=0.01) -> Tensor", "unary"}, {"aten::softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor", "unary"}, {"aten::relu6(Tensor self) -> Tensor", "unary"}, - {"aten::gelu(Tensor self) -> Tensor", "unary"}, + {"aten::gelu(Tensor self, *, str approximate='none') -> Tensor", "unary"}, {"aten::neg(Tensor self) -> Tensor", "unary"}, {"aten::reciprocal(Tensor self) -> Tensor", "unary"}, {"aten::expm1(Tensor self) -> Tensor", "unary"}, diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index cc6d2678685..a48c9d07e29 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -362,6 +362,8 @@ ArgValue TensorExprKernel::toArg(const torch::jit::Value* v) const { return val.toIntVector(); } else if (val.isDoubleList()) { return val.toDoubleVector(); + } else if (val.isString()) { + return val.toStringRef(); } else { throw unsupported_dtype(val.type()->str()); } diff --git a/torch/csrc/jit/tensorexpr/lowerings.cpp b/torch/csrc/jit/tensorexpr/lowerings.cpp index ee3a77084cd..c0905588c2d 100644 --- a/torch/csrc/jit/tensorexpr/lowerings.cpp +++ b/torch/csrc/jit/tensorexpr/lowerings.cpp @@ -3,6 +3,8 @@ #include #include +#include + namespace torch { namespace jit { namespace tensorexpr { @@ -641,22 +643,44 @@ int nnc_lowerings_lazy_registration() { }); RegisterNNCLoweringsFunction aten_gelu( - {"aten::gelu(Tensor self) -> (Tensor)"}, + {"aten::gelu(Tensor self, *, str approximate='none') -> (Tensor)"}, [](const std::vector& inputs, const std::vector& outputShape, const c10::optional& outputType, at::Device device) { - return computeOneOperand( - "aten_gelu", - inputs, - outputShape, - outputType, - [](const ExprHandle& a) { - auto m_sqrt1_2 = Cast::make(a.dtype(), M_SQRT1_2); - auto one = Cast::make(a.dtype(), 1.); - auto point_five = 
Cast::make(a.dtype(), .5); - return a * point_five * (one + erf(a * m_sqrt1_2)); - }); + const auto& kApproximate = c10::get(inputs[1]); + std::vector operands = {inputs.front()}; + if (at::native::get_gelutype_enum(kApproximate) == + at::native::GeluType::Tanh) { + // approximate == 'tanh' + return computeOneOperand( + "aten_tanh_gelu", + operands, + outputShape, + outputType, + [](const ExprHandle& a) { + auto one = Cast::make(a.dtype(), 1.); + auto point_five = Cast::make(a.dtype(), .5); + auto beta = Cast::make(a.dtype(), M_SQRT2 * M_2_SQRTPI * 0.5); + auto kappa = Cast::make(a.dtype(), 0.044715); + auto a_cube = a * a * a; + auto inner = beta * (a + kappa * a_cube); + return point_five * a * (one + tanh(inner)); + }); + } else { + // approximate == 'none' + return computeOneOperand( + "aten_gelu", + operands, + outputShape, + outputType, + [](const ExprHandle& a) { + auto m_sqrt1_2 = Cast::make(a.dtype(), M_SQRT1_2); + auto one = Cast::make(a.dtype(), 1.); + auto point_five = Cast::make(a.dtype(), .5); + return a * point_five * (one + erf(a * m_sqrt1_2)); + }); + } }); RegisterNNCLoweringsFunction aten_batch_norm( diff --git a/torch/csrc/jit/tensorexpr/lowerings.h b/torch/csrc/jit/tensorexpr/lowerings.h index 19aa85810b9..aac507ff132 100644 --- a/torch/csrc/jit/tensorexpr/lowerings.h +++ b/torch/csrc/jit/tensorexpr/lowerings.h @@ -26,6 +26,7 @@ using ArgValue = c10::variant< BufList, DoubleList, IntList, + std::string, ArgNone>; using NNCLoweringFunction = std::function Tensor +gelu(input, approximate = 'none') -> Tensor -Applies element-wise the function +When the approximate argument is 'none', it applies element-wise the function :math:`\text{GELU}(x) = x * \Phi(x)` where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution. +When the approximate argument is 'tanh', Gelu is estimated with: + :math:: \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt(2 / \pi) * (x + 0.044715 * x^3))) + See `Gaussian Error Linear Units (GELUs) `_. """) - hardshrink = _add_docstr( torch.hardshrink, r""" diff --git a/torch/nn/functional.pyi.in b/torch/nn/functional.pyi.in index 8e92e29d6c6..0ab153991ca 100644 --- a/torch/nn/functional.pyi.in +++ b/torch/nn/functional.pyi.in @@ -141,7 +141,7 @@ def rrelu(input: Tensor, lower: float = ..., upper: float = ..., training: bool inplace: bool = ...) -> Tensor: ... -def gelu(input: Any): ... +def gelu(input: Any, approximate: str = ...): ... def hardshrink(input: Tensor, lambd: float = ...) -> Tensor: ... diff --git a/torch/nn/modules/activation.py b/torch/nn/modules/activation.py index 6066d855c8c..aeb5590bd47 100644 --- a/torch/nn/modules/activation.py +++ b/torch/nn/modules/activation.py @@ -654,6 +654,13 @@ class GELU(Module): where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution. + When the approximate argument is 'tanh', Gelu is estimated with: + :math:: \text{GELU}(x) = 0.5 * x * (1 + \text{Tanh}(\sqrt(2 / \pi) * (x + 0.044715 * x^3))) + + Args: + approximate (string, optional): the gelu approximation algorithm to use: + ``'none'`` | ``'tanh'``. Default: ``'none'`` + Shape: - Input: :math:`(*)`, where :math:`*` means any number of dimensions. - Output: :math:`(*)`, same shape as the input. 
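To make the documented behavior concrete, here is a short usage sketch (assuming a PyTorch build that includes this patch; `y_ref` just re-evaluates the tanh formula quoted above for comparison, and the tolerance is an arbitrary choice):

    import math
    import torch
    import torch.nn.functional as F

    x = torch.randn(4, requires_grad=True)

    y_exact = torch.nn.GELU()(x)            # default 'none', i.e. the erf-based form
    y_tanh = F.gelu(x, approximate='tanh')  # the new tanh estimate

    # Re-evaluate the documented tanh formula directly.
    inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))
    y_ref = 0.5 * x * (1.0 + torch.tanh(inner))
    assert torch.allclose(y_tanh, y_ref, atol=1e-6)

    # The gelu_backward / gelu_double_backward entries make second-order gradients available.
    g, = torch.autograd.grad(y_tanh.sum(), x, create_graph=True)
    gg, = torch.autograd.grad(g.sum(), x)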
@@ -666,8 +673,18 @@ class GELU(Module): >>> input = torch.randn(2) >>> output = m(input) """ + __constants__ = ['approximate'] + approximate: str + + def __init__(self, approximate: str = 'none') -> None: + super(GELU, self).__init__() + self.approximate = approximate + def forward(self, input: Tensor) -> Tensor: - return F.gelu(input) + return F.gelu(input, approximate=self.approximate) + + def extra_repr(self) -> str: + return 'approximate={}'.format(self.approximate) class Hardshrink(Module): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 776a239fb10..acdde766120 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2953,12 +2953,27 @@ def remainder(g, input, other): quo = g.op("Mul", div, other) return g.op("Sub", input, quo) +@parse_args("v", "s") +def gelu(g, self, approximate): + # none approximate : onnx::Constant[value={0}] + # tanh approximate : onnx::Constant[value={1}] + if approximate == 'tanh': + kBeta = math.sqrt(2 / math.pi) + kKappa = 0.044715 -def gelu(g, self): - _sqrt2 = 1.4142135623730951 - erf = g.op("Erf", g.op("Div", self, torch.tensor(_sqrt2, dtype=torch.double))) - erf_plusone = add(g, erf, g.op("Constant", value_t=torch.tensor(1, dtype=torch.double))) - return mul(g, mul(g, self, erf_plusone), g.op("Constant", value_t=torch.tensor(0.5, dtype=torch.double))) + beta = torch.tensor(kBeta, dtype=torch.double) + kappa = torch.tensor(kKappa, dtype=torch.double) + one = torch.tensor(1., dtype=torch.double) + half = torch.tensor(0.5, dtype=torch.double) + + self_cube = mul(g, self, mul(g, self, self)) + inner = mul(g, beta, add(g, self, mul(g, kappa, self_cube))) + return mul(g, half, mul(g, self, add(g, one, g.op("Tanh", inner)))) + else: + _sqrt2 = 1.4142135623730951 + erf = g.op("Erf", g.op("Div", self, torch.tensor(_sqrt2, dtype=torch.double))) + erf_plusone = add(g, erf, g.op("Constant", value_t=torch.tensor(1, dtype=torch.double))) + return mul(g, mul(g, self, erf_plusone), g.op("Constant", value_t=torch.tensor(0.5, dtype=torch.double))) @parse_args("v", "i", "v", "v", "f", "i") def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): diff --git a/torch/overrides.py b/torch/overrides.py index 408012ea6e9..76a5fe67069 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -730,7 +730,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: lambda input, kernel_size, output_size=None, output_ratio=None, return_indices=False, _random_samples=None: -1), torch.nn.functional.gaussian_nll_loss: lambda input, target, var, full=False, eps=1e-06, reduction='mean': -1, - torch.nn.functional.gelu: lambda input: -1, + torch.nn.functional.gelu: lambda input, approximate='none': -1, torch.nn.functional.glu: lambda input, dim=-1: -1, torch.nn.functional.grid_sample: lambda input, grid, mode='bilinear', padding_mode='zeros', align_corners=None: -1, torch.nn.functional.group_norm: lambda input, num_groups, weight=None, bias=None, eps=1e-05: -1, diff --git a/torch/testing/_internal/autocast_test_lists.py b/torch/testing/_internal/autocast_test_lists.py index 00ed8072495..4b1058fe35a 100644 --- a/torch/testing/_internal/autocast_test_lists.py +++ b/torch/testing/_internal/autocast_test_lists.py @@ -327,7 +327,8 @@ class AutocastCPUTestLists(object): self.nn_fp32 = [ ("avg_pool2d", dummy_bf16[2], {"kernel_size": (3, 2), "stride": (1, 1)}), ("avg_pool3d", dummy_bf16[3], {"kernel_size": (3, 3, 3), "stride": (1, 1, 1)}), - ("gelu", dummy_bf16[3]), + ("gelu", dummy_bf16[3], {"approximate": 'none'}), + 
("gelu", dummy_bf16[3], {"approximate": 'tanh'}), ("upsample_nearest1d", dummy_bf16[2], {"output_size": (n)}), ("upsample_nearest2d", dummy_bf16[3], {"output_size": (n, n)}), ("upsample_nearest3d", dummy_bf16[4], {"output_size": (n, n, n)}), diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index c25d04ebe61..e9332bd4f01 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -43,6 +43,7 @@ from distutils.version import LooseVersion has_scipy_fft = False if TEST_SCIPY: + from scipy import stats import scipy.special try: import scipy.fft @@ -3903,7 +3904,6 @@ def sample_inputs_layer_norm(opinfo, device, dtype, requires_grad, **kwargs): # With `None` weight and bias (tests failing for this, see the link above) # yield SampleInput(make_arg((1, 2)), args=((2,), None, make_arg((2,)))) - def sample_inputs_local_response_norm(opinfo, device, dtype, requires_grad, **kwargs): make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) @@ -3925,7 +3925,6 @@ def sample_inputs_local_response_norm(opinfo, device, dtype, requires_grad, **kw for input_shape, size, kwargs in cases: yield SampleInput(make_arg(input_shape), args=(size,), kwargs=kwargs) - def sample_inputs_hardswish(self, device, dtype, requires_grad, **kwargs): N = 5 # make sure we are testing -3 -> 3 range. default is -10 -> 10 so maybe unnecessary ? @@ -4080,10 +4079,16 @@ def sample_inputs_upsample(mode, self, device, dtype, requires_grad, **kwargs): return sample_inputs + def sample_inputs_gelu(self, device, dtype, requires_grad, **kwargs): N = 5 - tensors = [SampleInput(make_tensor((N * 2, N * 2), device=device, dtype=dtype, - requires_grad=requires_grad, low=-3, high=3)) for _ in range(1, N)] + tensors = [] + for _ in range(1, N): + for approximate in ['none', 'tanh']: + tensors.append(SampleInput( + make_tensor((N * 2, N * 2), device=device, dtype=dtype, + requires_grad=requires_grad, low=-3, high=3), + kwargs=dict(approximate=approximate))) return tensors def sample_inputs_max_min_reduction_with_dim(op_info, device, dtype, requires_grad, **kwargs): @@ -7965,6 +7970,20 @@ def reference_softplus(input, beta=1, threshold=20): output[non_linear] = np.log(1 + np.exp(beta * input[non_linear])) / beta return output +def reference_gelu(X, *, approximate='none'): + def _gelu_ref(X): + return X * stats.norm.cdf(X) + + def _tanh_gelu_ref(X): + M_SQRT_2_PI = math.sqrt(2 / math.pi) + Z = M_SQRT_2_PI * (X + 0.044715 * np.power(X, 3.0)) + return 0.5 * X * (1.0 + np.tanh(Z)) + + if approximate == 'tanh': + return _tanh_gelu_ref(X) + else: + return _gelu_ref(X) + def reference_one_hot(a: np.ndarray, num_classes: int = -1) -> np.ndarray: if num_classes == -1: @@ -11772,6 +11791,7 @@ op_db: List[OpInfo] = [ ), OpInfo('nn.functional.gelu', aten_name="gelu", + ref=reference_gelu if TEST_SCIPY else _NOTHING, supports_autograd=True, assert_autodiffed=True, sample_inputs_func=sample_inputs_gelu, diff --git a/torch/testing/_internal/common_nn.py b/torch/testing/_internal/common_nn.py index 599e4fecabe..dadcac9285f 100644 --- a/torch/testing/_internal/common_nn.py +++ b/torch/testing/_internal/common_nn.py @@ -3716,12 +3716,16 @@ new_module_tests = [ ), dict( module_name='GELU', + constructor_args=('none',), + cpp_constructor_args='torch::nn::GELUOptions().approximate(\"none\")', input_size=(), desc='scalar', reference_fn=lambda x, *_: x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))), 
), dict( module_name='GELU', + constructor_args=('none',), + cpp_constructor_args='torch::nn::GELUOptions().approximate(\"none\")', input_size=(3, 2, 5), reference_fn=lambda x, *_: x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))), ), From 66902560216740db6f38cb7a8458bd29c213f296 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Mon, 14 Feb 2022 07:53:38 -0800 Subject: [PATCH 003/199] free up dispatch key space (in C++) (#72402) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72402 The original PR had an array-out-of-bounds access in `DispatchKeyExtractor.cpp`, that wasn't caught by ASAN and appeared to only manifest in a subset of android internal tests. After fixing the OOB access (and adding more asserts), I confirmed that the android internal test passes. Reland of D33255193 (https://github.com/pytorch/pytorch/commit/20b8653dfaad495ba172b8116f540c258ef42ffe) ghstack-source-id: 148830728 Test Plan: Steps to test: (1) connect to a mobile OD (2) run `one_world android emulator android-29` in a terminal to start the android emulator (3) In a separate terminal, run the test: `buck test //fbandroid/instrumentation_tests/com/facebook/pytorch/bi_xray:instrumentation_test -c test.external_runner=tpx -- --regex 'testBIXRayModel.*PyTorchBIXRayInstrumentationTest' --force-remote-execution --run-disabled` I also ran `buck test fbandroid/mode/dbg //fbandroid/instrumentation_tests/com/facebook/pytorch/bi_xray:instrumentation_test`, which failed before and passed after the PR. Reviewed By: albanD Differential Revision: D34034848 fbshipit-source-id: 9677ee2c0a1afd1183896f7055009445712523c5 (cherry picked from commit 9ab9b12d355540ad0923c6869ed088ff6c21490c) --- aten/src/ATen/TensorSubclassLikeUtils.h | 3 +- aten/src/ATen/core/TensorBase.h | 1 - .../core/dispatch/DispatchKeyExtractor.cpp | 41 ++ .../ATen/core/dispatch/DispatchKeyExtractor.h | 29 +- aten/src/ATen/core/dispatch/Dispatcher.cpp | 10 +- aten/src/ATen/core/dispatch/Dispatcher.h | 11 +- aten/src/ATen/core/dispatch/OperatorEntry.cpp | 36 +- aten/src/ATen/core/dispatch/OperatorEntry.h | 11 +- .../op_registration/op_registration_test.cpp | 42 +- c10/core/DispatchKey.cpp | 163 +++-- c10/core/DispatchKey.h | 487 ++++++++++--- c10/core/DispatchKeySet.cpp | 225 ++++-- c10/core/DispatchKeySet.h | 676 +++++++++++++++--- c10/core/TensorImpl.cpp | 2 +- c10/core/TensorImpl.h | 53 +- c10/test/core/DispatchKeySet_test.cpp | 373 ++++++++-- test/test_dispatch.py | 26 +- test/test_sparse.py | 6 +- tools/codegen/model.py | 62 +- torch/_python_dispatcher.py | 6 +- 20 files changed, 1748 insertions(+), 515 deletions(-) diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index 7f5517bc081..e9f5e7d26e1 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -28,8 +28,7 @@ constexpr auto kFunctorchWrappedTensors = DispatchKeySet({ constexpr auto kTensorSubclassLike = kFunctorchWrappedTensors | DispatchKeySet({ DispatchKey::Batched, - DispatchKey::SparseCPU, - DispatchKey::SparseCUDA, + DispatchKey::Sparse, DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA, DispatchKey::Meta, diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index b05f74259dc..225b6c934c0 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -43,7 +43,6 @@ inline bool variable_excluded_from_dispatch() { // Please read the comment in `VariableFallbackKernel.cpp` about the background of this change. 
return true; #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::impl::tls_local_dispatch_key_set().excluded_.has(DispatchKey::Autograd)); return c10::impl::tls_local_dispatch_key_set().excluded_.isSupersetOf(c10::autograd_dispatch_keyset); #endif } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp index a930edc2db6..9180d0d19e6 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp @@ -6,11 +6,52 @@ namespace c10 { void DispatchKeyExtractor::setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough) { + // (1) update nonFallthroughKeys_ if (has_fallthrough) { nonFallthroughKeys_ = nonFallthroughKeys_.remove(k); } else { nonFallthroughKeys_ = nonFallthroughKeys_.add(k); } + // (2) update nonFallthroughKeysPerBackend_ + if (isPerBackendFunctionalityKey(toFunctionalityKey(k))) { + // This is a per-backend functionality key. + // We need to figure out what the current backend is, + // and only update the bitset for that backend. + // subtracting 1 because the first backend should have index 0 (CPU), + // But the enum starts with BackendComponent::InvalidBit. + auto backend_idx = static_cast(toBackendComponent(k)) - 1; + TORCH_INTERNAL_ASSERT(backend_idx >= 0 && static_cast(backend_idx) < nonFallthroughKeysPerBackend_.size()); + if (has_fallthrough) { + nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].remove(k); + } else { + nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].add(k); + } + + // Set requiresBitsetPerBackend_ accordingly + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size() - 1)) { + if (nonFallthroughKeysPerBackend_[i] != nonFallthroughKeysPerBackend_[i+1]) { + requiresBitsetPerBackend_ = true; + return; + } + } + requiresBitsetPerBackend_ = false; + return; + } else { + // Otherwise, if a fallthrough is set for a functionality that isn't per backend, + // Then we update the fallthrough bitset for EVERY backend. + // TODO: we could probably optimize this by only lazily updating these values + // the first time that we see requiresBitsetPerBackend_ = true + // (which should almost never happen) + if (has_fallthrough) { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].remove(k); + } + } else { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].add(k); + } + } + } } std::string DispatchKeyExtractor::dumpState() const { diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 4d2e7d0d4bd..79ea44396bd 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -156,14 +156,24 @@ public: } }); // Keys that are fallthrough should be skipped - return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + if (requiresBitsetPerBackend_) { + auto backend_idx = ks.getBackendIndex(); + return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } } template DispatchKeySet getDispatchKeySetUnboxed(const Args&... 
args) const { auto ks = detail::multi_dispatch_key_set(args...); // Keys that are fallthrough should be skipped - return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + if (requiresBitsetPerBackend_) { + auto backend_idx = ks.getBackendIndex(); + return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]); + } else { + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); + } } void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough); @@ -193,7 +203,12 @@ private: explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse) : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse) - , nonFallthroughKeys_(DispatchKeySet::FULL) {} + , nonFallthroughKeys_(DispatchKeySet::FULL) + , requiresBitsetPerBackend_(false) { + for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { + nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL; + } + } // this is a bitset that has ones for each argument index which has to be // considered for dispatch. This avoids having to iterate over the stack @@ -205,8 +220,14 @@ private: // fallthrough c10::utils::bitset dispatch_arg_indices_reverse_; - // Set of keys for which the operator does NOT have fallthrough kernel. + // Set of functionality keys for which the operator does NOT have fallthrough kernel. DispatchKeySet nonFallthroughKeys_; + // Set of functionality keys for which the operator does NOT have fallthrough kernel, defined PER BACKEND. + // This is only needed if we know that the operator has a different set of fallthroughs defined for some backends. + std::array nonFallthroughKeysPerBackend_; + // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast path), + // or if we need to fall back to the slower path and check nonFallthroughKeysPerBackend_ + bool requiresBitsetPerBackend_; }; } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 3dccc4645a8..f2426f6bb1f 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -267,14 +267,15 @@ void Dispatcher::cleanup(const OperatorHandle& op, const OperatorName& op_name) RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, KernelFunction kernel, std::string debug) { std::lock_guard lock(mutex_); + auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); TORCH_CHECK( - !backendFallbackKernels_[static_cast(dispatchKey)].kernel.isValid(), + !backendFallbackKernels_[idx].kernel.isValid(), "Tried to register multiple backend fallbacks for the same dispatch key ", dispatchKey, "; previous registration ", - backendFallbackKernels_[static_cast(dispatchKey)].debug, ", new registration ", debug + backendFallbackKernels_[idx].debug, ", new registration ", debug ); // NB: inferred function schema is always nullptr for fallbacks, as fallbacks // cannot be unobxed - backendFallbackKernels_[static_cast(dispatchKey)] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); + backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); for (auto& op : operators_) { op.op.updateFallback(*this, dispatchKey); @@ -288,7 +289,8 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker void Dispatcher::deregisterFallback_(DispatchKey dispatchKey) { std::lock_guard lock(mutex_); - backendFallbackKernels_[static_cast(dispatchKey)] = {}; + auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); + 
backendFallbackKernels_[idx] = {}; for (auto& op : operators_) { op.op.updateFallback(*this, dispatchKey); diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 14ffa2f94c9..8108c3c1928 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -291,7 +291,7 @@ private: // Map from namespace to debug string (saying, e.g., where the library was defined) ska::flat_hash_map libraries_; - std::array(DispatchKey::NumDispatchKeys)> backendFallbackKernels_; + std::array backendFallbackKernels_; std::unique_ptr listeners_; std::mutex mutex_; @@ -531,8 +531,7 @@ C10_DISPATCHER_INLINE_UNLESS_MOBILE Return Dispatcher::call(const TypedOperatorH detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5 auto dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor() .template getDispatchKeySetUnboxed(args...); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKeySet.highestPriorityTypeId())); - const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet.highestPriorityTypeId()); + const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING // By default, when there're no high-frequency or non-sampled callbacks, // RecordFunction is pre-sampled as a perf optimization; @@ -553,7 +552,7 @@ template inline Return Dispatcher::redispatch(const TypedOperatorHandle& op, DispatchKeySet currentDispatchKeySet, Args... args) const { detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5 // do not use RecordFunction on redispatch - const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet.highestPriorityTypeId()); + const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet); return kernel.template call(op, currentDispatchKeySet, std::forward(args)...); } @@ -561,7 +560,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const // note: this doesn't need the mutex because write operations on the list keep iterators intact. const auto& entry = op.operatorDef_->op; auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); - const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId()); + const auto& kernel = entry.lookup(dispatchKeySet); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING bool pre_sampled = false; if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { @@ -593,7 +592,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const { // note: this doesn't need the mutex because write operations on the list keep iterators intact. const auto& entry = op.operatorDef_->op; - const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId()); + const auto& kernel = entry.lookup(dispatchKeySet); return kernel.callBoxed(op, dispatchKeySet, stack); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index d4d997fde69..06165baf183 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -283,7 +283,7 @@ std::pair OperatorEntry::computeDispatchTab } // 3. 
Backend fallback - auto dispatch_ix = static_cast(dispatch_key); + auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key); if (dispatcher.backendFallbackKernels_[dispatch_ix].kernel.isValid()) { return {dispatcher.backendFallbackKernels_[dispatch_ix], "backend fallback"}; } @@ -299,10 +299,7 @@ std::pair OperatorEntry::computeDispatchTab // or alias keys and their associated keysets). // This function should be considered a private helper for updateDispatchTable_() void OperatorEntry::updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) { - const auto dispatch_ix = c10::getDispatchTableIndexForDispatchKey(dispatch_key); - if (C10_UNLIKELY(dispatch_ix == -1)) { - return; - } + const auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key); dispatchTable_[dispatch_ix] = computeDispatchTableEntry(dispatcher, dispatch_key); dispatchKeyExtractor_.setOperatorHasFallthroughForKey(dispatch_key, dispatchTable_[dispatch_ix].isFallthrough()); } @@ -329,8 +326,12 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp } // Note [Refresh Runtime Autograd entries in dispatchTable_] // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3). + // In theory, we should only have to check if the given runtime key has "dense" functionality, + // e.g. DispatchKey::CPU (which is composed of DispatchKey::Dense and BackendComponent::CPUBit). + // However, there are some backends that should be included in this set that don't have the dense key set. + // E.g. DispatchKey::Meta, DispatchKey::ORT. if (c10::isBackendDispatchKey(dispatch_key)) { - DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key); + DispatchKey autograd_key = getAutogradKeyFromBackend(toBackendComponent(dispatch_key)); updateDispatchTableEntry_(dispatcher, autograd_key); } } @@ -357,8 +358,9 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) // catchAll. After catchAllKernel_ is removed, Undefined now can get a kernel from either CompositeExplicitAutograd // or CompositeImplicitAutograd alias key so that we don't break the support. Ideally isIncludedInAlias(Undefined, CompositeImplicitAutograd) // should return true, it returns false because Undefined cannot be represented in a DispatchKeySet. 
- for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { - updateDispatchTable_(dispatcher, static_cast(iter)); + updateDispatchTable_(dispatcher, DispatchKey::Undefined); + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { + updateDispatchTable_(dispatcher, k); } } @@ -371,9 +373,10 @@ void OperatorEntry::checkInvariants() const { for (const auto& kv : kernels_) { TORCH_INTERNAL_ASSERT(kv.second.size() > 0, dumpState()); } - for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { - auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), static_cast(iter)); - TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[iter]), + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { + auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), k); + auto idx = getDispatchTableIndexForDispatchKey(k); + TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[idx]), "Canonical state\n~~~~~~~~~~~\n", dumpState(), "\n\n" "Computed table:\n~~~~~~~~~~~\n", dumpComputedTable()); } @@ -384,7 +387,8 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << "["; bool has_kernels = false; - for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { + auto iter = getDispatchTableIndexForDispatchKey(k); if (!dispatchTable_[iter].isValid()) { continue; } @@ -443,8 +447,12 @@ void OperatorEntry::reportError(DispatchKey dispatchKey) const { // updateDispatchTableFull_ would update the dispatch table to be) std::string OperatorEntry::dumpComputedTable() const { std::ostringstream oss; - for (uint8_t i = 0; i < static_cast(DispatchKey::NumDispatchKeys); i++) { - auto k = static_cast(i); + // Need to handle Undefined separately, because its a runtime key that can't be represented + // in a DispatchKeySet. + std::vector runtime_keys = {DispatchKey::Undefined}; + for (auto k : DispatchKeySet(DispatchKeySet::FULL)) runtime_keys.push_back(k); + + for (auto k : runtime_keys) { auto kernel_prov = computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k); if (kernel_prov.first.kernel.isValid()) { oss << toString(k) << ": " diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index d98bd6bc690..d86f0cfef3e 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -173,11 +173,8 @@ public: [[noreturn]] void reportError(DispatchKey dispatchKey) const; - const KernelFunction& lookup(DispatchKey k) const { - const auto idx = getDispatchTableIndexForDispatchKey(k); - if (C10_UNLIKELY(idx == -1)) { - reportError(k); - } + const KernelFunction& lookup(DispatchKeySet ks) const { + const auto idx = ks.getDispatchTableIndexForDispatchKeySet(); const auto& kernel = dispatchTable_[idx]; // A valid kernel *always* has a boxed kernel and *may* have an // unboxed kernel. However, we typically do unboxed calls in at:: @@ -187,7 +184,7 @@ public: // in the common case. 
if (C10_UNLIKELY(!kernel.isValidUnboxed())) { if (!kernel.isValid()) { - reportError(k); + reportError(ks.highestPriorityTypeId()); } } return kernel; @@ -211,7 +208,7 @@ private: OperatorName name_; c10::optional schema_; - std::array dispatchTable_; + std::array dispatchTable_; DispatchKeyExtractor dispatchKeyExtractor_; // kernels_ stores all registered kernels for the corresponding dispatch key diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 0a3f9236b75..970e7949131 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -591,7 +591,7 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) { void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) { auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options() - .kernel(c10::getAutogradKeyFromBackend(key)) + .kernel(c10::getAutogradKeyFromBackend(toBackendComponent(key))) .kernel(DispatchKey::Autograd)); auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""}); @@ -1791,22 +1791,22 @@ TEST(NewOperatorRegistrationTest, dispatchAutogradPrecedence) { TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool sparsecpu_called, math_called = false; + bool fpga_called, math_called = false; auto m = MAKE_TORCH_LIBRARY(test); - m.def("fn", torch::dispatch(c10::DispatchKey::SparseCPU, [&](const Tensor& x) { sparsecpu_called = true; return x; })); + m.def("fn", torch::dispatch(c10::DispatchKey::FPGA, [&](const Tensor& x) { fpga_called = true; return x; })); m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); auto op = Dispatcher::singleton().findSchema({"test::fn", ""}); ASSERT_TRUE(op.has_value()); { - callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU)); - ASSERT_TRUE(sparsecpu_called); + callOp(*op, dummyTensor(c10::DispatchKey::FPGA)); + ASSERT_TRUE(fpga_called); } { expectThrows([&] { - callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU, /*requires_grad=*/true)); + callOp(*op, dummyTensor(c10::DispatchKey::FPGA, /*requires_grad=*/true)); }, "test::fn has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther."); } } @@ -1849,18 +1849,15 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) { } { - // TODO(#43908): currently this will fallthrough AutogradPrivateUse1 then call catchall kernel - // at AutogradCPU, while backend extenders are indeed expecting to call PrivateUse1 kernel. - // This confusing behavior is caused by we registering fallthrough as backend fallback for - // Autograd keys. Note users could always work around this by registering the same kernel to - // AutogradPrivateUse1 as shown below until we support it. 
auto op = Dispatcher::singleton().findOp({"test::fn", ""}); ASSERT_TRUE(op.has_value()); catchall_called = false; + privateuse1_called = false; callOp(*op, dummyTensor(c10::DispatchKey::PrivateUse1, /*requires_grad=*/true), dummyTensor(c10::DispatchKey::CPU, /*requires_grad=*/true)); - ASSERT_TRUE(catchall_called); + ASSERT_FALSE(catchall_called); + ASSERT_TRUE(privateuse1_called); } m.impl("fn", c10::DispatchKey::AutogradPrivateUse1, [&](const Tensor& x, const Tensor& y) { privateuse1_called = true; return x; }); @@ -1876,6 +1873,27 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) { } } +TEST(NewOperatorRegistrationTest, registerCompositeImplicitAutogradWithCPUKernel_andCallAutogradOtherKernel_callsComposite) { + bool math_called = false; + bool cpu_called = false; + auto m = MAKE_TORCH_LIBRARY(test); + m.def("fn(Tensor dummy) -> Tensor"); + m.impl("fn", c10::DispatchKey::CPU, [&](const Tensor& x) { cpu_called = true; return x; }); + m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); + + auto op = Dispatcher::singleton().findSchema({"test::fn", ""}); + ASSERT_TRUE(op.has_value()); + + { + math_called = cpu_called = false; + // Meta should redispatch to the AutogradOther backend, + // which the composite kernel should be registered to. + callOp(*op, dummyTensor(c10::DispatchKey::Meta, /*requires_grad=*/true)); + ASSERT_TRUE(math_called); + ASSERT_FALSE(cpu_called); + } +} + TEST(NewOperatorRegistrationTest, dispatchMultiple) { bool cpu_called = false; bool cuda_called = false; diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index 7d2f9e7fcb6..b95558563bf 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -1,14 +1,47 @@ #include +#include #include namespace c10 { +const char* toString(BackendComponent t) { + switch (t) { + case BackendComponent::CPUBit: + return "CPUBit"; + case BackendComponent::CUDABit: + return "CUDABit"; + case BackendComponent::HIPBit: + return "HIPBit"; + case BackendComponent::XLABit: + return "XLABit"; + case BackendComponent::LazyBit: + return "LazyBit"; + case BackendComponent::XPUBit: + return "XPUBit"; + case BackendComponent::MLCBit: + return "MLCBit"; + case BackendComponent::HPUBit: + return "HPUBit"; + case BackendComponent::VEBit: + return "VEBit"; + case BackendComponent::PrivateUse1Bit: + return "PrivateUse1Bit"; + case BackendComponent::PrivateUse2Bit: + return "PrivateUse2Bit"; + case BackendComponent::PrivateUse3Bit: + return "PrivateUse3Bit"; + case BackendComponent::InvalidBit: + return "InvalidBit"; + default: + return "UNKNOWN_BACKEND_BIT"; + } +} + const char* toString(DispatchKey t) { switch (t) { case DispatchKey::Undefined: return "Undefined"; - case DispatchKey::CPU: return "CPU"; case DispatchKey::CUDA: @@ -101,8 +134,6 @@ const char* toString(DispatchKey t) { return "AutogradMLC"; case DispatchKey::AutogradHPU: return "AutogradHPU"; - case DispatchKey::AutogradNestedTensor: - return "AutogradNestedTensor"; case DispatchKey::AutogradPrivateUse1: return "AutogradPrivateUse1"; case DispatchKey::AutogradPrivateUse2: @@ -111,6 +142,8 @@ const char* toString(DispatchKey t) { return "AutogradPrivateUse3"; case DispatchKey::AutogradOther: return "AutogradOther"; + case DispatchKey::AutogradNestedTensor: + return "AutogradNestedTensor"; case DispatchKey::ZeroTensor: return "ZeroTensor"; @@ -168,6 +201,15 @@ const char* toString(DispatchKey t) { case DispatchKey::FuncTorchBatched: return "FuncTorchBatched"; + case DispatchKey::Dense: + 
return "Dense"; + case DispatchKey::Quantized: + return "Quantized"; + case DispatchKey::Sparse: + return "Sparse"; + case DispatchKey::AutogradFunctionality: + return "AutogradFunctionality"; + default: return "UNKNOWN_TENSOR_TYPE_ID"; } @@ -176,76 +218,37 @@ const char* toString(DispatchKey t) { std::ostream& operator<<(std::ostream& str, DispatchKey rhs) { return str << toString(rhs); } +std::ostream& operator<<(std::ostream& str, BackendComponent rhs) { + return str << toString(rhs); +} -// for a given backend key, return the associated autograd key. -// for non-backend keys, return AutogradOther as a default. -// Note: it's convenient and fast to return a default here rather than (say) -// returning an optional, or throwing. But it makes callers -// responsible for either a) enforcing the invariant that only backend keys -// be passed as arguments, or b) interpreting our return value carefully. -// -DispatchKey getAutogradKeyFromBackend(DispatchKey t) { - switch (t) { - case DispatchKey::CPU: - return DispatchKey::AutogradCPU; - case DispatchKey::XPU: - return DispatchKey::AutogradXPU; - case DispatchKey::CUDA: - return DispatchKey::AutogradCUDA; - case DispatchKey::XLA: - return DispatchKey::AutogradXLA; - case DispatchKey::Lazy: - return DispatchKey::AutogradLazy; - case DispatchKey::MLC: - return DispatchKey::AutogradMLC; - case DispatchKey::HPU: - return DispatchKey::AutogradHPU; - case DispatchKey::NestedTensor: - return DispatchKey::AutogradNestedTensor; - case DispatchKey::PrivateUse1: - return DispatchKey::AutogradPrivateUse1; - case DispatchKey::PrivateUse2: - return DispatchKey::AutogradPrivateUse2; - case DispatchKey::PrivateUse3: - return DispatchKey::AutogradPrivateUse3; - default: - return DispatchKey::AutogradOther; - } +DispatchKey getAutogradKeyFromBackend(BackendComponent k) { + // We want this to return an autograd key. We're relying on the fact that + // getAutogradRelatedKeySetFromBackend returns an autograd key + + // ADInplaceOrView, and autograd has higher precedence. The core mapping from + // backend -> autograd key lives in `getAutogradRelatedKeySetFromBackend` + // instead of here for performance. `getAutogradRelatedKeySetFromBackend` is a + // hotpath function, and we want to make sure that it doesn't have to + // construct any DispatchKeySets at runtime. 
+ return getAutogradRelatedKeySetFromBackend(k).highestPriorityTypeId(); } c10::DispatchKey parseDispatchKey(const std::string& k) { static std::unordered_map key_map = { {"Undefined", c10::DispatchKey::Undefined}, - {"CPU", c10::DispatchKey::CPU}, - {"CUDA", c10::DispatchKey::CUDA}, - {"HIP", c10::DispatchKey::HIP}, + {"Dense", c10::DispatchKey::Dense}, {"FPGA", c10::DispatchKey::FPGA}, {"ORT", c10::DispatchKey::ORT}, - {"XLA", c10::DispatchKey::XLA}, - {"MLC", c10::DispatchKey::MLC}, {"Vulkan", c10::DispatchKey::Vulkan}, {"Metal", c10::DispatchKey::Metal}, - {"XPU", c10::DispatchKey::XPU}, - {"HPU", c10::DispatchKey::HPU}, {"VE", c10::DispatchKey::VE}, - {"Lazy", c10::DispatchKey::Lazy}, {"Meta", c10::DispatchKey::Meta}, - {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, - {"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA}, - {"QuantizedXPU", c10::DispatchKey::QuantizedXPU}, + {"Quantized", c10::DispatchKey::Quantized}, {"CustomRNGKeyId", c10::DispatchKey::CustomRNGKeyId}, {"MkldnnCPU", c10::DispatchKey::MkldnnCPU}, - {"SparseCPU", c10::DispatchKey::SparseCPU}, - {"SparseCUDA", c10::DispatchKey::SparseCUDA}, - {"SparseHIP", c10::DispatchKey::SparseHIP}, - {"SparseXPU", c10::DispatchKey::SparseXPU}, - {"SparseVE", c10::DispatchKey::SparseVE}, + {"Sparse", c10::DispatchKey::Sparse}, {"SparseCsrCPU", c10::DispatchKey::SparseCsrCPU}, {"SparseCsrCUDA", c10::DispatchKey::SparseCsrCUDA}, - {"NestedTensor", c10::DispatchKey::NestedTensor}, - {"PrivateUse1", c10::DispatchKey::PrivateUse1}, - {"PrivateUse2", c10::DispatchKey::PrivateUse2}, - {"PrivateUse3", c10::DispatchKey::PrivateUse3}, {"BackendSelect", c10::DispatchKey::BackendSelect}, {"Python", c10::DispatchKey::Python}, {"Named", c10::DispatchKey::Named}, @@ -256,17 +259,8 @@ c10::DispatchKey parseDispatchKey(const std::string& k) { c10::DispatchKey::FuncTorchDynamicLayerBackMode}, {"ADInplaceOrView", c10::DispatchKey::ADInplaceOrView}, {"AutogradOther", c10::DispatchKey::AutogradOther}, - {"AutogradCPU", c10::DispatchKey::AutogradCPU}, - {"AutogradCUDA", c10::DispatchKey::AutogradCUDA}, - {"AutogradXLA", c10::DispatchKey::AutogradXLA}, - {"AutogradLazy", c10::DispatchKey::AutogradLazy}, - {"AutogradXPU", c10::DispatchKey::AutogradXPU}, - {"AutogradMLC", c10::DispatchKey::AutogradMLC}, - {"AutogradHPU", c10::DispatchKey::AutogradHPU}, + {"AutogradFunctionality", c10::DispatchKey::AutogradFunctionality}, {"AutogradNestedTensor", c10::DispatchKey::AutogradNestedTensor}, - {"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1}, - {"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2}, - {"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3}, {"Tracer", c10::DispatchKey::Tracer}, {"AutocastCPU", c10::DispatchKey::AutocastCPU}, {"AutocastCUDA", c10::DispatchKey::AutocastCUDA}, @@ -280,6 +274,41 @@ c10::DispatchKey parseDispatchKey(const std::string& k) { {"TESTING_ONLY_GenericWrapper", c10::DispatchKey::TESTING_ONLY_GenericWrapper}, {"TESTING_ONLY_GenericMode", c10::DispatchKey::TESTING_ONLY_GenericMode}, + + {"CPU", c10::DispatchKey::CPU}, + {"CUDA", c10::DispatchKey::CUDA}, + {"HIP", c10::DispatchKey::HIP}, + {"XLA", c10::DispatchKey::XLA}, + {"MLC", c10::DispatchKey::MLC}, + {"XPU", c10::DispatchKey::XPU}, + {"HPU", c10::DispatchKey::HPU}, + {"Lazy", c10::DispatchKey::Lazy}, + {"NestedTensor", c10::DispatchKey::NestedTensor}, + {"PrivateUse1", c10::DispatchKey::PrivateUse1}, + {"PrivateUse2", c10::DispatchKey::PrivateUse2}, + {"PrivateUse3", c10::DispatchKey::PrivateUse3}, + + {"QuantizedCPU", 
c10::DispatchKey::QuantizedCPU}, + {"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA}, + {"QuantizedXPU", c10::DispatchKey::QuantizedXPU}, + + {"SparseCPU", c10::DispatchKey::SparseCPU}, + {"SparseCUDA", c10::DispatchKey::SparseCUDA}, + {"SparseHIP", c10::DispatchKey::SparseHIP}, + {"SparseXPU", c10::DispatchKey::SparseXPU}, + {"SparseVE", c10::DispatchKey::SparseVE}, + + {"AutogradCPU", c10::DispatchKey::AutogradCPU}, + {"AutogradCUDA", c10::DispatchKey::AutogradCUDA}, + {"AutogradXLA", c10::DispatchKey::AutogradXLA}, + {"AutogradLazy", c10::DispatchKey::AutogradLazy}, + {"AutogradXPU", c10::DispatchKey::AutogradXPU}, + {"AutogradMLC", c10::DispatchKey::AutogradMLC}, + {"AutogradHPU", c10::DispatchKey::AutogradHPU}, + {"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1}, + {"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2}, + {"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3}, + {"Autograd", c10::DispatchKey::Autograd}, {"CompositeImplicitAutograd", c10::DispatchKey::CompositeImplicitAutograd}, diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index 1bb8268e2bd..b5860bd608c 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -9,20 +9,98 @@ namespace c10 { +// Semantically, each value of BackendComponent identifies a "backend" for our +// dispatch. Some functionalities that we may dispatch to are allowed to +// register different handlers for each backend. The BackendComponent is then +// used to figure out which backend implementation to dispatch to. + +// In implementation terms, the backend component identifies a specific "bit" in +// a DispatchKeySet. The bits in the DispatchKeySet are split between the bottom +// ~12 "BackendComponent" bits, while the remaining upper bits are assigned to +// functionalities. When we encounter a functionality bit that is known to be +// customizeable per-backend, then we also look at the lower BackendComponent +// bits and take the highest bit to determine which backend's implementation to +// use. + +enum class BackendComponent : uint8_t { + + // A "backend" is colloquially used to refer to handlers for dispatch + // which actually implement the numerics of an operation in question. + // + // Due to the nature of the enum, these backends are specified in + // an ordered way, but for most backends this order is not semantically + // meaningful (e.g., it's valid to reorder these backends without changing + // semantics). The only situation when backend ordering is meaningful + // is when the backend participates in multiple dispatch with another + // backend; e.g., CPU and CUDA (cuda must have higher priority). + + // These keys don't correspond to individual kernels. + // Instead, they represent the backends that are allowed to override specific + // pieces of functionality: + // - dense kernels (e.g. DispatchKey::CPU) + // - sparse kernels (e.g. DispatchKey::SparseCPU) + // - quantized kernels (e.g. DispatchKey::QuantizedCPU) + // - autograd kernels (e.g. DispatchKey::AutogradCPU) + // We reserve space in the runtime operator table for this full cross product + // of + // [backends in this enum] x [keys below that are explicitly marked as having + // per-backend functionality] + + InvalidBit = 0, + CPUBit, + CUDABit, + HIPBit, + XLABit, + MLCBit, + XPUBit, + HPUBit, + VEBit, + LazyBit, + PrivateUse1Bit, + PrivateUse2Bit, + PrivateUse3Bit, + // Define an alias to represent end of backend dispatch keys. + // If you add new backend keys after PrivateUse3, please also update it here. 
+ // (But you shouldn't: private use keys should have higher precedence than + // all built-in keys) + EndOfBackendKeys = PrivateUse3Bit, +}; + // Semantically, a dispatch key identifies a possible "level" in our -// dispatch, for which a handler may be registered. Traditional -// backends like CPU and CUDA get dispatch keys; however, so do -// "wrapping" layers like Variable (for autograd handling). +// dispatch, for which a handler may be registered. Each handler corresponds +// to a type of functionality. // // In implementation terms, the dispatch key identifies a specific "bit" in a // DispatchKeySet. Higher bit indexes get handled by dispatching first (because // we "count leading zeros" when we extract the highest priority dispatch // key.) // +// Note [DispatchKey Classification] +// This enum actually contains several types of keys, which are explained +// in more detail further down: +// (1) non-customizable backends (e.g. FPGA) +// (2) non-customizable functionalities (e.g. Functionalize) +// (3) functionalized that are customizable per backend (e.g. Dense, Sparse, +// AutogradFunctionality) (4) per-backend instances of customizable +// functionalities (e.g. CPU, SparseCPU, AutogradCPU) (5) alias keys (e.g. +// CompositeImplicitAutograd) +// +// Of the categories above, it's important to note: +// (a) which keys are assigned individual bits in a DispatchKeySet +// (b) which keys are assigned individual slots in the runtime operator table +// ("Runtime keys") +// +// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet. +// (1), (2) and (4) all get their own dedicated slots in the runtime operator +// table. + +// See Note [DispatchKeySet Internal Representation] for more details. +// // NOTE: Keep the list in sync with `DispatchKey` in tools/codegen/model.py -enum class DispatchKey : uint8_t { +enum class DispatchKey : uint16_t { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // This is not a "real" tensor id, but it exists to give us a "nullopt" + // This is not a "real" functionality, but it exists to give us a "nullopt" // element we can return for cases when a DispatchKeySet contains no elements. // You can think a more semantically accurate definition of DispatchKey is: // @@ -38,24 +116,31 @@ enum class DispatchKey : uint8_t { // this will get eliminated, but for now it's convenient) CatchAll = Undefined, - // ~~~~~~~~~~~~~~~~~~~~~~~~~~ BACKENDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // A "backend" is colloquially used to refer to handlers for dispatch - // which actually implement the numerics of an operation in question. + // ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ // + // Every value in the enum (up to EndOfFunctionalityKeys) + // corresponds to an individual "functionality" that can be dispatched to. + // This is represented in the DispatchKeySet by assigning each of these enum + // values + // to each of the remaining (64 - len(BackendComponent)) bits. // - // Due to the nature of the enum, these backends are specified in - // an ordered way, but for most backends this order is not semantically - // meaningful (e.g., it's valid to reorder these backends without changing - // semantics). The only situation when backend ordering is meaningful - // is when the backend participates in multiple dispatch with another - // backend; e.g., CPU and SparseCPU (sparse must have - // higher priority). 
+ // Most of these functionalities have a single handler assigned to them, + // making them "runtime keys". + // That map to a single slot in the runtime operator table. + // + // A few functionalities are allowed to be customizable per backend. + // See [Note: Per-Backend Functionality Dispatch Keys] for details. + + // See [Note: Per-Backend Functionality Dispatch Keys] + Dense, + + // Below are non-extensible backends. + // These are backends that currently don't have their own overrides for + // Autograd/Sparse/Quantized kernels, + // and we therefore don't waste space in the runtime operator table allocating + // space for them. + // If any of these backends ever need to customize, e.g., Autograd, then we'll + // need to add a DispatchKey::*Bit for them. - // Here are backends which you think of as traditionally specifying - // how to implement operations on some device. - CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp - CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp - HIP, // NB: I think this is not actually used, due to Note [Masquerading as - // CUDA] FPGA, // Xilinx support lives out of tree at // https://gitlab.com/pytorch-complex/vitis_kernels @@ -67,14 +152,8 @@ enum class DispatchKey : uint8_t { // - aten/src/ATen/test/extension_backend_test.cpp ORT, - XLA, // lives out of tree at https://github.com/pytorch/xla - MLC, // lives out of tree at https://github.com/pytorch/MLCompute Vulkan, Metal, - XPU, // For out of tree Intel's heterogeneous computing plug-in - HPU, // For out of tree & closed source integration of HPU / Habana - VE, // For out of tree & closed source integration of SX-Aurora / NEC - Lazy, // For lazy tensor backends // A meta tensor is a tensor without any data associated with it. (They // have also colloquially been referred to as tensors on the "null" device). @@ -83,11 +162,8 @@ enum class DispatchKey : uint8_t { // tensor with the output shape and dtype, but wouldn't actually add anything. Meta, - // Here are backends which specify more specialized operators - // based on the dtype of the tensor. - QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp - QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp - QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in + // See [Note: Per-Backend Functionality Dispatch Keys] + Quantized, // This backend is to support custom RNGs; it lets you go // to a different kernel if you pass in a generator that is not a @@ -106,31 +182,29 @@ enum class DispatchKey : uint8_t { // the corresponding dense tensors, and must be handled before them. MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp // NB: not to be confused with MKLDNN, which is Caffe2 only - SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp - SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp - SparseHIP, // TODO: I think this is not actually used, due to Note - // [Masquerading as CUDA] - SparseXPU, // For out of tree Intel's heterogeneous computing plug-in - SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC + + // See [Note: Per-Backend Functionality Dispatch Keys] + Sparse, SparseCsrCPU, SparseCsrCUDA, + // Note [Non-Customizable Backend Keys] + // Every key above here is considered a "non-customizable backend". + // These are backends that will work correctly with autograd, but + // but currently don't require separate implementations + // for autograd sparse or quantized kernels. 
+ // Any new backends that don't need to be customized should go above here. + // If an existing backend needs to e.g. override autograd, then we can + // consider promoting it into the "BackendComponent" enum + // + // For all intents and purposes from the perspective of DispatchKeySet, + // "non-customizable backend" keys are treated the same way + // as other functionality keys + EndOfNonCustomizableBackends = SparseCsrCUDA, + NestedTensor, // lives out of tree at https://github.com/pytorch/nestedtensor - // Here are reserved backends for user-defined backends, see Note [Private use - // DispatchKey] - // To see some example about how to use this, check out ORT - PrivateUse1, - PrivateUse2, - PrivateUse3, - - // Define an alias key to represent end of backend dispatch keys. - // If you add new backend keys after PrivateUse3, please also update it here. - // (But you shouldn't: private use keys should have higher precedence than - // all built-in keys) - EndOfBackendKeys = PrivateUse3, - // In some situations, it is not immediately obvious what the correct // backend for function is, because the function in question doesn't // have any "tensor" arguments. In this case, a BackendSelect function @@ -233,20 +307,18 @@ enum class DispatchKey : uint8_t { // AutogradOther key. We can add specific autograd key for those backends // upon request. AutogradOther, - AutogradCPU, - AutogradCUDA, - AutogradXLA, - AutogradLazy, - AutogradXPU, - AutogradMLC, - AutogradHPU, - AutogradNestedTensor, // lives out of tree at + + // See [Note: Per-Backend Functionality Dispatch Keys] + AutogradFunctionality, + + // NestedTensor is an example of something that isn't a "real backend" + // (because it mostly consists of redispatching kernels) + // but it would like to override autograd functionality in C++. + // We can handle cases like this by adding an extra functionality key + // exclusively for handling autograd for NestedTensor. + // lives out of tree at // https://github.com/pytorch/nestedtensor - // Here are some reserved pre-autograd keys for user-defined backends, see - // Note [Private use DispatchKey] - AutogradPrivateUse1, - AutogradPrivateUse2, - AutogradPrivateUse3, + AutogradNestedTensor, Tracer, @@ -299,9 +371,100 @@ enum class DispatchKey : uint8_t { TESTING_ONLY_GenericMode, // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - NumDispatchKeys, // Sentinel, end of runtime keys. + EndOfFunctionalityKeys, // End of functionality keys. + + // ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ // + // Here are backends which you think of as traditionally specifying + // how to implement operations on some device. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] 
+ StartOfDenseBackends, + CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp + CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp + HIP, // NB: I think this is not actually used, due to Note [Masquerading as + // CUDA] + XLA, // lives out of tree at https://github.com/pytorch/xla + MLC, // lives out of tree at https://github.com/pytorch/MLCompute + XPU, // For out of tree Intel's heterogeneous computing plug-in + HPU, // For out of tree & closed source integration of HPU / Habana + VE, // For out of tree & closed source integration of SX-Aurora / NEC + Lazy, // For lazy tensor backends + // Here are reserved backends for user-defined backends, see Note [Private use + // DispatchKey] + // To see some example about how to use this, check out ORT + PrivateUse1, + PrivateUse2, + PrivateUse3, + EndOfDenseBackends = PrivateUse3, + + // ~~~~~~~~~~~~~~ "Quantized" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~ // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfQuantizedBackends, + QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp + QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp + _QuantizedHIP, + _QuantizedXLA, + _QuantizedMLC, + QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in + _QuantizedHPU, + _QuantizedVE, + _QuantizedLazy, + _QuantizedPrivateUse1, + _QuantizedPrivateUse2, + _QuantizedPrivateUse3, + EndOfQuantizedBackends = _QuantizedPrivateUse3, + + // ~~~~~~~~~~~~~~ "Sparse" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~ // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfSparseBackends, + SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp + SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp + SparseHIP, // TODO: I think this is not actually used, due to Note + // [Masquerading as CUDA] + _SparseXLA, + _SparseMLC, + SparseXPU, // For out of tree Intel's heterogeneous computing plug-in + _SparseHPU, + SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC + _SparseLazy, + _SparsePrivateUse1, + _SparsePrivateUse2, + _SparsePrivateUse3, + EndOfSparseBackends = _SparsePrivateUse3, + + // ~~~~~~~~~~~~~~ "Autograd" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~ // + // keys starting with an _ are not currently used, + // but are needed to ensure that every backend is indexed correctly. + + // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] + StartOfAutogradBackends, + AutogradCPU, + AutogradCUDA, + _AutogradHIP, + AutogradXLA, + AutogradMLC, + AutogradXPU, + AutogradHPU, + _AutogradVE, + AutogradLazy, + // Here are some reserved pre-autograd keys for user-defined backends, see + // Note [Private use DispatchKey] + AutogradPrivateUse1, + AutogradPrivateUse2, + AutogradPrivateUse3, + EndOfAutogradBackends = AutogradPrivateUse3, + // If we add a new per-backend functionality key that has higher priority + // than Autograd, then this key should be updated. + EndOfRuntimeBackendKeys = EndOfAutogradBackends, // ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // Note [Alias Dispatch Keys] // Alias dispatch keys are synthetic dispatch keys which map to multiple // runtime dispatch keys. 
Alisa keys have precedence, but they are always // lower precedence than runtime keys. You can register a kernel to an @@ -321,6 +484,7 @@ enum class DispatchKey : uint8_t { // Define an alias key to represent end of alias dispatch keys. // If you add new alias keys after Autograd, please also update it here. + StartOfAliasKeys = Autograd, EndOfAliasKeys = CompositeExplicitAutograd, // // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // @@ -360,54 +524,83 @@ enum class DispatchKey : uint8_t { // built-in autograd formulas for operators are not appropriate. static_assert( - static_cast(DispatchKey::NumDispatchKeys) < 64, - "DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries"); + (static_cast(BackendComponent::EndOfBackendKeys) + + static_cast(DispatchKey::EndOfFunctionalityKeys)) <= 64, + "The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)" + " both map to backend and functionality bits" + " into a 64-bit bitmask; you must have less than 64 total entries between them"); -#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) -/** - * The method below maps the dispatch key in the enum DispatchKey to an - * integer index in the dispatchTable_ array in OperatorEntry. The array - * is trimmed for mobile to reduce peak memory usage since it's - * unnecessary to reserve additional space for dispatch keys that will - * never be used on mobile. - */ -C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) { - switch (dk) { - case DispatchKey::Undefined: - return 0; - case DispatchKey::CPU: - return 1; - case DispatchKey::QuantizedCPU: - return 2; - case DispatchKey::SparseCPU: - return 3; - case DispatchKey::BackendSelect: - return 4; - case DispatchKey::ADInplaceOrView: - return 5; - case DispatchKey::AutogradOther: - return 6; - case DispatchKey::AutogradCPU: - return 7; - case DispatchKey::NumDispatchKeys: // Sentinel, end of runtime keys. - return 8; - default: - return -1; +// Check if a DispatchKey is an alias mapping to other runtime keys. +constexpr bool isAliasDispatchKey(DispatchKey k) { + return k >= DispatchKey::StartOfAliasKeys && k <= DispatchKey::EndOfAliasKeys; +} + +// [Note: Per-Backend Functionality Dispatch Keys] +// Check if a DispatchKey is a per-backend functionality key +// Any functionalities that can be customized per-backend should be added here. +// These keys correspond to functionalities that can be customized indivually +// per backend. While they only take up one bit in the `DispatchKeySet` bitset, +// they map to (# backends) slots in the operator table. +// Each of these keys also has a separate set of "runtime keys" in the dispatch +// key enum, per backend, which *do* map to the individual operator table slots. +// For example, the "Sparse" key maps to an individual bit in the +// DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual +// slots in the runtime operator table. + +constexpr bool isPerBackendFunctionalityKey(DispatchKey k) { + if (k == DispatchKey::Dense || k == DispatchKey::Quantized || + k == DispatchKey::Sparse || k == DispatchKey::AutogradFunctionality) { + return true; + } else { + return false; } } -#else -/** - * For the server use-case, make this a simple pass-through. - */ -C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) { - return static_cast(dk); + +// Note that this includes Undefined in the total count. +// BUT EndOfFunctionalityKeys is its own (placeholder) key. +// e.g. 
Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3. +// In the above example, there are 3 total functionality keys. +constexpr uint8_t num_functionality_keys = + static_cast(DispatchKey::EndOfFunctionalityKeys); + +// Note [No More Than 16 Backends] +// Search for this note to find places in the code where the "no more than 16 +// backends" invariant is baked in. +static_assert( + static_cast(BackendComponent::EndOfBackendKeys) <= 16, + "BackendComponent currently only supports <= 16 backends. If we really need to extend this, \ +there are a few places where this invariant is baked in"); + +constexpr uint8_t numPerBackendFunctionalityKeys() { + uint8_t count = 0; + for (uint8_t k = 0; k <= num_functionality_keys; ++k) { + if (isPerBackendFunctionalityKey(static_cast(k))) + ++count; + } + return count; } + +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) +// See [Note: Trimmed Mobile Dispatch Keys] +constexpr uint8_t num_backends = 1; // Only CPU +constexpr uint16_t num_runtime_entries = 8; +#else +constexpr uint8_t num_backends = + static_cast(BackendComponent::EndOfBackendKeys); +constexpr uint16_t num_runtime_entries = num_functionality_keys + + (numPerBackendFunctionalityKeys() * (num_backends - 1)); #endif -C10_API const char* toString(DispatchKey); -C10_API std::ostream& operator<<(std::ostream&, DispatchKey); +// See Note [No More Than 16 Backends] +constexpr uint16_t full_backend_mask = + (static_cast(1) << num_backends) - 1; -C10_API DispatchKey getAutogradKeyFromBackend(DispatchKey t); +C10_API const char* toString(DispatchKey); +C10_API const char* toString(BackendComponent); +C10_API std::ostream& operator<<(std::ostream&, DispatchKey); +C10_API std::ostream& operator<<(std::ostream&, BackendComponent); + +C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k); // Parses a string into a dispatch key. // If the string cannot be correctly parsed, throws an exception. @@ -420,10 +613,86 @@ C10_API c10::DispatchKey parseDispatchKey(const std::string& k); // torch::dispatch(torch::kCPU, ...) is also valid. constexpr DispatchKey kAutograd = DispatchKey::Autograd; -// Check if a DispatchKey is an alias mapping to other runtime keys. -inline bool isAliasDispatchKey(DispatchKey k) { - return k > DispatchKey::NumDispatchKeys && k <= DispatchKey::EndOfAliasKeys; +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. 
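// For example (assuming the per-backend key ranges laid out above):
//   toBackendComponent(DispatchKey::CPU)        == BackendComponent::CPUBit
//   toBackendComponent(DispatchKey::SparseCUDA) == BackendComponent::CUDABit
//   toBackendComponent(DispatchKey::FPGA)       == BackendComponent::InvalidBit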
+constexpr BackendComponent toBackendComponent(DispatchKey k) { + if (k >= DispatchKey::StartOfDenseBackends && + k <= DispatchKey::EndOfDenseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfDenseBackends)); + } else if ( + k >= DispatchKey::StartOfQuantizedBackends && + k <= DispatchKey::EndOfQuantizedBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfQuantizedBackends)); + } else if ( + k >= DispatchKey::StartOfSparseBackends && + k <= DispatchKey::EndOfSparseBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfSparseBackends)); + } else if ( + k >= DispatchKey::StartOfAutogradBackends && + k <= DispatchKey::EndOfAutogradBackends) { + return static_cast( + static_cast(k) - + static_cast(DispatchKey::StartOfAutogradBackends)); + } else { + return BackendComponent::InvalidBit; + } } + +constexpr DispatchKey toFunctionalityKey(DispatchKey k) { + if (k <= DispatchKey::EndOfFunctionalityKeys) { + return k; + } else if (k <= DispatchKey::EndOfDenseBackends) { + return DispatchKey::Dense; + } else if (k <= DispatchKey::EndOfQuantizedBackends) { + return DispatchKey::Quantized; + } else if (k <= DispatchKey::EndOfSparseBackends) { + return DispatchKey::Sparse; + } else if (k <= DispatchKey::EndOfAutogradBackends) { + return DispatchKey::AutogradFunctionality; + } else { + return DispatchKey::Undefined; + } +} + +// Given (DispatchKey::Dense, DispatchKey::CUDABit), returns DispatchKey::CUDA +// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] +// This function relies on the invariant that the dispatch keys between +// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend +// in the same order as `BackendComponent`. +constexpr DispatchKey toRuntimePerBackendFunctionalityKey( + DispatchKey functionality_k, + BackendComponent backend_k) { + if (functionality_k == DispatchKey::Dense) { + return static_cast( + static_cast(DispatchKey::StartOfDenseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Sparse) { + return static_cast( + static_cast(DispatchKey::StartOfSparseBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::Quantized) { + return static_cast( + static_cast(DispatchKey::StartOfQuantizedBackends) + + static_cast(backend_k)); + } + if (functionality_k == DispatchKey::AutogradFunctionality) { + return static_cast( + static_cast(DispatchKey::StartOfAutogradBackends) + + static_cast(backend_k)); + } + return DispatchKey::Undefined; +} + } // namespace c10 namespace torch { diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index 7f85567f886..d5c11c02399 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -1,37 +1,29 @@ #include +#include namespace c10 { -// backend_dispatch_keyset should include all runtime backend keys. +// backend_dispatch_keyset includes all dispatch keys that map to backends. // Alias key DispatchKey::CompositeExplicitAutograd maps to -// backend_dispatch_keyset NestedTensor has been explicitly removed due to -// incompatibility with some kernels, such as structured kernels, that use the -// DefaultBackend key. 
-constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends | - DispatchKeySet({ - DispatchKey::CPU, - DispatchKey::CUDA, - DispatchKey::XLA, - DispatchKey::Lazy, - DispatchKey::XPU, - DispatchKey::PrivateUse1, - DispatchKey::PrivateUse2, - DispatchKey::PrivateUse3, - DispatchKey::MLC, - DispatchKey::HPU, - DispatchKey::ORT, - DispatchKey::Meta, - }); +// backend_dispatch_keyset +constexpr DispatchKeySet backend_dispatch_keyset = + autogradother_backends | DispatchKeySet(DispatchKey::Dense); bool isBackendDispatchKey(DispatchKey t) { return t != DispatchKey::Undefined // See Note [No Alias Keys in DispatchKeySet] - && !isAliasDispatchKey(t) && backend_dispatch_keyset.has(t); + && !isAliasDispatchKey(t) + // Note [NestedTensor Not Included in Backend Keys] + // NestedTensor has been explicitly removed from the "backend keyset" due + // to incompatibility with some kernels, so we don't want it to be + // included in CompositeImplicitAutograd or CompositeExplicitAutograd + // kernels. + && t != DispatchKey::NestedTensor && backend_dispatch_keyset.has(t); } // math_dispatch_keyset contains all keys in backend_dispatch_keyset and // autograd_dispatch_keyset Alias key DispatchKey::CompositeImplicitAutograd -// maps to math_dispatch_keyset. +// maps to [math_dispatch_keyset x full_backend_mask] constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset | autograd_dispatch_keyset; @@ -39,7 +31,12 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { case DispatchKey::Autograd: - return autograd_dispatch_keyset; + // See Note [autograd_dispatch_keyset Does Not Include Backend Bits] + // That's why we OR it with a mask of the backend bits here. + // getRuntimeDispatchKeySet() expects to return a keyset of runtime + // dispatch keys, like AutogradCPU, but that requires having backend bits. 
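      // For example (assuming autograd_dispatch_keyset carries the
      // AutogradFunctionality bit), the returned set should satisfy
      //   getRuntimeDispatchKeySet(DispatchKey::Autograd).has(DispatchKey::AutogradCPU)
      // for every backend bit covered by full_backend_mask.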
+ return autograd_dispatch_keyset | + DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); case DispatchKey::CompositeImplicitAutograd: return math_dispatch_keyset; case DispatchKey::CompositeExplicitAutograd: @@ -53,11 +50,13 @@ bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { case DispatchKey::Autograd: - return autograd_dispatch_keyset.has(k); + return autograd_dispatch_keyset.has(toFunctionalityKey(k)); case DispatchKey::CompositeImplicitAutograd: - return math_dispatch_keyset.has(k); + // See Note [NestedTensor Not Included in Backend Keys] + return k != DispatchKey::NestedTensor && math_dispatch_keyset.has(k); case DispatchKey::CompositeExplicitAutograd: - return backend_dispatch_keyset.has(k); + // See Note [NestedTensor Not Included in Backend Keys] + return k != DispatchKey::NestedTensor && backend_dispatch_keyset.has(k); default: return t == k; } @@ -79,8 +78,6 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) { return DispatchKeySet(DispatchKey::MLC); case DispatchKey::AutogradHPU: return DispatchKeySet(DispatchKey::HPU); - case DispatchKey::AutogradNestedTensor: - return DispatchKeySet(DispatchKey::NestedTensor); case DispatchKey::AutogradXPU: return DispatchKeySet(DispatchKey::XPU); case DispatchKey::AutogradPrivateUse1: @@ -96,23 +93,6 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) { } } -DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t) { - switch (t) { - case DispatchKey::CPU: - return DispatchKeySet(DispatchKey::AutocastCPU); - case DispatchKey::CUDA: - case DispatchKey::XLA: - return DispatchKeySet(DispatchKey::AutocastCUDA); - default: - return DispatchKeySet(); - } -} - -DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t) { - return DispatchKeySet( - {DispatchKey::ADInplaceOrView, getAutogradKeyFromBackend(t)}); -} - bool isIncludedInAlias(DispatchKey k, DispatchKey alias) { return k != DispatchKey::Undefined && runtimeDispatchKeySetHas(alias, k); } @@ -129,18 +109,167 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) { return os; } os << "DispatchKeySet("; - DispatchKey tid; bool first = true; - while ((tid = ts.highestPriorityTypeId()) != DispatchKey::Undefined) { + for (auto k : ts) { if (!first) { os << ", "; } - os << tid; - ts = ts.remove(tid); + os << k; first = false; } os << ")"; return os; } +DispatchKeySet::iterator& DispatchKeySet::iterator::operator++() { + TORCH_INTERNAL_ASSERT(next_functionality_ >= num_backends); + TORCH_INTERNAL_ASSERT(next_functionality_ <= iterator::end_iter_mask_val); + TORCH_INTERNAL_ASSERT(next_backend_ <= num_backends); + + // Create a masked version of the set representation to ignore previous + // keys that we've iterated through. 
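  // (On the first increment, next_functionality_ == num_backends, so the
  // mask below zeroes out exactly the backend bits and the search starts
  // from the lowest-priority functionality bit in the set.)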
+ uint64_t masked_functionality_bits = + llvm::maskTrailingZeros(next_functionality_) & *data_ptr_; + uint64_t masked_backend_bits = + llvm::maskTrailingZeros(next_backend_) & full_backend_mask & + *data_ptr_; + + uint64_t first_functionality_idx = + llvm::findFirstSet(masked_functionality_bits); + uint64_t first_backendcomponent_idx = llvm::findFirstSet(masked_backend_bits); + + // If there are no keys, set to end iterator value + if (first_functionality_idx == std::numeric_limits::max() || + next_functionality_ == iterator::end_iter_mask_val) { + // Set up state to be the same as end() + next_functionality_ = iterator::end_iter_mask_val; + current_dispatchkey_idx_ = iterator::end_iter_key_val; + next_backend_ = 0; + current_backendcomponent_idx_ = iterator::end_iter_key_val; + return *this; + } + + // The +1 is because of DispatchKey::Undefined and + // BackendComponent::InvalidBit + auto new_next_functionality = first_functionality_idx + 1; + auto new_backendcomponent_idx = first_backendcomponent_idx + 1; + // and the -num_backends is because the first bits in the + // keyset are not Dispatch Keys. + auto next_dispatchkey_idx = new_next_functionality - num_backends; + + // If the current functionality bit is a per-backend bit, we need special + // handling + if (isPerBackendFunctionalityKey( + static_cast(next_dispatchkey_idx))) { + // case 1: if the current backend is undefined, then there is no valid + // backend instance of this functionality key so we can skip it. + if (first_backendcomponent_idx == std::numeric_limits::max()) { + // increment the functionality mask so we skip the current functionality + // bit on the next increment. + next_functionality_ = new_next_functionality; + ++(*this); + return *this; + } + + // Otherwise, at this point we know what the current backend and + // functionality bits are. + current_dispatchkey_idx_ = next_dispatchkey_idx; + current_backendcomponent_idx_ = new_backendcomponent_idx; + + // Next, we need to set up the masks for the next increment. + uint64_t next_backendcomponent_bits = + llvm::maskTrailingZeros(first_backendcomponent_idx + 1) & + full_backend_mask & *data_ptr_; + uint64_t next_backendcomponent_idx = + llvm::findFirstSet(next_backendcomponent_bits); + if (next_backendcomponent_idx == std::numeric_limits::max()) { + // case 2: the current backend is valid, but there is not another backend + // in the keyset. In this case, we need to bump the functionality mask and + // reset the backend mask for the next increment + next_functionality_ = new_next_functionality; + next_backend_ = 0; + } else { + // case 3: we have another backend to iterate over. We want to iterate + // over the same functionality bit next time, but a different backend bit. + next_backend_ = first_backendcomponent_idx + 1; + } + } else { + // Functionality bits that aren't per backend are simpler to handle. We can + // ignore the backend bits. + TORCH_INTERNAL_ASSERT(next_backend_ == 0); + current_dispatchkey_idx_ = next_dispatchkey_idx; + next_functionality_ = new_next_functionality; + } + return *this; +} + +std::array +initializeFunctionalityOffsetsAndMasks() { + std::array + offsets_and_masks; + // manualy set the first entry, which corresponds to Undefined. + offsets_and_masks[0] = FunctionalityOffsetAndMask(0, 0); + // loop through every functionality key (aside from Undefined). + for (const auto functionality_idx : c10::irange(1, num_functionality_keys)) { + // functionality_idx should be Dense -> 1, ... 
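    // For example, Dense is per-backend, so (in the non-mobile branch below)
    // offsets_and_masks[Dense] ends up as {offset = 1, mask = full_backend_mask},
    // and the next functionality's offset then jumps by num_backends.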
+ auto prev_offset_and_mask = offsets_and_masks[functionality_idx - 1]; + auto k = static_cast(functionality_idx); + +#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) + // [Note: Trimmed Mobile Dispatch Keys] + uint16_t mask = 0; + uint16_t offset = 0; + switch (k) { + case DispatchKey::Undefined: + offset = 0; + case DispatchKey::CPU: + offset = 1; + case DispatchKey::QuantizedCPU: + offset = 2; + case DispatchKey::SparseCPU: + offset = 3; + case DispatchKey::BackendSelect: + offset = 4; + case DispatchKey::ADInplaceOrView: + offset = 5; + case DispatchKey::AutogradOther: + offset = 6; + case DispatchKey::AutogradCPU: + offset = 7; + default: + // All other keys which are unsupported on mobile will get sent + // to the undefined kernel, causing them to error. + offset = 0; + } + offsets_and_masks[functionality_idx] = + FunctionalityOffsetAndMask(offset, 0); + } +#else + // If the previous functionality was not per-backend, then we can just + // increment the previous offset. Otherwise, the next offset = + // previous_offset + num_backends. + auto next_offset = prev_offset_and_mask.offset + + (prev_offset_and_mask.mask == 0 ? 1 : num_backends); + // the mask is used in the runtime index calculation to find the offset of + // the backend. For non-per-backend functionalities, this offset should + // always be 0. Otherwise, we need to get the index of the backend (which we + // can do using a backend mask). + auto next_mask = isPerBackendFunctionalityKey(k) ? full_backend_mask : 0; + offsets_and_masks[functionality_idx] = + FunctionalityOffsetAndMask(next_offset, next_mask); + } + // Sanity check that the computed offset index of the last functionality key + // is correct. This assumes that the highest priority functionality key is not + // per backend. + TORCH_INTERNAL_ASSERT( + offsets_and_masks[num_functionality_keys - 1].offset == + (num_runtime_entries - 1), + "num_runtime_entries: ", + num_runtime_entries, + "last_offset: ", + offsets_and_masks[num_functionality_keys - 1].offset); +#endif + return offsets_and_masks; +} + } // namespace c10 diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 79d39652219..1834ca0aa96 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -1,5 +1,4 @@ #pragma once - #include #include #include @@ -8,29 +7,147 @@ namespace c10 { +struct FunctionalityOffsetAndMask { + // empty constructor shouldn't be used; only needed to initialize + // the array before populating it. + FunctionalityOffsetAndMask() {} + FunctionalityOffsetAndMask(uint16_t offset, uint16_t mask) + : offset(offset), mask(mask) {} + // This needs to big enough to cover the size of the operator table. + uint16_t offset; + // See Note [No More Than 16 Backends] + // This mask needs to be big enough to mask all of the backend bits. + // We probably don't ever want to have more than 16 backend bits, so uint16_t + // should be enough. + uint16_t mask; +}; +static_assert( + c10::num_runtime_entries < 65536, + "The dispatcher currently only supports up to 2^16 runtime entries"); + +C10_API std::array +initializeFunctionalityOffsetsAndMasks(); + +C10_ALWAYS_INLINE static const std:: + array& + offsetsAndMasks() { + static auto offsets_and_masks_ = initializeFunctionalityOffsetsAndMasks(); + return offsets_and_masks_; +} + +// A representation of a set of DispatchKeys. A DispatchKeySet contains both +// "functionality" bits and "backend bits", and every tensor holds its own +// DispatchKeySet. 
The Dispatcher implements multiple dispatch by grabbing the +// keyset on every input tensor, or’ing them together, and dispatching to a +// specific piece of functionality. The functionality bits are *ordered*. When +// multiple functionality bits are set, we use the highest priority +// functionality. Similarly, multiple backend bits can theoretically be set if +// you call an operator with multiple tensors from difference devices (e.g. CPU +// and CUDA), although support for mixed device dispatch is limited (the only +// kernels that gracefully handle mixed device inputs for now are cuda kernels +// that take in a scalar cpu tensor). + // A representation of a set of DispatchKeys. A tensor may have multiple // tensor type ids, e.g., a Variable tensor can also be a CPU tensor; the // DispatchKeySet specifies what type ids apply. The internal representation is // as a 64-bit bit set (this means only 64 tensor type ids are supported). // -// Note that DispatchKeys are ordered; thus, we can ask questions like "what is -// the highest priority DispatchKey in the set"? (The set itself is not -// ordered; two sets with the same ids will always have the ids ordered in the -// same way.) +// As mentioned above, DispatchKeys are ordered; thus, we can ask questions like +// "what is the highest priority DispatchKey in the set"? (The set itself is +// not ordered; two sets with the same ids will always have the ids ordered in +// the same way.) // -// At the moment, there are no nontrivial uses of this set; tensors are always -// singletons. In the near future, this set will represent variable? + tensor -// type id. In the far future, it will be requires grad? + profiling? + -// tracing? + lazy? + tensor type id. +// Note [DispatchKeySet Internal Representation] +// Internally, dispatch keys are packed into 64-bit DispatchKeySet objects +// that get passed around at runtime. +// However, there isn't necessarily a 1-to-1 mapping between bits in the keyset +// and individual dispatch keys. // -// (The difference between variable and requires grad, is that -// there are currently three states a tensor can be: -// 1. Not a variable -// 2. Variable with requires_grad=False -// 3. Variable with requires_grad=True -// Eventually, we want to kill state (1), and only dispatch to autograd -// handling code if one of the inputs requires grad.) +// First: why do we have this distinction, and why not map every dispatch key +// directly to a bit? This is mostly because we have several types of +// functionalities that different backends would like to customize. For example, +// we have: +// - "Dense": CPU, CUDA, XLA, ... (~12 keys) +// - "Sparse": SparseCPU, SparseCUDA, ... +// - "Quantized": QuantizedCPU, QuantizedCUDA, QuantizedXLA, ... +// - "Autograd": AutogradCPU, AutogradCUDA, Autograd XLA, ... +// The problem is that total number of keys grows quadratically with [# +// backends] x [# functionalities], making it very difficult to map each key +// directly to a bit in a bitset without dramatically increasing the size of the +// bitset over time. // +// The two enums (BackendComponent and DispatchKey) can be divided roughly into +// 5 categories. +// +// (1) "Building block" keys +// (a) backends: jEverything in the BackendComponent enum (e.g. CPUBit, +// CUDABIt) (b) functionalities: (per-backend) functionality-bit DispatchKeys +// (e.g. AutogradFunctionality, Sparse, Dense) +// (2) "Runtime" keys +// (a) "non-customizable backends" (e.g. FPGA) +// (b) "non-customizable functionalities" (e.g. 
Functionalize) +// (c) "per-backend instances of customizable functionalities" (e.g. CPU, +// SparseCPU, AutogradCPU) +// (3) "Alias" DispatchKeys (see Note [Alias Dispatch Keys]) +// +// (1) Building block keys always correspond to individual bits in a +// DispatchKeySet. They can also be combined in a DispatchKeySet to form actual +// runtime keys. e.g. +// auto dense_cpu_ks = DispatchKeySet({DispatchKey::CPUBit, +// DispatchKey::Dense}); +// // The keyset has the runtime dense-cpu key. +// dense_cpu_ks.has(DispatchKey::CPU); +// // And it contains the building block keys too. +// dense_cpu_ks.has(DispatchKey::CPUBit); +// dense_cpu_ks.has(DispatchKey::Dense); +// +// Not every backend and not every functionality counts as a "building block +// key". This is mostly to give us more levers to pull in the design space. +// Backend keys and functionality keys that count as "building blocks" will +// contribute to a full cross product of functionality that can be overriden. +// +// For example, right now we have at least 12 "backend" building blocks (CPU, +// CUDA, XLA, ...) and at least 4 "functionality" building blocks (Dense, +// Sparse, Quantized, AutogradFunctionality, ...). These keys together allow +// every dispatcher operator to be customized in up to 12*4 different ways. Each +// of those requires a slot in the operator table of every dispatcher operator. +// Not every piece of functionality necessarily needs to be customizeable +// per-backend, and not every backend necessarily needs to be able to customize +// every type of functionality. +// +// +// (2) Every runtime key corresponds directly to a slot in an operator's runtime +// dispatch table, and you can directly register kernels to a runtime dispatch +// key. +// +// For per-backend functionalities like "Dense" or "AutogradFunctionality", +// you can think of the corresponding runtime dispatch keys as "instances" of +// that functionality, per backend. E.g. "CPU", "CUDA", "XLA", etc. are all +// runtime instances of the "Dense" building block key. + +// (2a) and (2b) are represented identically in the DispatchKeySet logic: +// - backend-agnostic functionalities (e.g. FuncTorchBatched) are NOT +// customizeable per backend. +// In order to do so, we'd need to promote it to a per-backend functionality +// "building block" key. +// - non-customizeable backends (e.g. FPGA) can NOT customize existing +// functionality like Sparse, Autograd, etc. +// In order to do so, we'd need to promote it to a backend "building block" +// key. +// +// In both cases, these keys directly correspond to runtime slots in the +// operator table. +// +// +// (3) "Alias" keys +// See Note [Alias Dispatch Keys] +// +// Final note: for anyone making future changes to the Dispatcher + +// DispatchKeySet internals, there's a closed PR with a basic +// python-implementation of the Dispatcher that might be useful in quickly +// testing out and validating changes. See it at +// https://github.com/pytorch/pytorch/pull/68743 + // An undefined tensor is one with an empty tensor type set. class DispatchKeySet final { public: @@ -41,29 +158,146 @@ class DispatchKeySet final { // NB: default constructor representation as zero is MANDATORY as // use of DispatchKeySet in TLS requires this. constexpr DispatchKeySet() : repr_(0) {} + constexpr DispatchKeySet(Full) - : repr_(std::numeric_limits::max()) {} + : repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {} + constexpr DispatchKeySet(FullAfter, DispatchKey t) // LSB after t are OK, but not t itself. 
- : repr_((1ULL << (static_cast(t) - 1)) - 1) {} + // "functionalities" have a notion of ordering (e.g. Autograd > Sparse > + // Quantized > Dense). But backends don't really have an ordering. + // Therefore, we're enforcing that FullAfter can only be used on + // "functionality" keys. + : repr_( + (1ULL + << (num_backends + static_cast(toFunctionalityKey(t)) - + 1)) - + 1) {} + // Public version of DispatchKeySet(uint64_t) API; external users // must be explicit when they do this! constexpr DispatchKeySet(Raw, uint64_t x) : repr_(x) {} - explicit constexpr DispatchKeySet(DispatchKey t) - : repr_( - t == DispatchKey::Undefined - ? 0 - : 1ULL << (static_cast(t) - 1)) {} - explicit constexpr DispatchKeySet(std::initializer_list ks) - : repr_(0) { - for (auto k : ks) { - repr_ |= DispatchKeySet(k).repr_; + + constexpr explicit DispatchKeySet(BackendComponent k) { + if (k == BackendComponent::InvalidBit) { + repr_ = 0; + } else { + repr_ = 1ULL << (static_cast(k) - 1); } } + + constexpr explicit DispatchKeySet(DispatchKey k) { + if (k == DispatchKey::Undefined) { + // Case 1: handle Undefined specifically + repr_ = 0; + } else if (k <= DispatchKey::EndOfFunctionalityKeys) { + // Case 2: handle "functionality-only" keys + // These keys have a functionality bit set, but no backend bits + // These can technically be either: + // - valid runtime keys (e.g. DispatchKey::AutogradOther, + // DispatchKey::FuncTorchBatched, etc) + // - "building block" keys that aren't actual runtime keys (e.g. + // DispatchKey::Dense or Sparse) + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(k) - 1); + repr_ = functionality_val; + } else if (k <= DispatchKey::EndOfRuntimeBackendKeys) { + // Case 3: "runtime" keys that have a functionality bit AND a backend bit. + // First compute which bit to flip for the functionality. + auto functionality_k = toFunctionalityKey(k); + // The - 1 is because Undefined is technically a "functionality" that + // doesn't show up in the bitset. So e.g. Dense is technically the second + // functionality, but the lowest functionality bit. + uint64_t functionality_val = 1ULL + << (num_backends + static_cast(functionality_k) - 1); + + // then compute which bit to flip for the backend + // Case 4a: handle the runtime instances of "per-backend functionality" + // keys For example, given DispatchKey::CPU, we should set: + // - the Dense functionality bit + // - the CPUBit backend bit + // first compute which bit to flip for the backend + auto backend_k = toBackendComponent(k); + uint64_t backend_val = backend_k == BackendComponent::InvalidBit + ? 0 + : 1ULL << (static_cast(backend_k) - 1); + repr_ = functionality_val + backend_val; + } else { + // At this point, we should have covered every case except for alias keys. + // Technically it would be possible to add alias dispatch keys to a + // DispatchKeySet, but the semantics are a little confusing and this + // currently isn't needed anywhere. 
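      // (An alias key therefore maps to the empty keyset here.)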
+ repr_ = 0; + } + } + + constexpr uint64_t keys_to_repr(std::initializer_list ks) { + uint64_t repr = 0; + for (auto k : ks) { + repr |= DispatchKeySet(k).repr_; + } + return repr; + } + + constexpr uint64_t backend_bits_to_repr( + std::initializer_list ks) { + uint64_t repr = 0; + for (auto k : ks) { + repr |= DispatchKeySet(k).repr_; + } + return repr; + } + + explicit constexpr DispatchKeySet(std::initializer_list ks) + : repr_(keys_to_repr(ks)) {} + + explicit constexpr DispatchKeySet(std::initializer_list ks) + // Note: for some reason, putting this logic directly in the constructor + // appears to fail to compile on CUDA 10.1. + // See an example internal failure at + // https://www.internalfb.com/intern/skycastle/run/76561193669136035/artifact/actionlog.76561193742069401.stderr + : repr_(backend_bits_to_repr(ks)) {} + // Test if a DispatchKey is in the set - bool inline has(DispatchKey t) const { + inline bool has(DispatchKey t) const { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined); - return static_cast(repr_ & DispatchKeySet(t).repr_); + return has_all(DispatchKeySet(t)); + } + constexpr bool has_backend(BackendComponent t) const { + return has_all(DispatchKeySet(t)); + } + + // Test if a DispatchKey is in the set + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if all of them are in the current set. + constexpr bool has_all(DispatchKeySet ks) const { + return static_cast((repr_ & ks.repr_) == ks.repr_); + } + + // Given a DispatchKeySet of functionality keys and (potentially) backend + // keys, tests if any of them are in the current set. This could technically + // be pretty easily implemented using has(). It is strictly a perf + // optimization though. There are many places in the code base where we want + // to test for multiple functionality keys together. HOWEVER, runtime + // per-backend functionality keys aren't allowed to be used with this + // function, because you can end up with weird results. e.g. + // DispatchKeySet(DispatchKey::AutogradCPU).has_any(DispatchKeySet(DispatchKey::CPU)) + // would return true. + inline bool has_any(DispatchKeySet ks) const { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + // Either there are no backend bits in the input keyset + ((ks.repr_ & full_backend_mask) == 0) || + // or there are no per-backend-functionality bits + // See [Note: Per-Backend Functionality Dispatch Keys] + ((ks & + DispatchKeySet({ + DispatchKey::Dense, + DispatchKey::Quantized, + DispatchKey::Sparse, + DispatchKey::AutogradFunctionality, + }) + .repr_) == 0)); + return static_cast((repr_ & ks.repr_) != 0); } // Test if DispatchKeySet is a superset of ks. bool isSupersetOf(DispatchKeySet ks) const { @@ -74,31 +308,64 @@ class DispatchKeySet final { return DispatchKeySet(repr_ | other.repr_); } // Perform set intersection - DispatchKeySet operator&(DispatchKeySet other) const { + constexpr DispatchKeySet operator&(DispatchKeySet other) const { return DispatchKeySet(repr_ & other.repr_); } - // Compute the set difference self - other + // Compute the set difference self - other, + // but ONLY for the functionality keys. + // Any backend bits set on self will remain unchanged. 
+ // See Note [Removing keys from DispatchKeySet Only Affects Functionality + // Keys] DispatchKeySet operator-(DispatchKeySet other) const { - return DispatchKeySet(repr_ & ~other.repr_); + return DispatchKeySet(repr_ & (full_backend_mask | ~other.repr_)); } + // Compute self ^ other constexpr DispatchKeySet operator^(DispatchKeySet other) const { return DispatchKeySet(repr_ ^ other.repr_); } - // Perform set equality bool operator==(DispatchKeySet other) const { return repr_ == other.repr_; } + bool operator!=(DispatchKeySet other) const { + return repr_ != other.repr_; + } // Add a DispatchKey to the DispatchKey set. Does NOT mutate, // returns the extended DispatchKeySet! C10_NODISCARD DispatchKeySet add(DispatchKey t) const { return *this | DispatchKeySet(t); } - // Remove a DispatchKey from the DispatchKey set. This is - // generally not an operation you should be doing (it's - // used to implement operator<<) - C10_NODISCARD constexpr DispatchKeySet remove(DispatchKey t) const { - return DispatchKeySet(repr_ & ~DispatchKeySet(t).repr_); + C10_NODISCARD DispatchKeySet add(DispatchKeySet ks) const { + return *this | ks; + } + + // Remove a DispatchKey from the DispatchKey set. + // This is generally not an operation you should be doing + // (it's used to implement the printing overload, operator<<) + // + // Note [Removing keys from DispatchKeySet Only Affects Functionality Keys] + // Only functionality bits are allowed to be removed from a keyset. + // For now, we're only allowing removal of "functionality bits" from the + // keyset, which is specifically needed by the fallthrough key calculation + // logic. Why is removing backend bits problematic? Consider this example: + // + // DispatchKeySet([DispatchKey.CPU, DispatchKey.AutogradCUDA, + // DispatchKey.CUDA]).remove(DispatchKey.AutogradCUDA) + // DispatchKeySet([DispatchKey.CPU, + // DispatchKey.AutogradCUDA]).remove(DispatchKey.AutogradCUDA) + // + // What do we want to happen? + // Technically, we'd like it to be true that after removal, + // the first keyset still has the CUDA dispatch key while the second doesn't. + // Unfortunately there's no way to represent that, because the two keysets are + // represented the same way internally: functionality bits: Autograd, Dense + // backend bits: CPU, CUDA + // + // Instead, remove(DispatchKey.AutogradCPU) will only remove the "Autograd" + // bit from the bitset. + constexpr DispatchKeySet remove(DispatchKey t) const { + return DispatchKeySet( + repr_ & ~(DispatchKeySet(t).repr_ & ~full_backend_mask)); } // Is the set empty? (AKA undefined tensor) bool empty() const { @@ -107,22 +374,78 @@ class DispatchKeySet final { uint64_t raw_repr() { return repr_; } - // Return the type id in this set with the highest priority (i.e., - // is the largest in the DispatchKey enum). Intuitively, this - // type id is the one that should handle dispatch (assuming there - // aren't any further exclusions or inclusions). - DispatchKey highestPriorityTypeId() const { - // TODO: If I put Undefined as entry 64 and then adjust the - // singleton constructor to shift from the right, we can get rid of the - // subtraction here. It's modestly more complicated to get right so I - // didn't do it for now. - return static_cast(64 - llvm::countLeadingZeros(repr_)); + + DispatchKey highestFunctionalityKey() const { + auto functionality_idx = indexOfHighestBit(); + // This means that none of the functionality bits were set. 
+ if (functionality_idx < num_backends) + return DispatchKey::Undefined; + // The first num_backend bits in the keyset don't correspond to real + // dispatch keys. + return static_cast(functionality_idx - num_backends); } - DispatchKey highestPriorityBackendTypeId() const { - return (*this & - ((1ULL << static_cast(DispatchKey::EndOfBackendKeys)) - 1)) - .highestPriorityTypeId(); + // This is similar like toBackendComponent(DispatchKey), but less restrictive. + // toBackendComponent() errors out if the key that it was passed has no + // backend bits, which is useful for error checking. We need a version of that + // here that can also handle "fake" backends like FPGA, because they need to + // map to the AutogradOther key. For those backends, we return + // BackendComponent::InvalidBit. + BackendComponent highestBackendKey() const { + // mask to mask out functionality bits + auto backend_idx = + DispatchKeySet(repr_ & full_backend_mask).indexOfHighestBit(); + // all zeros across the backend bits means that no backend bits are set. + if (backend_idx == 0) + return BackendComponent::InvalidBit; + return static_cast(backend_idx); + } + + // returns the DispatchKey of highest priority in the set. + DispatchKey highestPriorityTypeId() const { + auto functionality_k = highestFunctionalityKey(); + if (isPerBackendFunctionalityKey(functionality_k)) { + return toRuntimePerBackendFunctionalityKey( + functionality_k, highestBackendKey()); + } + return functionality_k; + } + + // Returns the index of the most-significant bit in the keyset. + // This is used to as part of the calculation into the operator table to get: + // - the highest "functionality" bit in the keyset. + // - the highest "backend" bit in the keyset. + uint8_t indexOfHighestBit() const { + return 64 - llvm::countLeadingZeros(repr_); + } + + // returns the index in the operator table of highest priority key in the the + // keyset Note that we could in theory implement this using + // highestPriorityTypeId(), but this code is very hotpath and we can do it + // faster without it. + uint64_t getDispatchTableIndexForDispatchKeySet() const { + auto functionality_idx = + DispatchKeySet(repr_ >> num_backends).indexOfHighestBit(); + auto offset_and_mask = offsetsAndMasks()[functionality_idx]; + // Mask the functionality bits out first, then right-shift by 1. + // right-shifting by 1 because everything is zero-indexed. + // E.g. 000001 (CPU) should give us an offset of 0, 000010 (CUDA) should + // give us an offset of 1, etc. + auto backend_idx = + DispatchKeySet((repr_ & offset_and_mask.mask) >> 1).indexOfHighestBit(); + return offset_and_mask.offset + backend_idx; + } + + // returns the "index" of the highest priority backend in the keyset. + // This is pretty similar to getBackendKey(), but: + // - It's hotpath code (part of the runtime bitset calculation) + // - I's returns an integer index, not an enum value + // - Everything is shifted to the right by 1. + // BackendComponent::InvalidBit is technically the lowest enum value, + // but it isn't included in the runtime table. So CPUBit = 1, CUDABit = 2, + // etc. + uint64_t getBackendIndex() const { + return DispatchKeySet((repr_ & full_backend_mask) >> 1).indexOfHighestBit(); } private: @@ -130,42 +453,47 @@ class DispatchKeySet final { uint64_t repr_ = 0; public: - // STL iterator for DispatchKeySet. Iterates through all DispatchKeys in the - // set. 
The iterator is only invalidated by the destruction of the underlying - // DispatchKeySet as the iterator stores a pointer to the raw representation - // of the DispatchKeySet. + // STL iterator for DispatchKeySet. Iterates through all runtime DispatchKeys + // in the set. The iterator is only invalidated by the destruction of the + // underlying DispatchKeySet as the iterator stores a pointer to the raw + // representation of the DispatchKeySet. Note: When we encounter a per-backend + // functionality (e.g. Dense or Sparse), we will iterate through EVERY backend + // in the keyset, for that functionality. For example, if the next + // functionality key to iterate over is Autograd, and the backend bits in the + // keyset correspond to [BackendComponent::CPUBit, BackendComponent::CUDABit], + // then the next two keys we return will be DispatchKey::AutogradCPU, + // DispatchKey::AutogradCUDA (CPU first because it has lower precedence than + // CUDA in DispatchKey.h). class iterator { public: using self_type = iterator; using iterator_category = std::input_iterator_tag; using value_type = DispatchKey; using difference_type = ptrdiff_t; + // final mask value should mask out the entire keyset + static const uint8_t end_iter_mask_val = + num_backends + num_functionality_keys; + // final key value should be the last DispatchKey + static const uint8_t end_iter_key_val = num_functionality_keys; - explicit iterator(const uint64_t* data_ptr, uint8_t i = 0) - : data_ptr_(data_ptr), i_(i) { + // current_dispatchkey_idx_ will iterate through all functionality bits. + // current_backendcomponent_idx_ will iterate through all backend bits. + explicit iterator( + const uint64_t* data_ptr, + uint8_t next_functionality = num_backends, + uint8_t next_backend = 0) + : data_ptr_(data_ptr), + next_functionality_(next_functionality), + next_backend_(next_backend), + // These are in an invalid state at construction time, and set by the + // first increment call + current_dispatchkey_idx_(end_iter_key_val), + current_backendcomponent_idx_(end_iter_key_val) { // Go to the first key in the set ++(*this); } - self_type& operator++() { - TORCH_INTERNAL_ASSERT( - i_ <= static_cast(DispatchKey::NumDispatchKeys)); - - // Create a masked version of the set representation to ignore previous - // keys that we've iterated through. 
- uint64_t masked_data = llvm::maskTrailingZeros(i_) & *data_ptr_; - uint64_t firstKeyIndex = llvm::findFirstSet(masked_data); - - // If there are no keys, set to end iterator value - if (firstKeyIndex == std::numeric_limits::max() || - i_ == static_cast(DispatchKey::NumDispatchKeys)) { - i_ = static_cast(DispatchKey::NumDispatchKeys); - return *this; - } - - i_ = static_cast(firstKeyIndex) + 1; - return *this; - } + C10_API self_type& operator++(); self_type operator++(int) { self_type previous_iterator = *this; @@ -174,18 +502,50 @@ class DispatchKeySet final { } bool operator==(const self_type& rhs) const { - return i_ == rhs.i_; + return next_functionality_ == rhs.next_functionality_ && + current_dispatchkey_idx_ == rhs.current_dispatchkey_idx_ && + next_backend_ == rhs.next_backend_ && + current_backendcomponent_idx_ == rhs.current_backendcomponent_idx_; } bool operator!=(const self_type& rhs) const { - return i_ != rhs.i_; + return next_functionality_ != rhs.next_functionality_ || + current_dispatchkey_idx_ != rhs.current_dispatchkey_idx_ || + next_backend_ != rhs.next_backend_ || + current_backendcomponent_idx_ != rhs.current_backendcomponent_idx_; } DispatchKey operator*() const { - return static_cast(i_); + auto functionality_key = + static_cast(current_dispatchkey_idx_); + if (isPerBackendFunctionalityKey(functionality_key)) { + auto next_key = toRuntimePerBackendFunctionalityKey( + functionality_key, + static_cast(current_backendcomponent_idx_)); + // We expect all of the Dense, Sparse, Quantized, and Autograd keys to + // be ordered the same way with respect to their backends + TORCH_INTERNAL_ASSERT( + toBackendComponent(next_key) == + static_cast(current_backendcomponent_idx_), + "Tried to map functionality key ", + toString(functionality_key), + " and backend bit ", + toString( + static_cast(current_backendcomponent_idx_)), + " to a runtime key, but ended up with ", + toString(next_key), + ". This can happen if the order of the backend dispatch keys in DispatchKey.h isn't consistent.", + " Please double check that enum for inconsistencies."); + return next_key; + } else { + return functionality_key; + } } private: const uint64_t* data_ptr_; - uint8_t i_; + uint8_t next_functionality_; + uint8_t next_backend_; + uint8_t current_dispatchkey_idx_; + uint8_t current_backendcomponent_idx_; }; public: @@ -195,31 +555,35 @@ class DispatchKeySet final { return iterator(&repr_); } - // We do not need to iterate beyond NumDispatchKeys so we will treat this as - // the end iterator. NumDispatchKeys will always be strictly less than 64. + // We do not need to iterate beyond EndOfFunctionalityKeys so we will treat + // this as the end iterator. iterator end() const { - return iterator(&repr_, static_cast(DispatchKey::NumDispatchKeys)); + return iterator(&repr_, iterator::end_iter_mask_val); } }; C10_API std::string toString(DispatchKeySet); C10_API std::ostream& operator<<(std::ostream&, DispatchKeySet); -// autograd_dispatch_keyset should include all runtime autograd keys. -// Alias key DispatchKey::Autograd maps to autograd_dispatch_keyset. 
+C10_API inline uint64_t getDispatchTableIndexForDispatchKey(DispatchKey k) { + return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet(); +} + +// Alias key DispatchKey::Autograd maps to +// (autograd_dispatch_keyset x full_backend_mask) // NB: keys in this set also get associated with CompositeImplicitAutograd +// +// Note [autograd_dispatch_keyset Does Not Include Backend Bits] +// We don't want to include any backend bits (BackendComponent::CPUBit, etc) +// directly in autograd_dispatch_keyset. +// Why? keysets like autograd_dispatch_keyset are commonly used to remove +// autograd keys from a DispatchKeySet throughout the code base. However, you +// are only allowed to remove functionality bits from a keyset, not backend +// bits. See Note [Removing keys from DispatchKeySet Only Affects Functionality +// Keys] for details. To be consistent and avoid confusion, we're explicitly +// setting up autograd_dispatch_keyset to not have any backend bits. constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({ - DispatchKey::AutogradCPU, - DispatchKey::AutogradCUDA, - DispatchKey::AutogradXLA, - DispatchKey::AutogradLazy, - DispatchKey::AutogradNestedTensor, - DispatchKey::AutogradMLC, - DispatchKey::AutogradHPU, - DispatchKey::AutogradXPU, - DispatchKey::AutogradPrivateUse1, - DispatchKey::AutogradPrivateUse2, - DispatchKey::AutogradPrivateUse3, + DispatchKey::AutogradFunctionality, DispatchKey::AutogradOther, }); @@ -244,25 +608,28 @@ constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = // backend dispatch keys that map to DispatchKey::AutogradOther // NB: keys in this set also get associated with CompositeImplicitAutograd -constexpr DispatchKeySet autogradother_backends = DispatchKeySet( - {DispatchKey::HIP, - DispatchKey::VE, - DispatchKey::FPGA, - DispatchKey::ORT, - DispatchKey::Vulkan, - DispatchKey::Metal, - DispatchKey::QuantizedCPU, - DispatchKey::QuantizedCUDA, - DispatchKey::CustomRNGKeyId, - DispatchKey::MkldnnCPU, - DispatchKey::SparseCPU, - DispatchKey::SparseCUDA, - DispatchKey::SparseHIP, - DispatchKey::SparseVE, - DispatchKey::SparseXPU, - DispatchKey::SparseCsrCPU, - DispatchKey::SparseCsrCUDA, - DispatchKey::Meta}); +constexpr DispatchKeySet autogradother_backends = + DispatchKeySet( + // HIP and VE aren't in this list: they now have their own backend bits + // which means that they can now have their own Autograd keys. + // Technically, HIP will now redispatch to its own custom AutogradHIP + // slot in the runtime table. + {DispatchKey::FPGA, + DispatchKey::ORT, + DispatchKey::Vulkan, + DispatchKey::Metal, + DispatchKey::SparseCsrCPU, + DispatchKey::SparseCsrCUDA, + DispatchKey::CustomRNGKeyId, + DispatchKey::MkldnnCPU, + DispatchKey::Meta, + // Sparse and Quantized backends also live here. + DispatchKey::Sparse, + DispatchKey::Quantized}) + // Including the backend bits because this keyset is used during op + // registration, which requires looping over all runtime autogradother + // backend keys. + | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); // The set of dispatch keys that come after autograd // n.b. this relies on the fact that AutogradOther is currently the lowest @@ -292,6 +659,36 @@ constexpr DispatchKeySet after_func_keyset = // away with it by explicitly removing the key here. 
c10::DispatchKey::ADInplaceOrView); +constexpr DispatchKeySet backend_bitset_mask = + DispatchKeySet(DispatchKeySet::RAW, (1ULL << num_backends) - 1); + +constexpr auto inplace_or_view_ks = + DispatchKeySet(DispatchKey::ADInplaceOrView); +constexpr auto autograd_cpu_ks = DispatchKeySet(DispatchKey::AutogradCPU); +constexpr auto autograd_xpu_ks = DispatchKeySet(DispatchKey::AutogradXPU); +constexpr auto autograd_cuda_ks = DispatchKeySet(DispatchKey::AutogradCUDA); +constexpr auto autograd_xla_ks = DispatchKeySet(DispatchKey::AutogradXLA); +constexpr auto autograd_lazy_ks = DispatchKeySet(DispatchKey::AutogradLazy); +constexpr auto autograd_mlc_ks = DispatchKeySet(DispatchKey::AutogradMLC); +constexpr auto autograd_hpu_ks = DispatchKeySet(DispatchKey::AutogradHPU); +constexpr auto autograd_privateuse1_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse1); +constexpr auto autograd_privateuse2_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse2); +constexpr auto autograd_privateuse3_ks = + DispatchKeySet(DispatchKey::AutogradPrivateUse3); +constexpr auto autograd_other_ks = DispatchKeySet(DispatchKey::AutogradOther); + +struct OpTableOffsetAndMask { + uint16_t offset; + uint16_t backend_mask; +}; + +static_assert( + num_backends <= 16, + "Right now we expect the number of backends not to exceed 16. In the (unlikely) event" + " that this changes, the size of OpTableOffsetAndMask::backend_mask needs to be increased too."); + // true if t is a backend dispatch key C10_API bool isBackendDispatchKey(DispatchKey t); @@ -307,10 +704,53 @@ C10_API bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k); C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t); // Returns a DispatchKeySet of autograd related keys mapped to backend. -C10_API DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t); +// for a given backend key, use the associated autograd key. +// for non-backend keys, use AutogradOther as a default. +// Note: it's convenient and fast to return a default here rather than (say) +// returning an optional, or throwing. But it makes callers +// responsible for either a) enforcing the invariant that only backend keys +// be passed as arguments, or b) interpreting our return value carefully. +inline DispatchKeySet getAutogradRelatedKeySetFromBackend(BackendComponent t) { + switch (t) { + case BackendComponent::CPUBit: + return inplace_or_view_ks | autograd_cpu_ks; + case BackendComponent::XPUBit: + return inplace_or_view_ks | autograd_xpu_ks; + case BackendComponent::CUDABit: + return inplace_or_view_ks | autograd_cuda_ks; + case BackendComponent::XLABit: + return inplace_or_view_ks | autograd_xla_ks; + case BackendComponent::LazyBit: + return inplace_or_view_ks | autograd_lazy_ks; + case BackendComponent::MLCBit: + return inplace_or_view_ks | autograd_mlc_ks; + case BackendComponent::HPUBit: + return inplace_or_view_ks | autograd_hpu_ks; + case BackendComponent::PrivateUse1Bit: + return inplace_or_view_ks | autograd_privateuse1_ks; + case BackendComponent::PrivateUse2Bit: + return inplace_or_view_ks | autograd_privateuse2_ks; + case BackendComponent::PrivateUse3Bit: + return inplace_or_view_ks | autograd_privateuse3_ks; + default: + return inplace_or_view_ks | autograd_other_ks; + } +} // Returns a DispatchKeySet of autocast related keys mapped to backend. 
-C10_API DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t); +inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) { + constexpr auto autocast_cpu_ks = DispatchKeySet(DispatchKey::AutocastCPU); + constexpr auto autocast_cuda_ks = DispatchKeySet(DispatchKey::AutocastCUDA); + switch (t) { + case BackendComponent::CPUBit: + return autocast_cpu_ks; + case BackendComponent::CUDABit: + case BackendComponent::XLABit: + return autocast_cuda_ks; + default: + return DispatchKeySet(); + } +} // This API exists because we have a use case for checking // getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index b83ee395045..379807df0c7 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -190,7 +190,7 @@ TensorImpl::TensorImpl( // TODO: be more explicit about the full key set at call sites so we // don't have to keep recomputing it here - DispatchKey k = key_set.highestPriorityBackendTypeId(); + auto k = key_set.highestBackendKey(); key_set = key_set | getAutocastRelatedKeySetFromBackend(k); diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 86aca278c9d..d703cb2abb8 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -838,10 +838,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_sparse() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::SparseCPU) || - key_set_.has(DispatchKey::SparseCUDA) || - key_set_.has(DispatchKey::SparseHIP) || - key_set_.has(DispatchKey::SparseXPU); + return key_set_.has(DispatchKey::Sparse); } // Whether a tensor is sparse COO or not. Use is_sparse_csr for checking CSR @@ -854,9 +851,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_quantized() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::QuantizedCPU) || - key_set_.has(DispatchKey::QuantizedCUDA) || - key_set_.has(DispatchKey::QuantizedXPU); + return key_set_.has(DispatchKey::Quantized); } bool is_meta() const { @@ -868,53 +863,46 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_cpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::CPU) || - key_set_.has(DispatchKey::SparseCPU) || + return key_set_.has_backend(BackendComponent::CPUBit) || key_set_.has(DispatchKey::SparseCsrCPU) || - key_set_.has(DispatchKey::QuantizedCPU) || key_set_.has(DispatchKey::MkldnnCPU); } bool is_cuda() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::CUDA) || - key_set_.has(DispatchKey::SparseCUDA) || - key_set_.has(DispatchKey::SparseCsrCUDA) || - key_set_.has(DispatchKey::QuantizedCUDA); + return key_set_.has_backend(BackendComponent::CUDABit) || + key_set_.has(DispatchKey::SparseCsrCUDA); } bool is_xpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. 
- return key_set_.has(DispatchKey::XPU) || - key_set_.has(DispatchKey::SparseXPU) || - key_set_.has(DispatchKey::QuantizedXPU); + return key_set_.has_backend(BackendComponent::XPUBit); } bool is_xla() const { - return key_set_.has(DispatchKey::XLA); + return key_set_.has_backend(BackendComponent::XLABit); } bool is_hpu() const { - return key_set_.has(DispatchKey::HPU); + return key_set_.has_backend(BackendComponent::HPUBit); } bool is_lazy() const { - return key_set_.has(DispatchKey::Lazy); + return key_set_.has_backend(BackendComponent::LazyBit); } bool is_hip() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::HIP) || - key_set_.has(DispatchKey::SparseHIP); + return key_set_.has_backend(BackendComponent::HIPBit); } bool is_ve() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::VE) || key_set_.has(DispatchKey::SparseVE); + return key_set_.has_backend(BackendComponent::VEBit); } bool is_mkldnn() const { @@ -1548,13 +1536,22 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ inline bool has_compatible_shallow_copy_type(DispatchKeySet from) { auto is_dense = [](DispatchKeySet ts) { - return ts.has(DispatchKey::CPU) || ts.has(DispatchKey::CUDA) || - ts.has(DispatchKey::HIP) || ts.has(DispatchKey::XPU); + constexpr auto dense_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::HIPBit, + BackendComponent::XPUBit}); + constexpr auto dense_k = DispatchKeySet(DispatchKey::Dense); + return ts.has_any(dense_k) && ts.has_any(dense_backends); }; auto is_sparse = [](DispatchKeySet ts) { - return ts.has(DispatchKey::SparseCPU) || - ts.has(DispatchKey::SparseCUDA) || ts.has(DispatchKey::SparseHIP) || - ts.has(DispatchKey::SparseXPU); + constexpr auto sparse_backends = DispatchKeySet( + {BackendComponent::CPUBit, + BackendComponent::CUDABit, + BackendComponent::HIPBit, + BackendComponent::XPUBit}); + constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse); + return ts.has_any(sparse_k) && ts.has_any(sparse_backends); }; return (key_set_ == from) || (is_dense(key_set_) && is_dense(from)) || (is_sparse(key_set_) && is_sparse(from)); diff --git a/c10/test/core/DispatchKeySet_test.cpp b/c10/test/core/DispatchKeySet_test.cpp index 43b06c110e5..2c0de14405d 100644 --- a/c10/test/core/DispatchKeySet_test.cpp +++ b/c10/test/core/DispatchKeySet_test.cpp @@ -3,25 +3,163 @@ #include #include +#include using namespace c10; +// This test exists not to be comprehensive, but to more clearly show +// what the semantics of DispatchKeySet are. +TEST(DispatchKeySet, ShowSemantics) { + // the "CPU" dispatch key is an instance of a per-backend-functionality key. + // It corresponds to "dense" functionality, "CPU" backend. + // This means that it gets a dense functionality bit, and a cpu backend bit + // set. 
+ auto undefined_set = DispatchKeySet(); + auto dense_cpu_set = DispatchKeySet(DispatchKey::CPU); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense)); + ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit)); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU)); + + auto dense_lazy_set = DispatchKeySet(DispatchKey::Lazy); + ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Dense)); + ASSERT_TRUE(dense_lazy_set.has_backend(BackendComponent::LazyBit)); + ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Lazy)); + + // You can think of "Dense/Sparse", and "CPUBit/CUDABit", as "building block" + // dispatch keys. You are allowed to directly create keysets out of them! + auto dense_cpu_set_from_building_blocks = DispatchKeySet(DispatchKey::Dense) | + DispatchKeySet(BackendComponent::CPUBit); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense)); + ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit)); + ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU)); + ASSERT_EQ(dense_cpu_set, dense_cpu_set_from_building_blocks); + + // Similarly, the AutogradCUDA key gets 2 bits in the keyset: + // The "Autograd" functionality bit, and the "CUDA" backend bit + auto autograd_cuda = DispatchKeySet(DispatchKey::AutogradCUDA); + ASSERT_TRUE(autograd_cuda.has(DispatchKey::AutogradFunctionality)); + ASSERT_TRUE(autograd_cuda.has_backend(BackendComponent::CUDABit)); + + // Because DispatchKeySet uses a condensed internal representation, you cannot + // use it to represent the FULL cross product of backends and functionalities + // for example: + auto autograd_dense_cpu_cuda = DispatchKeySet( + {DispatchKey::AutogradFunctionality, + DispatchKey::Dense, + DispatchKey::CUDA, + DispatchKey::CPU}); + auto fpga = DispatchKeySet(DispatchKey::FPGA); + auto fpga_and_cpu = DispatchKeySet({DispatchKey::FPGA, DispatchKey::CPU}); + // this keyset has all of the building block keys: + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradFunctionality)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::Dense)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CUDABit)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CPUBit)); + + // and it also has the "runtime" keys that correspond to the full + // cross-product of functionality + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CPU)); + ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CUDA)); + + // This means that there's no way to represent a keyset with, say, only + // Autograd CUDA + Dense CPU. Instead, you should think of a keyset as + // inheriting the full set of functionalities + backends of its keys. This + // means that the below keysets are all indistinguishable from each other. 
+ ASSERT_EQ( + autograd_dense_cpu_cuda, + DispatchKeySet( + {DispatchKey::AutogradCUDA, + DispatchKey::AutogradCPU, + DispatchKey::CUDA, + DispatchKey::CPU})); + ASSERT_EQ( + autograd_dense_cpu_cuda, + DispatchKeySet({DispatchKey::AutogradCUDA, DispatchKey::CPU})); + ASSERT_EQ( + autograd_dense_cpu_cuda, + DispatchKeySet({DispatchKey::CUDA, DispatchKey::AutogradCPU})); + + // ~~~~~~~~~~ DispatchKeySet iterators ~~~~~~~~~~~ + + // Iterators allow you to iterate individually through the DispatchKey's in a + // DispatchKeySet + auto empty_set = DispatchKeySet(); + auto t1 = empty_set.begin(); + auto t2 = empty_set.end(); + ASSERT_EQ(*empty_set.begin(), *empty_set.end()); + + // However, only keys that correspond to actual runtime indices of kernels in + // the operator table show up when you iterate through a keyset. i.e. + // DispatchKey::Dense, and BackendComponent::CPUBit won't show up in an + // iterator. + auto dense_cpu_iter = dense_cpu_set.begin(); + ASSERT_EQ(*dense_cpu_iter++, DispatchKey::CPU); + ASSERT_EQ(*dense_cpu_iter, *dense_cpu_set.end()); + + auto autograd_dense_cpu_cuda_iter = autograd_dense_cpu_cuda.begin(); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CPU); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CUDA); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCPU); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCUDA); + ASSERT_EQ(*autograd_dense_cpu_cuda_iter, *autograd_dense_cpu_cuda.end()); + + // But other "functionality bits" that are not defined per-backend DO get + // their own slots in the operator table. + auto mixed_keyset = DispatchKeySet(BackendComponent::CPUBit) | + DispatchKeySet( + {DispatchKey::FPGA, // runtime key + DispatchKey::Functionalize, // runtime key + DispatchKey::Dense}); // NOT a runtime key + auto mixed_iter = mixed_keyset.begin(); + ASSERT_EQ(*mixed_iter++, DispatchKey::CPU); + ASSERT_EQ(*mixed_iter++, DispatchKey::FPGA); + ASSERT_EQ(*mixed_iter++, DispatchKey::Functionalize); + ASSERT_EQ(*mixed_iter, *mixed_keyset.end()); +} + TEST(DispatchKeySet, Empty) { DispatchKeySet empty_set; - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); + for (uint8_t i = 0; + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); i++) { auto tid = static_cast(i); + if (tid == DispatchKey::Undefined) + continue; ASSERT_FALSE(empty_set.has(tid)); } ASSERT_TRUE(empty_set.empty()); DispatchKeySet empty_set2; ASSERT_TRUE(empty_set == empty_set2); - ASSERT_EQ(empty_set.highestPriorityTypeId(), DispatchKey::Undefined); } -TEST(DispatchKeySet, Singleton) { - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); - i++) { +// This covers all keys that correspond to a single backend bit, e.g. +// BackendComponent::CPUBit. Even though these are NOT runtime keys, we still +// allow adding them directly to a keyset +TEST(DispatchKeySet, SingletonBackendComponent) { + for (const auto i : c10::irange(1, num_backends)) { + auto tid = static_cast(i); + DispatchKeySet sing(tid); + ASSERT_EQ(sing, sing); + ASSERT_EQ(sing, DispatchKeySet().add(tid)); + ASSERT_EQ(sing, sing.add(tid)); + ASSERT_EQ(sing, sing | sing); + ASSERT_FALSE(sing.empty()); + ASSERT_TRUE(sing.has(tid)); + } +} + +// This covers all keys that correspond to a single functionality bit: +// - runtime, not-per-backend functionality keys, e.g. +// DispatchKey::FuncTorchBatched +// - runtime, "fake backend" keys, e.g. DispatchKey::FPGA +// - NOT-runtime, per-backend functionality keys, e.g. 
DispatchKey::Dense +// Even though it's not a runtime key, we still allow adding it directly to a +// keyset. +// DispatchKey:: +TEST(DispatchKeySet, SingletonFunctionalityKeys) { + for (const auto i : c10::irange(1, num_functionality_keys)) { auto tid = static_cast(i); DispatchKeySet sing(tid); ASSERT_EQ(sing, sing); @@ -30,47 +168,145 @@ TEST(DispatchKeySet, Singleton) { ASSERT_EQ(sing, sing | sing); ASSERT_FALSE(sing.empty()); ASSERT_TRUE(sing.has(tid)); - ASSERT_EQ(sing.highestPriorityTypeId(), tid); ASSERT_EQ(sing.remove(tid), DispatchKeySet()); } } -TEST(DispatchKeySet, Doubleton) { - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); +// This covers runtime keys that are per-backend, +// and take up more than one bit in a DispatchKeySet. They take up one +// functionality bit + one backend bit. e.g. CPU, CUDA, SparseCPU, SparseCUDA, +// AutogradCPU, AutogradCUDA +TEST(DispatchKeySet, SingletonPerBackendFunctionalityKeys) { + for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); + i++) { + auto tid = static_cast(i); + // Skip these because they aren't real keys. + if (tid == DispatchKey::StartOfDenseBackends || + tid == DispatchKey::StartOfSparseBackends || + tid == DispatchKey::StartOfQuantizedBackends || + tid == DispatchKey::StartOfAutogradBackends) { + continue; + } + DispatchKeySet sing(tid); + ASSERT_EQ(sing, sing); + ASSERT_EQ(sing, DispatchKeySet().add(tid)); + ASSERT_EQ(sing, sing.add(tid)); + ASSERT_EQ(sing, sing | sing); + ASSERT_FALSE(sing.empty()); + ASSERT_TRUE(sing.has(tid)); + + auto functionality_key = toFunctionalityKey(tid); + auto backend_key = toBackendComponent(tid); + // These two sets should be equivalent: + // DispatchKeySet(DispatchKey::CPU) + // DispatchKeySet({DispatchKey::Dense, BackendComponent::CPUBit}) + auto expected_ks = + DispatchKeySet(functionality_key) | DispatchKeySet(backend_key); + ASSERT_EQ(sing, expected_ks); + // These two sets should be equivalent: + // DispatchKeySet(DispatchKey::CPU).remove(DispatchKey::Dense) + // DispatchKeySet(BackendComponent::CPUBit) + expected_ks = DispatchKeySet(toBackendComponent(tid)); + ASSERT_EQ(sing.remove(tid), expected_ks); + } +} + +TEST(DispatchKeySet, DoubletonPerBackend) { + for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); i++) { for (uint8_t j = i + 1; - j < static_cast(DispatchKey::NumDispatchKeys); + j <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); j++) { ASSERT_LT(i, j); auto tid1 = static_cast(i); auto tid2 = static_cast(j); - auto doub = DispatchKeySet(tid1).add(tid2); - ASSERT_EQ(doub, DispatchKeySet(tid1) | DispatchKeySet(tid2)); - ASSERT_TRUE(doub.has(tid1)); - ASSERT_TRUE(doub.has(tid2)); - ASSERT_EQ(doub.highestPriorityTypeId(), tid2); // relies on i < j + + // Skip these because they aren't real keys. 
+ if (tid1 == DispatchKey::StartOfDenseBackends || + tid1 == DispatchKey::StartOfSparseBackends || + tid1 == DispatchKey::StartOfQuantizedBackends || + tid1 == DispatchKey::StartOfAutogradBackends) + continue; + if (tid2 == DispatchKey::StartOfDenseBackends || + tid2 == DispatchKey::StartOfSparseBackends || + tid2 == DispatchKey::StartOfQuantizedBackends || + tid2 == DispatchKey::StartOfAutogradBackends) + continue; + + auto backend1 = toBackendComponent(tid1); + auto backend2 = toBackendComponent(tid2); + auto functionality1 = toFunctionalityKey(tid1); + auto functionality2 = toFunctionalityKey(tid2); + + auto combined = DispatchKeySet({tid1, tid2}); + // The combined set has the backend bits + ASSERT_TRUE(combined.has_backend(backend1)); + ASSERT_TRUE(combined.has_backend(backend2)); + // and it has the backend bits + ASSERT_TRUE(combined.has(functionality1)); + ASSERT_TRUE(combined.has(functionality2)); + // and it has the original two runtime keys + ASSERT_TRUE(combined.has(tid1)); + ASSERT_TRUE(combined.has(tid2)); + + // Add all of the keys in the keyset to a real set + std::unordered_set visited_keys; + auto iter = combined.begin(); + while (*iter != *combined.end()) { + visited_keys.insert(*iter); + ++iter; + } + std::unordered_set expected_keys; + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality1, backend1)); + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality1, backend2)); + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality2, backend1)); + expected_keys.insert( + toRuntimePerBackendFunctionalityKey(functionality2, backend2)); + ASSERT_EQ(expected_keys, visited_keys); + + if (backend1 == backend2 || functionality1 == functionality2) { + // We have two runtime keys, with either the same backend or the same + // per-backend functionalities. E.g. {AutogradCUDA, CUDA} or + // {AutogradCPU, AutogradCUDA} There should be 2 total runtime keys in + // this set. + ASSERT_EQ(2, visited_keys.size()); + } else { + // since i and j are different keys, they should not have the same + // functionality and backend + ASSERT_TRUE(backend1 != backend2 && functionality1 != functionality2); + // We have two runtime keys, that have different backends + per-backend + // functionalities. So we should expect the full cross product of + // runtime keys to be in the set. e.g. 
if i = AutogradCUDA, and j = CPU, + // then combined = {AutogradCUDA, AutogradCPU, CUDA, CPU} + ASSERT_EQ(4, visited_keys.size()); + } } } } TEST(DispatchKeySet, Full) { DispatchKeySet full(DispatchKeySet::FULL); - for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); - i++) { + for (const auto i : c10::irange(1, num_functionality_keys)) { auto tid = static_cast(i); ASSERT_TRUE(full.has(tid)); } + ASSERT_FALSE(full.has(DispatchKey::EndOfFunctionalityKeys)); } TEST(DispatchKeySet, IteratorBasicOps) { DispatchKeySet empty_set; DispatchKeySet full_set(DispatchKeySet::FULL); - DispatchKeySet mutated_set = empty_set.add(static_cast(1)); + DispatchKeySet mutated_set = empty_set.add(DispatchKey::CPU); // Constructor + Comparison - ASSERT_EQ(*empty_set.begin(), DispatchKey::NumDispatchKeys); - ASSERT_EQ(*empty_set.end(), DispatchKey::NumDispatchKeys); - ASSERT_EQ(*mutated_set.begin(), static_cast(1)); + ASSERT_EQ(*empty_set.begin(), DispatchKey::EndOfFunctionalityKeys); + ASSERT_EQ(*empty_set.end(), DispatchKey::EndOfFunctionalityKeys); + ASSERT_EQ(*mutated_set.begin(), DispatchKey::CPU); ASSERT_TRUE(empty_set.begin() == empty_set.end()); ASSERT_TRUE(full_set.begin() != full_set.end()); @@ -90,16 +326,37 @@ TEST(DispatchKeySet, IteratorEmpty) { ASSERT_EQ(i, 0); } +TEST(DispatchKeySet, IteratorCrossProduct) { + // The iterator should return all runtime keys in the set, + // including the cross product of {backends} x {functionalities} + auto ks = + DispatchKeySet({BackendComponent::CPUBit, BackendComponent::CUDABit}) | + DispatchKeySet( + {DispatchKey::Dense, + DispatchKey::FPGA, + DispatchKey::AutogradFunctionality}); + + auto iter = ks.begin(); + // iterate through dense backends first. + ASSERT_EQ(DispatchKey::CPU, *(iter++)); + ASSERT_EQ(DispatchKey::CUDA, *(iter++)); + // FPGA doesn't have a backend bit, so it isn't included in the cross product. + ASSERT_EQ(DispatchKey::FPGA, *(iter++)); + // iterate through the autograd keys laster. + ASSERT_EQ(DispatchKey::AutogradCPU, *(iter++)); + ASSERT_EQ(DispatchKey::AutogradCUDA, *(iter++)); +} + TEST(DispatchKeySet, IteratorFull) { DispatchKeySet full_set(DispatchKeySet::FULL); uint8_t i = 0; for (const auto& it : full_set) { i++; - ASSERT_TRUE(it == static_cast(i)); - ASSERT_TRUE(it != DispatchKey::NumDispatchKeys); } - ASSERT_EQ(i, static_cast(DispatchKey::NumDispatchKeys) - 1); + // Total # of runtime entries includes an entry for DispatchKey::Undefined, + // which is not included when iterating through the DispatchKeySet. + ASSERT_EQ(i, num_runtime_entries - 1); } TEST(DispatchKeySet, IteratorRangeFull) { @@ -108,41 +365,61 @@ TEST(DispatchKeySet, IteratorRangeFull) { for (DispatchKey dispatch_key : full_set) { i++; - ASSERT_TRUE(dispatch_key == static_cast(i)); } - ASSERT_EQ(i, static_cast(DispatchKey::NumDispatchKeys) - 1); -} - -TEST(DispatchKeySet, SpecificKeys) { - DispatchKeySet keyset({ - static_cast(0), // Undefined should be ignored - static_cast(4), - static_cast(10), - static_cast(15), - }); - std::unordered_set visited_keys; - - for (DispatchKey key : keyset) { - visited_keys.insert(key); - } - - ASSERT_EQ(visited_keys.size(), 3); - ASSERT_TRUE( - visited_keys.find(static_cast(4)) != visited_keys.end()); - ASSERT_TRUE( - visited_keys.find(static_cast(10)) != visited_keys.end()); - ASSERT_TRUE( - visited_keys.find(static_cast(15)) != visited_keys.end()); + // Total # of runtime entries includes an entry for DispatchKey::Undefined, + // which is not included when iterating through the DispatchKeySet. 
+ ASSERT_EQ(i, num_runtime_entries - 1); } TEST(DispatchKeySet, FailAtEndIterator) { DispatchKeySet full_set(DispatchKeySet::FULL); uint64_t raw_repr = full_set.raw_repr(); + // doesn't throw + DispatchKeySet::iterator(&raw_repr, num_backends + num_functionality_keys); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) EXPECT_THROW( DispatchKeySet::iterator( - &raw_repr, static_cast(DispatchKey::NumDispatchKeys) + 1), + &raw_repr, num_backends + num_functionality_keys + 1), c10::Error); } + +TEST(DispatchKeySet, TestKeyOrderingInvariants) { + for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); + i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); + i++) { + auto k = static_cast(i); + // Note [The Ordering of Per-Backend Dispatch Keys Matters!] + // The DispatchKey enum includes all of the runtime keys for + // Dense/Sparse/Quantized/Autograd, (e.g. CPU, CUDA, SparseCPU, SparseCUDA, + // AutogradCPU, AutogradCUDA, etc). And we expect the ordering of those keys + // to be the same as the ordering of the backends in the `BackendComponent` + // enum. This makes several utilities in `DispatchKey.h` and + // `DispatchKeySet.h` significantly easier to implement. The purpose of the + // test is to assert (through CI) that this invariant is maintained. + // + // The only way that we can really check this invariant is by + // comparing the string names of each enum. + // We only really care about the ordering for "real" keys that are actually + // used, which we expect to be able to print properly. This saves us from + // having to enumerate the full set of possible runtime keys in + // DispatchKey::toString(). It also relies on toString() being implemented + // correctly. + auto functionality_str = std::string(toString(k)); + if (functionality_str == "UNKNOWN_TENSOR_TYPE_ID") + continue; + + auto computed_backend_k = toBackendComponent(k); + auto computed_backend_str = std::string(toString(computed_backend_k)); + // Skip, e.g., the "Bit" from "CPUBit" + computed_backend_str = + computed_backend_str.substr(0, computed_backend_str.size() - 3); + + ASSERT_TRUE( + functionality_str.find(computed_backend_str) != std::string::npos) + << "DispatchKey invariant broken! Found a key that is not ordered correctly" + << " with its backend bit. 
key = " << toString(k) << ", " << k + << ", computed backend = " << toString(computed_backend_k); + } +} diff --git a/test/test_dispatch.py b/test/test_dispatch.py index 37a6054f915..c97e9e382fc 100644 --- a/test/test_dispatch.py +++ b/test/test_dispatch.py @@ -532,8 +532,8 @@ AutogradXLA: fn_math [math kernel] lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kCompositeImplicitAutograd, [](const Tensor & x) { return x }) lambda m: m.impl_t_t("foo", "CompositeImplicitAutograd", debug="fn_math"), - # m.impl("foo", torch::kQuantizedCPU, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "QuantizedCPU", debug="fn_quantizedcpu"), + # m.impl("foo", torch::kFPGA, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "FPGA", debug="fn_fpga"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -541,12 +541,12 @@ name: test::foo schema: test::foo(Tensor x) -> (Tensor) debug: registered at /dev/null:0 alias analysis kind: FROM_SCHEMA -QuantizedCPU: fn_quantizedcpu :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +FPGA: fn_fpga :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] CompositeImplicitAutograd[alias]: fn_math :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] ''') # computed dispatch table is too big, so we only check on a few entries we're interested in. - extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('QuantizedCPU',)) + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('FPGA',)) self.assertExpectedInline(extracted_table, '''\ Undefined: fn_math [math kernel] @@ -557,7 +557,7 @@ AutogradOther: ambiguous_autogradother [ambiguous autogradother] AutogradCPU: fn_math [math kernel] AutogradCUDA: fn_math [math kernel] AutogradXLA: fn_math [math kernel] -QuantizedCPU: fn_quantizedcpu [kernel] +FPGA: fn_fpga [kernel] ''') def test_computed_table_with_cpu_defaultbackend(self): @@ -616,7 +616,7 @@ CompositeExplicitAutograd[alias]: fn_defaultbackend :: (Tensor _0) -> (Tensor _0 ''') # computed dispatch table is too big, so we only check on a few entries we're interested in. 
- extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('QuantizedCPU',)) + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('FPGA',)) self.assertExpectedInline(extracted_table, '''\ Undefined: fn_defaultbackend [default backend kernel] @@ -627,7 +627,7 @@ AutogradOther: fn_autograd [autograd kernel] AutogradCPU: fn_autograd [autograd kernel] AutogradCUDA: fn_autograd [autograd kernel] AutogradXLA: fn_autograd [autograd kernel] -QuantizedCPU: fn_defaultbackend [default backend kernel] +FPGA: fn_defaultbackend [default backend kernel] ''') def test_computed_table_with_cpu_autograd_math_defaultbackend(self): @@ -808,7 +808,7 @@ key kernel CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -QuantizedCPU fn_CompositeImplicitAutograd [math kernel] +FPGA fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] AutogradCPU fallthrough [backend fallback] AutogradXLA fallthrough [backend fallback] @@ -829,7 +829,7 @@ key kernel CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -QuantizedCPU fn_CompositeImplicitAutograd [math kernel] +FPGA fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] AutogradCPU fn_AutogradCPU [kernel] AutogradXLA fallthrough [backend fallback] @@ -864,7 +864,7 @@ key kernel CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -QuantizedCPU fn_CompositeExplicitAutograd [default backend kernel] +FPGA fn_CompositeExplicitAutograd [default backend kernel] AutogradOther fallthrough [backend fallback] AutogradCPU fn_AutogradCPU [kernel] AutogradXLA fallthrough [backend fallback] @@ -889,7 +889,7 @@ CompositeExplicitAutograd[alias] fn_CompositeExplicitAutograd def test_autogradother(self): dispatcher = PythonDispatcher() - dispatcher.register(["CPU", "QuantizedCPU", "CompositeImplicitAutograd"]) + dispatcher.register(["CPU", "FPGA", "CompositeImplicitAutograd"]) self.assertExpectedInline( dispatcher.dispatchTable(), '''\ @@ -900,7 +900,7 @@ key kernel CPU fn_CPU [kernel] XLA fn_CompositeImplicitAutograd [math kernel] Lazy fn_CompositeImplicitAutograd [math kernel] -QuantizedCPU fn_QuantizedCPU [kernel] +FPGA fn_FPGA [kernel] AutogradOther ambiguous_autogradother [ambiguous autogradother] AutogradCPU fallthrough [backend fallback] AutogradXLA fn_CompositeImplicitAutograd [math kernel] @@ -915,8 +915,8 @@ AutogradLazy fn_CompositeImplicitAutograd [math kernel] Registered Kernels key kernel --------------------------- +FPGA fn_FPGA CPU fn_CPU -QuantizedCPU fn_QuantizedCPU CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd ''' ) diff --git a/test/test_sparse.py b/test/test_sparse.py index cbc98f572bd..34d5155bfa8 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -3410,21 +3410,21 @@ class TestSparseOneOff(TestCase): def test_cuda_from_cpu(self): with self.assertRaisesRegex( RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4), [3, 4, 4]) with self.assertRaisesRegex( RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4, 0), [3, 4, 4, 0]) with 
self.assertRaisesRegex( RuntimeError, - "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): + "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), torch.randn(0, 4, 4, 0), [0, 4, 4, 0]) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 6bc0d7df100..5abbc3487e8 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -48,58 +48,66 @@ class DispatchKey(Enum): Undefined = 0 CatchAll = Undefined - CPU = auto() - CUDA = auto() - HIP = auto() + Dense = auto() FPGA = auto() ORT = auto() - XLA = auto() - Lazy = auto() Vulkan = auto() Metal = auto() - XPU = auto() MKLDNN = auto() OpenGL = auto() OpenCL = auto() IDEEP = auto() - QuantizedCPU = auto() - QuantizedCUDA = auto() - QuantizedXPU = auto() + Quantized = auto() CustomRNGKeyId = auto() MkldnnCPU = auto() - SparseCPU = auto() - SparseCUDA = auto() + Sparse = auto() SparseCsrCPU = auto() SparseCsrCUDA = auto() - SparseHIP = auto() - SparseXPU = auto() - NestedTensor = auto() - PrivateUse1 = auto() - PrivateUse2 = auto() - PrivateUse3 = auto() - EndOfBackendKeys = PrivateUse3 ZeroTensor = auto() Meta = auto() BackendSelect = auto() Named = auto() AutogradOther = auto() - AutogradCPU = auto() - AutogradCUDA = auto() - AutogradXLA = auto() - AutogradLazy = auto() + AutogradFunctionality = auto() AutogradNestedTensor = auto() - AutogradXPU = auto() - AutogradPrivateUse1 = auto() - AutogradPrivateUse2 = auto() - AutogradPrivateUse3 = auto() Tracer = auto() Autocast = auto() Batched = auto() VmapMode = auto() TESTING_ONLY_GenericWrapper = auto() TESTING_ONLY_GenericMode = auto() - NumDispatchKeys = auto() + EndOfFunctionalityKeys = TESTING_ONLY_GenericMode + + CPU = auto() + CUDA = auto() + HIP = auto() + XLA = auto() + Lazy = auto() + XPU = auto() + NestedTensor = auto() + PrivateUse1 = auto() + PrivateUse2 = auto() + PrivateUse3 = auto() + + QuantizedCPU = auto() + QuantizedCUDA = auto() + QuantizedXPU = auto() + + SparseCPU = auto() + SparseCUDA = auto() + SparseHIP = auto() + SparseXPU = auto() + + AutogradCPU = auto() + AutogradCUDA = auto() + AutogradXLA = auto() + AutogradLazy = auto() + AutogradXPU = auto() + AutogradPrivateUse1 = auto() + AutogradPrivateUse2 = auto() + AutogradPrivateUse3 = auto() + Autograd = auto() CompositeImplicitAutograd = auto() CompositeExplicitAutograd = auto() diff --git a/torch/_python_dispatcher.py b/torch/_python_dispatcher.py index aa19a18efb3..fe0c6253fdd 100644 --- a/torch/_python_dispatcher.py +++ b/torch/_python_dispatcher.py @@ -15,9 +15,9 @@ keys for a single example of each use case. These use cases are listed below: - CPU/AutogradCPU: represents in-tree backends which we usually have dedicated inference & autograd kernel in pytorch core library. E.g. CPU, CUDA -- QuantizedCPU/AutogradOther: represents in-tree backends which we usually have backend specific +- FPGA/AutogradOther: represents in-tree backends which we usually have backend specific inference kernels, but they share the same autograd kernel specified in AutogradOther. - E.g. QuantizedCPU, QuantizedCUDA + E.g. FPGA, SparseCsrCPU - XLA/AutogradXLA: represents out-of-tree backends which we don't have either inference or autograd kernel defined in pytorch core library. Backend owner is responsible for registering both inference & autograd kernels in their extensions(e.g. torch-xla) for the operators they support. 
@@ -53,7 +53,7 @@ class PythonDispatcher: name = "foo" runtime_keys = [ "CPU", "AutogradCPU", - "QuantizedCPU", "AutogradOther", + "FPGA", "AutogradOther", "XLA", "AutogradXLA", "Lazy", "AutogradLazy", ] From 8aa3620d73d9afcdbcf3d691cdd45ae05ec5e7d5 Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Mon, 14 Feb 2022 07:53:38 -0800 Subject: [PATCH 004/199] DispatchKeySet perf improvements (#72403) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72403 reland of D33301590 (https://github.com/pytorch/pytorch/commit/18cbe80f23c6c236361c45b5debff58b65359a23) ghstack-source-id: 148830729 Test Plan: CI, and running explicit mobile test: `buck test //fbandroid/instrumentation_tests/com/facebook/pytorch/bi_xray:instrumentation_test -c test.external_runner=tpx -- --regex 'testBIXRayModel.*PyTorchBIXRayInstrumentationTest' --force-remote-execution --run-disabled` Reviewed By: albanD Differential Revision: D34034847 fbshipit-source-id: a930e44513a76c0c82c9d27f0fc2d2a6d7d90cf9 (cherry picked from commit 7f1ea7584c4fb098645428070a94b2f231af787b) --- .../native/quantized/cpu/fbgemm_utils.cpp | 3 +- c10/core/DispatchKeySet.h | 9 ++ c10/core/TensorImpl.cpp | 7 +- c10/core/TensorImpl.h | 89 ++++++++++++------- 4 files changed, 70 insertions(+), 38 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index ab6df06f7b7..2cb25f360ba 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -160,9 +160,10 @@ Tensor MakeStridedQTensorCPU( allocator->allocate(size_bytes), allocator, /* resizable = */ true); + constexpr auto quantized_cpu_ks = at::DispatchKeySet(at::DispatchKey::QuantizedCPU); auto tensor = detail::make_tensor( storage, - at::DispatchKeySet(at::DispatchKey::QuantizedCPU), + quantized_cpu_ks, dtype, quantizer); get_qtensorimpl(tensor)->set_sizes_and_strides(sizes, strides); diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 1834ca0aa96..781df91767a 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -606,6 +606,15 @@ constexpr DispatchKeySet default_excluded_set = DispatchKeySet({ constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView); +constexpr DispatchKeySet python_ks = DispatchKeySet(DispatchKey::Python); + +constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse); + +constexpr DispatchKeySet sparse_csr_ks = + DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA}); + +constexpr DispatchKeySet mkldnn_ks = DispatchKeySet(DispatchKey::MkldnnCPU); + // backend dispatch keys that map to DispatchKey::AutogradOther // NB: keys in this set also get associated with CompositeImplicitAutograd constexpr DispatchKeySet autogradother_backends = diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 379807df0c7..5a772be7a14 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -148,8 +148,7 @@ TensorImpl::TensorImpl( numel_(0), data_type_(data_type), device_opt_(storage_.device()), - key_set_(key_set.remove( - DispatchKey::Python)) { // See [Note: Python key removal] + key_set_(key_set - c10::python_ks) { // See [Note: Python key removal] init_bitfields(); // Inference tensor doesn't have version counter. 
if (!is_inference()) { @@ -194,8 +193,8 @@ TensorImpl::TensorImpl( key_set = key_set | getAutocastRelatedKeySetFromBackend(k); - key_set = - key_set.remove(DispatchKey::Python); // See [Note: Python key removal] + // See [Note: Python key removal] + key_set = key_set - c10::python_ks; // Inference tensor doesn't have autograd related keys. if (inference_mode) { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index d703cb2abb8..737ba18f96e 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -838,91 +838,103 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_sparse() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::Sparse); + return key_set_.has_all(c10::sparse_ks); } // Whether a tensor is sparse COO or not. Use is_sparse_csr for checking CSR // format. bool is_sparse_csr() const { - return key_set_.has(DispatchKey::SparseCsrCPU) || - key_set_.has(DispatchKey::SparseCsrCUDA); + return key_set_.has_any(c10::sparse_csr_ks); } bool is_quantized() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::Quantized); + constexpr auto quantized_ks = DispatchKeySet(DispatchKey::Quantized); + return key_set_.has_all(quantized_ks); } bool is_meta() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::Meta); + constexpr auto meta_ks = DispatchKeySet(DispatchKey::Meta); + return key_set_.has_all(meta_ks); } bool is_cpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has_backend(BackendComponent::CPUBit) || - key_set_.has(DispatchKey::SparseCsrCPU) || - key_set_.has(DispatchKey::MkldnnCPU); + constexpr auto cpu_bits_ks = DispatchKeySet(BackendComponent::CPUBit) | + DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::MkldnnCPU}); + return key_set_.has_any(cpu_bits_ks); } bool is_cuda() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has_backend(BackendComponent::CUDABit) || - key_set_.has(DispatchKey::SparseCsrCUDA); + constexpr auto cuda_bits_ks = DispatchKeySet(BackendComponent::CUDABit) | + DispatchKeySet(DispatchKey::SparseCsrCUDA); + return key_set_.has_any(cuda_bits_ks); } bool is_xpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has_backend(BackendComponent::XPUBit); + constexpr auto xpu_ks = DispatchKeySet(BackendComponent::XPUBit); + return key_set_.has_all(xpu_ks); } bool is_xla() const { - return key_set_.has_backend(BackendComponent::XLABit); + constexpr auto xla_ks = DispatchKeySet(BackendComponent::XLABit); + return key_set_.has_all(xla_ks); } bool is_hpu() const { - return key_set_.has_backend(BackendComponent::HPUBit); + constexpr auto hpu_ks = DispatchKeySet(BackendComponent::HPUBit); + return key_set_.has_all(hpu_ks); } bool is_lazy() const { - return key_set_.has_backend(BackendComponent::LazyBit); + constexpr auto lazy_ks = DispatchKeySet(BackendComponent::LazyBit); + return key_set_.has_all(lazy_ks); } bool is_hip() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. 
- return key_set_.has_backend(BackendComponent::HIPBit); + constexpr auto hip_ks = DispatchKeySet(BackendComponent::HIPBit); + return key_set_.has_all(hip_ks); } bool is_ve() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has_backend(BackendComponent::VEBit); + constexpr auto ve_ks = DispatchKeySet(BackendComponent::VEBit); + return key_set_.has_all(ve_ks); } bool is_mkldnn() const { - return key_set_.has(DispatchKey::MkldnnCPU); + return key_set_.has_all(c10::mkldnn_ks); } bool is_vulkan() const { - return key_set_.has(DispatchKey::Vulkan); + constexpr auto vulkan_ks = DispatchKeySet(DispatchKey::Vulkan); + return key_set_.has_all(vulkan_ks); } bool is_metal() const { - return key_set_.has(DispatchKey::Metal); + constexpr auto metal_ks = DispatchKeySet(DispatchKey::Metal); + return key_set_.has_all(metal_ks); } bool is_mlc() const { - return key_set_.has(DispatchKey::MLC); + constexpr auto mls_ks = DispatchKeySet(DispatchKey::MLC); + return key_set_.has_all(mls_ks); } bool is_ort() const { - return key_set_.has(DispatchKey::ORT); + constexpr auto ort_ks = DispatchKeySet(DispatchKey::ORT); + return key_set_.has_all(ort_ks); } // TODO: remove this once we don't automatically enabled Autograd dispatch @@ -938,8 +950,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Invariant: // Inference tensor has version_counter_.enabled() == false bool is_inference() { - bool no_ADInplaceOrView = !key_set_.has(c10::DispatchKey::ADInplaceOrView); - bool no_Autograd = (key_set_ & c10::autograd_dispatch_keyset).empty(); + bool no_ADInplaceOrView = !key_set_.has_any(c10::inplace_or_view_ks); + bool no_Autograd = !key_set_.has_any(c10::autograd_dispatch_keyset); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( no_ADInplaceOrView == no_Autograd, "ADInplaceOrView and Autograd keys must be on/off at the same time."); @@ -960,14 +972,22 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { Layout layout() const { // NB: This method is not virtual and avoid dispatches for perf. - if (is_sparse()) { + // strided is also the most common layout type, so we check for + // strided case first. 
+ // This keyset must also be kept in sync with the logic in + // is_sparse() / is_sparse_csr() / is_mkldnn() + constexpr auto sparse_and_sparsecsr_and_mkldnn_ks = + c10::sparse_ks | c10::sparse_csr_ks | c10::mkldnn_ks; + if (!key_set_.has_any(sparse_and_sparsecsr_and_mkldnn_ks)) { + return kStrided; + } else if (is_sparse()) { return kSparse; } else if (is_sparse_csr()) { return kSparseCsr; - } else if (is_mkldnn()) { - return kMkldnn; } else { - return kStrided; + TORCH_INTERNAL_ASSERT( + is_mkldnn(), "There is an error in the layout calculation logic."); + return kMkldnn; } } @@ -1053,7 +1073,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the imaginary part of the tensor should be negated */ inline bool is_conj() const { - return key_set_.has(DispatchKey::Conjugate); + constexpr auto conjugate_ks = DispatchKeySet(DispatchKey::Conjugate); + return key_set_.has_all(conjugate_ks); } /** @@ -1073,7 +1094,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the tensor is a zerotensor */ inline bool _is_zerotensor() const { - return key_set_.has(DispatchKey::ZeroTensor); + constexpr auto zerotensor_ks = DispatchKeySet(DispatchKey::ZeroTensor); + return key_set_.has_all(zerotensor_ks); } /** @@ -1093,7 +1115,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the tensor should be negated */ inline bool is_neg() const { - return key_set_.has(DispatchKey::Negative); + constexpr auto negative_ks = DispatchKeySet(DispatchKey::Negative); + return key_set_.has_all(negative_ks); } /** @@ -1464,14 +1487,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_python_dispatch(bool k) { if (k) { - key_set_ = key_set_.add(DispatchKey::Python); + key_set_ = key_set_.add(c10::python_ks); } else { - key_set_ = key_set_.remove(DispatchKey::Python); + key_set_ = key_set_ - c10::python_ks; } } bool is_python_dispatch() const { - return key_set_.has(DispatchKey::Python); + return key_set_.has_all(c10::python_ks); } /** From 89c934f4b89580095324d17603610632afb50993 Mon Sep 17 00:00:00 2001 From: Michael Melesse Date: Mon, 14 Feb 2022 09:20:24 -0800 Subject: [PATCH 005/199] [ROCM] Navi21 Enablement 2: Depthwise kernels (#72682) Summary: This PR is a follow up to https://github.com/pytorch/pytorch/pull/69942. We are adding support to Navi21 GPUs which have a warpsize of 32. We cannot rely on a constant so we have to dynamically look up the warpsize when launching the kernel on the host side. Inside device functions this is not needed and the compiler can correctly detect the correct warpsize to replace the C10_WARP_SIZE constant. 
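To make the host/device split concrete, here is a minimal illustrative sketch of the launch-side pattern (not part of this patch; the launcher name and the shared-memory math are hypothetical, while `at::cuda::warp_size()` and `C10_WARP_SIZE` are the real symbols touched by the diff below):

```cpp
#include <ATen/cuda/CUDAContext.h>  // at::cuda::warp_size()
#include <c10/util/Exception.h>     // TORCH_INTERNAL_ASSERT

// Hypothetical host-side launcher showing the pattern: query the warp size of
// the current device at runtime instead of baking in C10_WARP_SIZE, which is a
// compile-time constant and wrong on wave32 GPUs such as Navi21.
void launch_grad_weight_kernel(int block_x /* threads per block */) {
  const int warp_size = at::cuda::warp_size();            // 32 on Navi21/NVIDIA, 64 on other AMD GPUs
  TORCH_INTERNAL_ASSERT(block_x % warp_size == 0);        // one warp-sized lane per reduction group
  const int smem = (block_x / warp_size) * sizeof(float); // per-warp shared-memory staging
  // kernel<<<grid, block_x, smem, stream>>>(...);        // device code can keep using C10_WARP_SIZE
  (void)smem;
}
```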
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72682 Reviewed By: soulitzer Differential Revision: D34174082 Pulled By: ngimel fbshipit-source-id: 2810e4d1d0f518f20a73a40a0b5c3d71ea120b9e (cherry picked from commit df003a5bb54cefbf2351f9c1bc8cf91a0609495a) --- aten/src/ATen/native/cuda/DepthwiseConv2d.cu | 7 ++++--- aten/src/ATen/native/cuda/DepthwiseConv3d.cu | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu index ac32bfafe1a..80fa0490fb7 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu @@ -442,7 +442,7 @@ void conv_depthwise2d_backward_out( int getGradParamsNumThreads(int batchSize) { //warp per item in a batch, up to a maximum constexpr int MAX_BLOCK_SIZE = 256; - return std::min(batchSize * C10_WARP_SIZE, MAX_BLOCK_SIZE); + return std::min(batchSize * at::cuda::warp_size(), MAX_BLOCK_SIZE); } void conv_depthwise2d_grad_weight_out( @@ -498,8 +498,9 @@ void conv_depthwise2d_grad_weight_out( const auto input_a = input.packed_accessor32(); const auto grad_weight_a = grad_weight.packed_accessor32(); using acc_t = at::acc_type; - TORCH_INTERNAL_ASSERT(block.x % C10_WARP_SIZE == 0); - int smem = (block.x / C10_WARP_SIZE) * sizeof(acc_t); + int warp_size = at::cuda::warp_size(); + TORCH_INTERNAL_ASSERT(block.x % warp_size == 0); + int smem = (block.x / warp_size) * sizeof(acc_t); conv_depthwise2d_grad_weight_kernel<<>>( grad_output_a, input_a, grad_weight_a, batchSize, inputChannels, outputChannels, depthwiseMultiplier, width, height, outputWidth, outputHeight, kW, kH, dW, dH, padW, padH, dilationW, dilationH); diff --git a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu index 8fbe14b797a..8924591d577 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu @@ -596,9 +596,10 @@ std::tuple _depthwise_3d_backward_cuda_out( TORCH_CHECK(padding[i] * 2 + input.size(i + 2) <= int_max, "Padded input tensor is too large."); } - TORCH_CHECK(grad_output_.size(0) * grad_output_.size(2) < int_max - block / C10_WARP_SIZE && - grad_output_.size(3) <= int_max - C10_WARP_SIZE && - grad_output_.size(4) <= int_max - C10_WARP_SIZE, + int64_t warp_size = at::cuda::warp_size(); + TORCH_CHECK(grad_output_.size(0) * grad_output_.size(2) < int_max - block / warp_size && + grad_output_.size(3) <= int_max - warp_size && + grad_output_.size(4) <= int_max - warp_size, "Output size is too large."); DWCONV3D_BACKWARD_WEIGHT_DISPATCH_SPECIALIZATION(1, 1) From 7d542a4f2b229657a64188f942ff88c81b5de5e5 Mon Sep 17 00:00:00 2001 From: lkct Date: Mon, 14 Feb 2022 09:21:12 -0800 Subject: [PATCH 006/199] Fix type annotation for `torch.backends.cudnn.allow_tf32` (#72757) Summary: Fixes https://github.com/pytorch/pytorch/issues/72753 Pull Request resolved: https://github.com/pytorch/pytorch/pull/72757 Reviewed By: samdow Differential Revision: D34204436 Pulled By: ngimel fbshipit-source-id: 3528efd7bdf72c1d9338806555ecb643ab94ffeb (cherry picked from commit 7036c2e6e66cbcfeb6ff9042b84bbd392413b5f1) --- torch/backends/cudnn/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/backends/cudnn/__init__.py b/torch/backends/cudnn/__init__.py index 4f05e06225b..d89049b5f3c 100644 --- a/torch/backends/cudnn/__init__.py +++ b/torch/backends/cudnn/__init__.py @@ -133,3 +133,4 @@ sys.modules[__name__] = CudnnModule(sys.modules[__name__], __name__) 
enabled: bool deterministic: bool benchmark: bool +allow_tf32: bool From e7985e3c60d18bcc8fbd8d89b06ff2c3754306e0 Mon Sep 17 00:00:00 2001 From: Eddie Yan Date: Mon, 14 Feb 2022 09:21:29 -0800 Subject: [PATCH 007/199] Properly initialize `grad_weight` in `raw_cudnn_convolution_backward_weight_out` (#72157) Summary: https://github.com/pytorch/pytorch/issues/71521 attempted to fix an issue where the `test_conv_large` test was producing `NaN` values after the backward pass, yielding a bogus comparison between the result and the expected result. While tweaking the initialization of the conv layer seemed to fix this behavior, it was actually just masking the real issue, which was that `grad_weight` is not guaranteed to be initialized in `raw_cudnn_convolution_backward_weight_out` when the backward operation is split. Specifically, the `grad_weight` tensor is expected to be directly written to by a `cudnn` kernel (which does occur in most cases) so it does not need to be initialized, but splitting introduces an intermediate `grad_weight_` tensor that holds the intermediate gradients and then accumulates into `grad_weight` without initializing it first. This PR tweaks this behavior so that now accumulation is done with a zero'd tensor, and also adds the change of doing the accumulation in an accumulation dtype. The hacky workaround masking the issue is also reverted, with the safeguard against comparing `NaN` values (using the reference tensor for scale computation) kept in place. CC ngimel ptrblck Pull Request resolved: https://github.com/pytorch/pytorch/pull/72157 Reviewed By: malfet Differential Revision: D34147547 Pulled By: ngimel fbshipit-source-id: 056c19f727eeef96347db557528272e24eae4223 (cherry picked from commit 24c7f77a81c6ef5b0371ef0030e7003dcce55236) --- aten/src/ATen/native/cudnn/Conv_v7.cpp | 6 +++++- test/test_nn.py | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index 502b32a5b44..a05eb021f0e 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -808,6 +808,9 @@ void raw_cudnn_convolution_backward_weight_out( int64_t split_size = std::max(1024 * 1024 * 512 / max_inner_size, 1L); int64_t num_splits = (n + split_size - 1) / split_size; if (split_size * max_inner_size < int_max) { + const auto kAccType = (grad_weight.scalar_type() == kHalf || grad_weight.scalar_type() == kBFloat16) + ? 
kFloat : grad_weight.scalar_type(); + Tensor grad_weight_accumulator = at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType)); for (const auto i : c10::irange(num_splits)) { int64_t start = split_size * i; int64_t split_size_ = std::min(split_size, n - start); @@ -815,8 +818,9 @@ void raw_cudnn_convolution_backward_weight_out( Tensor grad_output_ = grad_output.narrow(0, start, split_size_); Tensor grad_weight_ = at::empty_like(grad_weight); raw_cudnn_convolution_backward_weight_out_32bit(grad_weight_, grad_output_, input_, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - grad_weight.add_(grad_weight_); + grad_weight_accumulator.add_(grad_weight_); } + grad_weight.copy_(grad_weight_accumulator); return; } // If control flow reaches here, this means even splitting N is not enough, then things starts to become complicated: diff --git a/test/test_nn.py b/test/test_nn.py index c6a2e24e6cf..d0c7d8e14a4 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -16427,7 +16427,6 @@ class TestNNDeviceType(NNTestCase): def test_conv_large(self, device): dtype = torch.half if self.device_type == 'cuda' else torch.float conv = nn.Conv2d(2, 2, 8, 8, bias=False).to(device).to(dtype) - conv.weight = torch.nn.Parameter(torch.randn(2, 2, 8, 8, device=device, dtype=dtype) / 64) input_large = torch.randn(4097, 2, 512, 512, dtype=dtype, device=device) # forward ret = conv(input_large) From 9981aadee1c4bf3edec53c6e0dbe3551fc2838dd Mon Sep 17 00:00:00 2001 From: dzdang Date: Mon, 14 Feb 2022 11:02:52 -0800 Subject: [PATCH 008/199] [Quant][core][devs] Separated implementations for quantized & non-quantized tensors in index_select_cuda (#72407) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72407 This PR is part of a series of PRs addressing https://github.com/pytorch/pytorch/issues/54150, related to using dispatcher for calls to quantized backends as opposed to if/else conditionals. This particular PR separates the calls to quantized & non-quantized backends for index_select_cuda using a dispatcher. 
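For illustration only (not part of this patch): a minimal usage sketch of the two paths the dispatcher now selects between, assuming a CUDA build with per-tensor-affine quantized tensor support; the tensor names below are made up for the example.

```python
import torch

# Dense CUDA tensor: aten::index_select routes to the CUDA kernel
# (index_select_cuda in the yaml dispatch table below).
x = torch.randn(4, 3, device="cuda")
idx = torch.tensor([0, 2], device="cuda")
dense_out = torch.index_select(x, 0, idx)

# Per-tensor-affine quantized CUDA tensor: the same call now routes to a
# separate QuantizedCUDA kernel (index_select_quantized_cuda) instead of
# an if/else branch inside one function.
q = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.qint8)
quant_out = torch.index_select(q, 0, idx)
```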
Differential Revision: D34035694 D34035694 Test Plan: Imported from OSS Reviewed By: jerryzh168 Pulled By: dzdang fbshipit-source-id: ff66b0c6c1be841a6cfaaa3339e7b0b19260708e (cherry picked from commit f92c008d8eb58a6c4b4d0132bd592ce70d1bdce7) --- aten/src/ATen/native/cuda/Indexing.cu | 19 ++++++++++--------- aten/src/ATen/native/native_functions.yaml | 3 ++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 9ea21b2171e..b215968fea5 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -905,15 +905,16 @@ Tensor& index_select_out_cuda( } Tensor index_select_cuda(const Tensor& self, int64_t dim, const Tensor& index) { - Tensor out; - if (self.is_quantized()){ - TORCH_CHECK( - self.qscheme() == kPerTensorAffine, - "Only per_tensor quantized quantized tensors are supported by index_select.") - out = at::empty_quantized({0}, self); - } else { - out = at::empty({0}, self.options()); - } + Tensor out = at::empty({0}, self.options()); + at::native::index_select_out_cuda(self, dim, index, out); + return out; +} + +Tensor index_select_quantized_cuda(const Tensor& self, int64_t dim, const Tensor& index) { + TORCH_CHECK( + self.qscheme() == kPerTensorAffine, + "Only per_tensor quantized quantized tensors are supported by index_select.") + Tensor out = at::empty_quantized({0}, self); at::native::index_select_out_cuda(self, dim, index, out); return out; } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 93c9ab24c79..a84eddacbb7 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6867,7 +6867,8 @@ dispatch: CPU: index_select_cpu_ QuantizedCPU: index_select_quantized_cpu_ - CUDA, QuantizedCUDA: index_select_cuda + CUDA: index_select_cuda + QuantizedCUDA: index_select_quantized_cuda SparseCPU: index_select_sparse SparseCUDA: index_select_sparse From aa44480b40075d4f696605f3ab7a03d87372b4e2 Mon Sep 17 00:00:00 2001 From: Pruthvi Madugundu Date: Mon, 14 Feb 2022 11:19:43 -0800 Subject: [PATCH 009/199] [ROCm] Enable sort operator BF16 support (#71226) Summary: Related to [https://github.com/pytorch/pytorch/issues/58196](https://github.com/pytorch/pytorch/pull/58196) cc jeffdaily sunway513 jithunnair-amd ROCmSupport KyleCZH amathews-amd Pull Request resolved: https://github.com/pytorch/pytorch/pull/71226 Reviewed By: malfet Differential Revision: D34152115 Pulled By: seemethere fbshipit-source-id: 53841c91976bdb5a0002362f22a54ec23aa2f78f (cherry picked from commit 963027c7f28cf20e1c4e5722eb62b5629e735a8e) --- aten/src/ATen/cuda/cub.cu | 3 -- aten/src/ATen/cuda/cub.cuh | 29 +++++++++++++++---- aten/src/ATen/native/cuda/Sort.cu | 10 +++---- test/test_sort_and_select.py | 7 ----- .../_internal/common_methods_invocations.py | 4 +-- 5 files changed, 31 insertions(+), 22 deletions(-) diff --git a/aten/src/ATen/cuda/cub.cu b/aten/src/ATen/cuda/cub.cu index 6915a1c2b98..8a64da6756c 100644 --- a/aten/src/ATen/cuda/cub.cu +++ b/aten/src/ATen/cuda/cub.cu @@ -57,10 +57,7 @@ AT_INSTANTIATE_SORT_PAIRS(int64_t, 4) AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8) -// BFloat16 is not supported by ROCm's radix sort -#if !AT_ROCM_ENABLED() AT_INSTANTIATE_SORT_PAIRS(c10::BFloat16, 8) -#endif } // namespace detail diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 6ac9905f571..bf51ccce49c 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ 
b/aten/src/ATen/cuda/cub.cuh @@ -45,17 +45,23 @@ #ifdef USE_ROCM #define NO_ROCM(x) +#define ROCM_HIPCUB(x) ::hipcub #else #define NO_ROCM(x) x +#define ROCM_HIPCUB(x) x #endif -#if !defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16() +#if !CUB_SUPPORTS_NV_BFLOAT16() || \ + (defined(USE_ROCM) && ROCM_VERSION >= 40500) +#if !defined(USE_ROCM) namespace at_cuda_detail { +#endif + // backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 template <> -struct cub::FpLimits +struct ROCM_HIPCUB(cub)::FpLimits { static __host__ __device__ __forceinline__ c10::BFloat16 Max() { unsigned short max_word = 0x7F7F; @@ -68,8 +74,14 @@ struct cub::FpLimits } }; -template <> struct cub::NumericTraits: cub::BaseTraits {}; -} +template <> +struct ROCM_HIPCUB(cub)::NumericTraits: + ROCM_HIPCUB(cub)::BaseTraits {}; + +#if !defined(USE_ROCM) +} // namespace at_cuda_detail +#endif + #endif #if !defined(USE_ROCM) @@ -93,13 +105,20 @@ struct cuda_type { using type = __half; }; -#if CUB_SUPPORTS_NV_BFLOAT16() +#if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16() template<> struct cuda_type { using type = __nv_bfloat16; }; +#elif (defined(USE_ROCM) && ROCM_VERSION >= 40500) + +template<> +struct cuda_type { + using type = hip_bfloat16; +}; + #endif } // namespace detail diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu index 15c89f7b76e..3ceb3001e50 100644 --- a/aten/src/ATen/native/cuda/Sort.cu +++ b/aten/src/ATen/native/cuda/Sort.cu @@ -325,14 +325,14 @@ void launch_stable_sort_kernel( TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort); int64_t *indices_ptr = indices.data_ptr(); -#if defined(USE_ROCM) - constexpr bool is_rocm = true; +#if (defined(USE_ROCM) && ROCM_VERSION < 40500) + constexpr bool is_rocm_bf16_sort_unsupported = true; #else - constexpr bool is_rocm = false; + constexpr bool is_rocm_bf16_sort_unsupported = false; #endif AT_DISPATCH_ALL_TYPES_AND3(kBool, kHalf, kBFloat16, self.scalar_type(), "sort", [&]{ - c10::guts::if_constexpr::value)>([&](auto _){ + c10::guts::if_constexpr::value)>([&](auto _){ const scalar_t *self_ptr = self.data_ptr(); scalar_t *values_ptr = values.data_ptr(); int64_t remaining = _(numel); @@ -353,7 +353,7 @@ void launch_stable_sort_kernel( values_ptr += n; indices_ptr += n; } - }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm"); }); + }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm < 4.5"); }); }); } diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py index b44b09ffa1d..840635acbbf 100644 --- a/test/test_sort_and_select.py +++ b/test/test_sort_and_select.py @@ -135,8 +135,6 @@ class TestSortAndSelect(TestCase): # FIXME: remove torch.bool from unsupported types once support is added for cub sort @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) def test_stable_sort(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return sizes = (100, 1000, 10000) for ncopies in sizes: x = torch.tensor([0, 1] * ncopies, dtype=dtype, device=device) @@ -230,8 +228,6 @@ class TestSortAndSelect(TestCase): # FIXME: remove torch.bool from unsupported types once support is added for cub sort @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) def test_stable_sort_against_numpy(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return if dtype in floating_types_and(torch.float16, torch.bfloat16): inf = float('inf') neg_inf = -float('inf') @@ -295,9 +291,6 @@ class 
TestSortAndSelect(TestCase): @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_msort(self, device, dtype): - if TEST_WITH_ROCM and dtype == torch.bfloat16: - return - def test(shape): tensor = make_tensor(shape, device, dtype, low=-9, high=9) if tensor.size() != torch.Size([]): diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index e9332bd4f01..c095f6a8523 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -13285,7 +13285,7 @@ op_db: List[OpInfo] = [ OpInfo('sort', dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), - dtypesIfROCM=all_types_and(torch.float16), + dtypesIfROCM=all_types_and(torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_sort, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -13931,7 +13931,7 @@ op_db: List[OpInfo] = [ OpInfo('msort', dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), - dtypesIfROCM=all_types_and(torch.float16), + dtypesIfROCM=all_types_and(torch.float16, torch.bfloat16), check_batched_gradgrad=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, From f87f753bb997b2da82f7d2a561ccb40ab4f6bd9d Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Mon, 14 Feb 2022 11:39:05 -0800 Subject: [PATCH 010/199] avoiding adding some functions to the public python API before 1.11 release (#72543) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72543 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D34085724 Pulled By: bdhirsh fbshipit-source-id: 941d5a90a6fa5328268d623e0e2b01577e4132ca (cherry picked from commit 6676a0c79a3b2bc1aa95e09e91eb92a6eca6b764) --- aten/src/ATen/native/native_functions.yaml | 2 +- .../check_forward_backward_compatibility.py | 1 + test/test_torch.py | 8 +++--- tools/autograd/derivatives.yaml | 2 +- torch/nn/init.py | 25 +++++++++++-------- torch/overrides.py | 2 +- .../_internal/common_methods_invocations.py | 2 +- 7 files changed, 23 insertions(+), 19 deletions(-) diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index a84eddacbb7..7c252141099 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -6061,7 +6061,7 @@ - func: scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor variants: function, method -- func: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor +- func: _scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? 
output_size=None) -> Tensor variants: function, method dispatch: CPU: scatter_reduce_two_cpu diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index e15ac0f29bc..2297dec5c2f 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -106,6 +106,7 @@ ALLOW_LIST = [ ("aten::_scatter_reduce", datetime.date(2022, 1, 31)), ("aten::native_multi_head_self_attention", datetime.date(9999, 1, 1)), ("aten::_native_multi_head_self_attention", datetime.date(9999, 1, 1)), + ("aten::scatter_reduce.two", datetime.date(2022, 3, 15)), ] ALLOW_LIST_COMPILED = [ diff --git a/test/test_torch.py b/test/test_torch.py index d06dfb97a54..e2422d1477d 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -5773,7 +5773,7 @@ class TestTorch(TestCase): for reduce in reduces: for dim in range(len(shape)): - output = input.scatter_reduce(dim, index, reduce, output_size=output_size) + output = input._scatter_reduce(dim, index, reduce, output_size=output_size) # Check that output is of the correct size output_shape = copy.copy(shape) @@ -5807,16 +5807,16 @@ class TestTorch(TestCase): self.assertTrue(torch.allclose(output, expected)) with self.assertRaisesRegex(RuntimeError, "Expected `dim` to be in range -3 to 2"): - torch.scatter_reduce(input, 4, index, "sum") + torch._scatter_reduce(input, 4, index, "sum") with self.assertRaisesRegex(RuntimeError, "Shape mismatch"): index2 = torch.randint(0, output_size, (10, ), dtype=torch.long, device=device) - torch.scatter_reduce(input, 0, index2, "sum") + torch._scatter_reduce(input, 0, index2, "sum") with self.assertRaisesRegex(RuntimeError, "Expected `index` values to be in range 0 to 2"): input2 = torch.randn(10, dtype=dtype, device=device) index2 = torch.tensor([0, 1, 0, 1, 2, 3, 3, 4, 4, 3]) - torch.scatter_reduce(input2, 0, index2, "sum", output_size=2) + torch._scatter_reduce(input2, 0, index2, "sum", output_size=2) def test_structseq_repr(self): a = torch.arange(250).reshape(5, 5, 10) diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 7f7c13f01aa..27e4007d569 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2595,6 +2595,6 @@ - name: _efficientzerotensor(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor output_differentiability: [False] -- name: scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor +- name: _scatter_reduce.two(Tensor self, int dim, Tensor index, str reduce, *, int? output_size=None) -> Tensor self: scatter_reduce_backward(grad, self, dim, index, reduce, result) index: non_differentiable diff --git a/torch/nn/init.py b/torch/nn/init.py index 357fb7498c5..ce83137845f 100644 --- a/torch/nn/init.py +++ b/torch/nn/init.py @@ -4,9 +4,6 @@ import warnings from torch import Tensor import torch -from ..overrides import ( - has_torch_function_variadic, - handle_torch_function) # These no_grad_* functions are necessary as wrappers around the parts of these # functions that use `with torch.no_grad()`. The JIT doesn't support context @@ -135,8 +132,8 @@ def uniform_(tensor: Tensor, a: float = 0., b: float = 1.) 
-> Tensor: >>> w = torch.empty(3, 5) >>> nn.init.uniform_(w) """ - if has_torch_function_variadic(tensor): - return handle_torch_function(uniform_, (tensor,), tensor=tensor, a=a, b=b) + if torch.overrides.has_torch_function_variadic(tensor): + return torch.overrides.handle_torch_function(uniform_, (tensor,), tensor=tensor, a=a, b=b) return _no_grad_uniform_(tensor, a, b) @@ -153,8 +150,8 @@ def normal_(tensor: Tensor, mean: float = 0., std: float = 1.) -> Tensor: >>> w = torch.empty(3, 5) >>> nn.init.normal_(w) """ - if has_torch_function_variadic(tensor): - return handle_torch_function(normal_, (tensor,), tensor=tensor, mean=mean, std=std) + if torch.overrides.has_torch_function_variadic(tensor): + return torch.overrides.handle_torch_function(normal_, (tensor,), tensor=tensor, mean=mean, std=std) return _no_grad_normal_(tensor, mean, std) def trunc_normal_(tensor: Tensor, mean: float = 0., std: float = 1., a: float = -2., b: float = 2.) -> Tensor: @@ -190,8 +187,8 @@ def constant_(tensor: Tensor, val: float) -> Tensor: >>> w = torch.empty(3, 5) >>> nn.init.constant_(w, 0.3) """ - if has_torch_function_variadic(tensor): - return handle_torch_function(constant_, (tensor,), tensor=tensor, val=val) + if torch.overrides.has_torch_function_variadic(tensor): + return torch.overrides.handle_torch_function(constant_, (tensor,), tensor=tensor, val=val) return _no_grad_fill_(tensor, val) @@ -393,8 +390,14 @@ def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'): >>> w = torch.empty(3, 5) >>> nn.init.kaiming_uniform_(w, mode='fan_in', nonlinearity='relu') """ - if has_torch_function_variadic(tensor): - return handle_torch_function(kaiming_uniform_, (tensor,), tensor=tensor, a=a, mode=mode, nonlinearity=nonlinearity) + if torch.overrides.has_torch_function_variadic(tensor): + return torch.overrides.handle_torch_function( + kaiming_uniform_, + (tensor,), + tensor=tensor, + a=a, + mode=mode, + nonlinearity=nonlinearity) if 0 in tensor.shape: warnings.warn("Initializing zero-element tensors is a no-op") diff --git a/torch/overrides.py b/torch/overrides.py index 76a5fe67069..c8ef49e7b9d 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -897,7 +897,7 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.saddmm: lambda input, mat1, mat2, beta=1, alpha=1, out=None: -1, torch.scatter: lambda input, dim, index, src: -1, torch.scatter_add: lambda input, dim, index, src: -1, - torch.scatter_reduce: lambda input, dim, index, reduce, output_size=None: -1, + torch._scatter_reduce: lambda input, dim, index, reduce, output_size=None: -1, torch.searchsorted: lambda sorted_sequence, input, out_int32=False, right=False, out=None: -1, torch.segment_reduce: lambda data, reduce="max", lengths=None, indices=None, axis=0, unsafe=False: -1, torch.select: lambda input, dim, index: -1, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index c095f6a8523..45c06edb9a3 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -15517,7 +15517,7 @@ op_db: List[OpInfo] = [ supports_fwgrad_bwgrad=True, ), OpInfo( - 'scatter_reduce', + '_scatter_reduce', dtypes=all_types_and(torch.float16, torch.bfloat16), sample_inputs_func=sample_inputs_scatter_reduce, supports_out=False, From 78e481d07d3ec94d89badf3beff52ca75615a10e Mon Sep 17 00:00:00 2001 From: Elijah Rippeth Date: Mon, 14 Feb 2022 11:48:02 -0800 Subject: [PATCH 011/199] add optional encoding 
argument to fileopener (#72715) Summary: Fixes https://github.com/pytorch/pytorch/issues/72713 TODO: add test cc ejguan Pull Request resolved: https://github.com/pytorch/pytorch/pull/72715 Reviewed By: samdow Differential Revision: D34212650 Pulled By: ejguan fbshipit-source-id: 78db4cc04ec0db8fd25b3d1e6c77eb0616075960 (cherry picked from commit c1898031c0a80c4dc7ef23a48d23acba70f465a1) --- torch/utils/data/datapipes/iter/fileopener.py | 13 +++++++++++-- torch/utils/data/datapipes/utils/common.py | 6 +++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/torch/utils/data/datapipes/iter/fileopener.py b/torch/utils/data/datapipes/iter/fileopener.py index b7198d78b49..6c10016dc75 100644 --- a/torch/utils/data/datapipes/iter/fileopener.py +++ b/torch/utils/data/datapipes/iter/fileopener.py @@ -1,5 +1,5 @@ from io import IOBase -from typing import Iterable, Tuple +from typing import Iterable, Tuple, Optional from torch.utils.data import IterDataPipe from torch.utils.data.datapipes.utils.common import get_file_binaries_from_pathnames, deprecation_warning @@ -15,6 +15,8 @@ class FileOpenerIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): the file is opened by ``open()``. It defaults to ``b`` which means open for reading in binary mode. Another option is to use ``t`` for text mode + encoding: An optional string that specifies the encoding of the + underlying file. It defaults to ``None`` to match the default encoding of ``open``. length: Nominal length of the datapipe Note: @@ -26,21 +28,28 @@ class FileOpenerIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): self, datapipe: Iterable[str], mode: str = 'r', + encoding: Optional[str] = None, length: int = -1): super().__init__() self.datapipe: Iterable = datapipe self.mode: str = mode + self.encoding: Optional[str] = encoding + if self.mode not in ('b', 't', 'rb', 'rt', 'r'): raise ValueError("Invalid mode {}".format(mode)) # TODO: enforce typing for each instance based on mode, otherwise # `argument_validation` with this DataPipe may be potentially broken + + if 'b' in mode and encoding is not None: + raise ValueError("binary mode doesn't take an encoding argument") + self.length: int = length # Remove annotation due to 'IOBase' is a general type and true type # is determined at runtime based on mode. Some `DataPipe` requiring # a subtype would cause mypy error. 
def __iter__(self): - yield from get_file_binaries_from_pathnames(self.datapipe, self.mode) + yield from get_file_binaries_from_pathnames(self.datapipe, self.mode, self.encoding) def __len__(self): if self.length == -1: diff --git a/torch/utils/data/datapipes/utils/common.py b/torch/utils/data/datapipes/utils/common.py index e15be6999ab..9fd3b706024 100644 --- a/torch/utils/data/datapipes/utils/common.py +++ b/torch/utils/data/datapipes/utils/common.py @@ -3,7 +3,7 @@ import fnmatch import warnings from io import IOBase -from typing import Iterable, List, Tuple, Union +from typing import Iterable, List, Tuple, Union, Optional try: import dill @@ -78,7 +78,7 @@ def get_file_pathnames_from_root( dirs.sort() -def get_file_binaries_from_pathnames(pathnames: Iterable, mode: str): +def get_file_binaries_from_pathnames(pathnames: Iterable, mode: str, encoding: Optional[str] = None): if not isinstance(pathnames, Iterable): pathnames = [pathnames, ] @@ -89,7 +89,7 @@ def get_file_binaries_from_pathnames(pathnames: Iterable, mode: str): if not isinstance(pathname, str): raise TypeError("Expected string type for pathname, but got {}" .format(type(pathname))) - yield pathname, StreamWrapper(open(pathname, mode)) + yield pathname, StreamWrapper(open(pathname, mode, encoding=encoding)) def validate_pathname_binary_tuple(data: Tuple[str, IOBase]): From 3c33f0bdcddc90ee643e05e2d3c6e14376fc9eec Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Mon, 14 Feb 2022 12:05:41 -0800 Subject: [PATCH 012/199] Clean up LoggingTensor semantic (#72620) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72620 Clarify how LoggingTensor works with autograd. The updated comment should cover the semantic changes. Test Plan: Imported from OSS Reviewed By: samdow Differential Revision: D34214956 Pulled By: albanD fbshipit-source-id: 730d0a68f4228d2a84758e6807d869a34cbc1b31 (cherry picked from commit 66110bf16bbe17d52781d05077eb73192e0fe3c4) --- test/test_python_dispatch.py | 6 +++--- torch/testing/_internal/logging_tensor.py | 13 +++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index 3cf4a18bd1e..bbda25ca79a 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -11,7 +11,7 @@ import logging class TestPythonDispatch(TestCase): def test_basic(self) -> None: with capture_logs() as logs: - x = LoggingTensor(torch.tensor([3.0], requires_grad=True)) + x = LoggingTensor(torch.tensor([3.0]), requires_grad=True) log_input("x", x) y = x * x saved_x = y.grad_fn._saved_self @@ -141,7 +141,7 @@ $5 = torch._ops.aten.kl_div($0, $1, 2, log_target=True)''') def test_detach_appears_twice_when_called_once(self) -> None: with capture_logs() as logs: - x = LoggingTensor(torch.tensor([3.0], requires_grad=True)) + x = LoggingTensor(torch.tensor([3.0]), requires_grad=True) log_input("x", x) x.detach() # FIXME: We actually want this to emit a single detach. 
However, @@ -240,7 +240,7 @@ $2 = torch._ops.aten.detach($1)''') return grad_output * 2 * x with capture_logs() as logs: - x = LoggingTensor(torch.ones(1, requires_grad=True)) + x = LoggingTensor(torch.ones(1), requires_grad=True) log_input("x", x) x.grad = LoggingTensor(torch.zeros(1)) log_input("x.grad", x.grad) diff --git a/torch/testing/_internal/logging_tensor.py b/torch/testing/_internal/logging_tensor.py index fec00fe30d3..a368d453651 100644 --- a/torch/testing/_internal/logging_tensor.py +++ b/torch/testing/_internal/logging_tensor.py @@ -22,6 +22,15 @@ def no_dispatch() -> Iterator[None]: # 3. Enter dispatcher, wind your way through Autograd # 4. Hit Python dispatch key, call __torch_dispatch__ +# This Tensor can work with autograd in two ways: +# - The wrapped Tensor does not require gradients. In that case, the LoggingTensor +# can require gradients if the user asks for it as a constructor kwarg. +# - The wrapped Tensor can require gradients. In that case autograd will be tracked +# for the wrapped Tensor and the LoggingTensor itself cannot require gradients. +# Note that this second one is not possible today as dispatcher exclude keys are not properly reset +# WARNING: We allow these two possibilities for testing purposes. You should NEVER use both in a single +# test or you might get surprising behavior. + # TODO: TensorBase should work class LoggingTensor(torch.Tensor): elem: torch.Tensor @@ -38,10 +47,10 @@ class LoggingTensor(torch.Tensor): strides=elem.stride(), storage_offset=elem.storage_offset(), # TODO: clone storage aliasing dtype=elem.dtype, layout=elem.layout, - device=elem.device, requires_grad=elem.requires_grad + device=elem.device, requires_grad=kwargs.get("requires_grad", False) ) # ...the real tensor is held as an element on the tensor. - r.elem = elem + r.elem = elem.detach() if r.requires_grad else elem return r def __repr__(self): From 584f13967bfbff1e884066152a7c77891e7f31f5 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Mon, 14 Feb 2022 12:05:41 -0800 Subject: [PATCH 013/199] Add wrapped Tensor autograd test (#72622) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72622 This contain a version of the test for next PR that doesn't work. To see the change in behavior more easily. Test Plan: Imported from OSS Reviewed By: samdow Differential Revision: D34214954 Pulled By: albanD fbshipit-source-id: 4d72f2d20e12c57ca7b63852ffe0c8aa61aa593b (cherry picked from commit b5d792d1039d4f6cf2679f916c53234f55035aad) --- test/test_python_dispatch.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index bbda25ca79a..d127bacc616 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -541,6 +541,32 @@ $6 = torch._ops.aten.add_($1, $5)''') z = LoggingTensor(torch.empty([])) z.set_(s) + def test_autograd_in_attr(self): + # We want the wrapped Tensor to require gradients! 
+ true_t = torch.rand(2, requires_grad=True) + t = LoggingTensor(true_t) + + out = t + 2 + + self.assertFalse(out.requires_grad) + self.assertIsNone(out.grad_fn) + + # TODO: this should be True + self.assertFalse(out.elem.requires_grad) + # TODO: this should be not None + self.assertIsNone(out.elem.grad_fn) + + with self.assertRaisesRegex(RuntimeError, "does not require grad"): + out.backward() + + # TODO: this should not raise + with self.assertRaisesRegex(RuntimeError, "does not require grad"): + out.elem.backward() + + self.assertIsNone(t.grad) + # TODO: this should not be None + self.assertIsNone(t.elem.grad) + if __name__ == '__main__': run_tests() From 8e8c15cf6ea443380157c4faf6cf1f294afafe8f Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Mon, 14 Feb 2022 12:27:21 -0800 Subject: [PATCH 014/199] Operator developer guidance (#72470) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72470 Use markdown to write an upgrader guidance ghstack-source-id: 149079223 Test Plan: CI Reviewed By: iseeyuan, tugsbayasgalan Differential Revision: D34054964 fbshipit-source-id: 80f907701a9d44bcd6b5c22d01a471c26669bfb0 (cherry picked from commit eaed7627f526cab932273d24002a4f8e4a92c824) --- torch/csrc/jit/operator_upgraders/README.md | 205 ++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 torch/csrc/jit/operator_upgraders/README.md diff --git a/torch/csrc/jit/operator_upgraders/README.md b/torch/csrc/jit/operator_upgraders/README.md new file mode 100644 index 00000000000..bcec71a2a3b --- /dev/null +++ b/torch/csrc/jit/operator_upgraders/README.md @@ -0,0 +1,205 @@ +# Guidance for Operator Developer + +PyTorch’s operators sometimes require changes to maintain the high quality user experience (UX) that PyTorch is known for. These changes can be backward compatibility (BC) breaking, where older programs will no longer run as expected on the latest version of PyTorch (an old writer / new reader problem) or forward compatibility (FC) breaking, where new programs will not run on older versions of PyTorch (a new writer / old reader problem). An upgrader is a method to use the new operator to mimic the old operator behavior. When a new runtime loads an old model with the old operator, the upgrader will replace the old operator in the model with the new operator. The replacement will only happen for old models, and it does not need to consider the new models. Please refer to the documentation [PyTorch Operator Versioning](https://github.com/pytorch/rfcs/blob/master/RFC-0017-PyTorch-Operator-Versioning.md) for more details. + +After you change to operator either the operator schema is BC-breaking way or the semantics of the operator, you will need to write an “upgrader” to make the change non-BC breaking iff they are used in TorchScript or mobile. In general, you can know your operator is BC breaking, if it fails `test/forward_backward_compatibility/check_forward_backward_compatibility.py ` + +The steps to write upgrader: + +### 1.Preparation + +[Build PyTorch from souce](https://github.com/pytorch/pytorch#from-source) and prepare a test model before making changes to the operator, following the process below. A test model before making the operator changes is needed to test the upgrader. Otherwise, after the change to operator, the new runtime will no longer be able to produce a model with the historic operator and can't test it anymore. + + 1. Add a test module in `test/jit/fixtures_srcs/fixtures_src.py`. 
In `test/jit/fixtures_srcs/generate_models.py`, + ``` + class TestVersionedLinspaceV7(torch.nn.Module): + def __init__(self): + super(TestVersionedLinspaceV7, self).__init__() + + def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]): + c = torch.linspace(a, b, steps=5) + d = torch.linspace(a, b) + return c, d + ``` + Please make sure the module uses the changed operator and follow the name schema ` TestVersioned{${OpnameOverloadedname}}V${kProducedFileFormatVersion}`. [`kProducedFileFormatVersion`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L82) can be found in `versions.h`. The example operator usage can be found on [PyTorch Docs](https://pytorch.org/docs/stable/index.html), like [linspace operator](https://pytorch.org/docs/stable/generated/torch.linspace.html) + 2. Register its corresponding changed operator in ALL_MODULES like following. Use an instance as the key and the changed operator as the value. It will ensure the test model covers everything needed. It's important to check in a valid test model before making the change to the runtime, as it will be really challenging to switch to the revision of the source code and regenerate the test model after the change is merged. + + ``` + # key: test module instance, value: changed operator name + ALL_MODULES = { + TestVersionedLinspaceV7(): "aten::linspace", + } + ``` + + This module should include the changed operator. If the operator isn't covered in the model, the model export process will fail. + + 3. Export the model to `test/jit/fixtures` by running + ``` + python test/jit/fixtures_src/generate_models.py + ``` + + 4. Commit the change and submit a pull request. + +### 2. Make changes to the operator and write an upgrader. + 1. Make the operator change. + 2. Write an upgrader in `torch/csrc/jit/operator_upgraders/upgraders_entry.cpp` file inside a map `kUpgradersEntryMap`. The softly enforced naming format is `___`. The start and end means the upgrader can be applied to the operator exported during when [the global operator version](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L82) within the range `[start, end]`. Let's take an operator `linspace` with the overloaded name `out` as an example. The first thing is to check if the upgrader exists in in [upgraders_entry.cpp](https://github.com/pytorch/pytorch/blob/master/torch/csrc/jit/operator_upgraders/upgraders_entry.cpp). + 1. If the upgrader doesn't exist in `upgraders_entry.cpp`, the upgrader name can be `linspace_out_0_{kProducedFileFormatVersion}`, where [`kProducedFileFormatVersion`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L82) can be found in [versions.h](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h). + 2. If the upgrader exist in `upgraders_entry.cpp`, for example `linspace_out_0_7` (means `linspace.out` operator is changed when operator version is bumped from 7 to 8), + 1. If it's possible to write an upgrader valid for `linspace` before versioning bumping to 8, after versioning bumping to 8, write an upgrader `linspace_out_0_{kProducedFileFormatVersion}` + 2. If it's impossible to write an upgrader valid for `linspace` before versioning bumping to 8, check the date when the version is bumped to 8 at [`versions.h`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L82). 
If it has been 180 days, write an upgrader `linspace_out_8_{kProducedFileFormatVersion}` for `linspace.out` after bumping to 8, and deprecate the old upgrader. If it hasn't been 180 days, wait until 180 days and do the same changes as above. + + To write an upgrader, you would need to know how the new runtime with the new `linspace` operator can handle an old model with the old `linspace` operator. When `linspace` is bumped to 8, the change is to make `step` a required argument, instead of an optional argument. The old schema is: + ``` + linspace(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], dtype: Optional[int], layout: Optional[int], + device: Optional[Device], pin_memory: Optional[bool]): + ``` + And the new schema is: + ``` + linspace(start: Union[int, float, complex], end: Union[int, float, complex], steps: int, dtype: Optional[int], layout: Optional[int], + device: Optional[Device], pin_memory: Optional[bool]): + ``` + An upgrader will only be applied to an old model and it won't be applied to a new model. The upgrader can be written with the following logic: + ``` + def linspace_0_7(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], *, dtype: Optional[int], layout: Optional[int], + device: Optional[Device], pin_memory: Optional[bool]): + if (steps is None): + return torch.linspace(start=start, end=end, steps=100, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) + return torch.linspace(start=start, end=end, steps=steps, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) + ``` + + The actual upgrader needs to be written as [TorchScript](https://pytorch.org/docs/stable/jit.html), and the below example is the actual upgrader of the operator `linspace.out `and the operator ` linspace` exported at version from 0 to 7. + ``` + static std::unordered_map kUpgradersEntryMap( + { + {"linspace_0_7", R"SCRIPT( + def linspace_0_7(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], *, dtype: Optional[int], layout: Optional[int], + device: Optional[Device], pin_memory: Optional[bool]): + if (steps is None): + return torch.linspace(start=start, end=end, steps=100, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) + return torch.linspace(start=start, end=end, steps=steps, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) + )SCRIPT"}, + } + ``` + With the upgrader, when a new runtime loads an old model, it will first check the operator version of the old model. If it's older than the current runtime, it will replace the operator from the old model with the upgrader above. + + 3. Bump [`kMaxSupportedFileFormatVersion`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L15) the [`kProducedFileFormatVersion`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L82) by 1 and provide the reasons under [`versions.h`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L73-L81) + ``` + + constexpr uint64_t kMaxSupportedFileFormatVersion = 0x9L; + + ... + // We describe new operator version bump reasons here: + // 1) [01/24/2022] + // We bump the version number to 8 to update aten::linspace + // and aten::linspace.out to error out when steps is not + // provided. 
(see: https://github.com/pytorch/pytorch/issues/55951) + // 2) [01/30/2022] + // Bump the version number to 9 to update aten::logspace and + // and aten::logspace.out to error out when steps is not + // provided. (see: https://github.com/pytorch/pytorch/issues/55951) + constexpr uint64_t kProducedFileFormatVersion = 0x9L; + ``` + + 4. In `torch/csrc/jit/operator_upgraders/version_map.cpp`, add changes like below. You will need to make sure that the entry is **SORTED** by the bumped to version number. + ``` + {{${operator_name.overloaded_name}, + {{${bump_to_version}, + "${upgrader_name}", + "${old operator schema}"}}}, + ``` + For the example operator `linspace`, if there are two version bumps, one is bumped to 8 and one is bumped to 12, the sorted result is: + ``` + {{"aten::linspace", + {{12, + "linspace_0_11", + "aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"}}}, + {{8, + "linspace_0_7", + "aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"}}}, + ``` + + 5. After [rebuilding PyTorch](https://github.com/pytorch/pytorch#from-source), run the following command to auto update the file [`torch/csrc/jit/mobile/upgrader_mobile.cpp`](https://github.com/pytorch/pytorch/blob/8757e21c6a4fc00e83539aa7f9c28eb11eff53c1/torch/csrc/jit/mobile/upgrader_mobile.cpp). After rebuild PyTorch from source (`python setup.py`), run + + ``` + python pytorch/tools/codegen/operator_versions/gen_mobile_upgraders.py + ``` + + 6. Add a test. With the model generated from step 1, you will need to add tests in `test/test_save_load_for_op_versions.py`. Following is an example to write a test + ``` + @settings(max_examples=10, deadline=200000) # A total of 10 examples will be generated + @given( + sample_input=st.tuples(st.integers(min_value=5, max_value=199), st.floats(min_value=5.0, max_value=199.0)) + ) # Generate a pair (integer, float) + @example((2, 3, 2.0, 3.0)) # Ensure this example will be covered + def test_versioned_div_scalar(self, sample_input): + # Step 1. Write down the old behavior of this operator, if possible + def historic_div_scalar_float(self, other: float): + return torch.true_divide(self, other) + + # Step 2. Write down how current module should look like + class MyModuleFloat(torch.nn.Module): + def __init__(self): + super(MyModuleFloat, self).__init__() + + def forward(self, a, b: float): + return a / b + try: + # Step 3. Load the old model and it will apply upgrader + v3_mobile_module_float = _load_for_lite_interpreter( + pytorch_test_dir + "/jit/fixtures/test_versioned_div_scalar_float_v2.ptl") + v3_server_module_float = torch.jit.load( + pytorch_test_dir + "/jit/fixtures/test_versioned_div_scalar_float_v2.ptl") + except Exception as e: + self.skipTest("Failed to load fixture!") + + # Step4. 
Load the new model and it won't apply the ugprader + current_mobile_module_float = self._save_load_mobile_module(MyModuleFloat) + current_server_module_float = self._save_load_module(MyModuleFloat) + + for val_a, val_b in product(sample_input, sample_input): + a = torch.tensor((val_a,)) + b = val_b + + def _helper(m, fn): + m_result = self._try_fn(m, a, b) + fn_result = self._try_fn(fn, a, b) + + if isinstance(m_result, Exception): + self.assertTrue(fn_result, Exception) + else: + self.assertEqual(m_result, fn_result) + + # Ensure the module loaded from the old model with upgrader + # has the same result as the module loaded from the new model + _helper(v3_mobile_module_float, current_mobile_module_float) + _helper(v3_mobile_module_float, current_server_module_float) + + # Ensure the module loaded from the new model with upgrader + # has the same result as the module loaded from the new model + _helper(current_mobile_module_float, torch.div) + _helper(current_server_module_float, torch.div) + + ``` + + 7. Commit all changes made in step 2 in a single pull request and submit it. + +You can look at following PRs to get the rough idea of what needs to be done: +1. [PR that adds `logspace` test modules](https://github.com/pytorch/pytorch/pull/72052) +2. [PR that updates `logspace`](https://github.com/pytorch/pytorch/pull/72051) + +--- +**NOTE** + +Adding arguments with a default value to an operator is not BC breaking, and thus does not require an upgrader. For example, the following change to operator `foo` is backwards compatible: +``` +# before +def foo(x, y): + return x, y +``` +``` +# after +def foo(x, y, z=100): + return x, y, z +``` + +--- From 454e2ec7bc8f19d1dc375dacb007e5ab765c3369 Mon Sep 17 00:00:00 2001 From: Jordan Fix Date: Mon, 14 Feb 2022 12:33:10 -0800 Subject: [PATCH 015/199] [test_fx_const_fold] Remove dependencies on acc_* (#72810) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72810 Test Plan: CI Reviewed By: hl475 Differential Revision: D34220004 fbshipit-source-id: c58e287cb140411dcb5a6795c179004612e4016c (cherry picked from commit 0f7c99f00498f224c60b7d5ecd2c3d902d5d6785) --- test/fx/test_fx_const_fold.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/fx/test_fx_const_fold.py b/test/fx/test_fx_const_fold.py index 0d178e956c4..80198c2baea 100644 --- a/test/fx/test_fx_const_fold.py +++ b/test/fx/test_fx_const_fold.py @@ -5,7 +5,6 @@ import operator import torch import torch.fx from torch.fx.experimental import const_fold -from torch.fx.experimental.fx_acc import acc_tracer, acc_ops from torch.testing._internal.common_utils import TestCase @@ -610,14 +609,14 @@ class TestConstFold(TestCase): mod = ConstFoldTestModule() in_x = torch.randn(2, 4) - gm = acc_tracer.trace(mod, in_x) + gm = torch.fx.symbolic_trace(mod) def skip_folding_quant_dequant(node: torch.fx.Node): - if node.target != acc_ops.quantize_per_tensor: + if node.target != torch.quantize_per_tensor: return False # If quantize_per_node -> dequantize, then skip folding. 
for user in node.users: - if user.target == acc_ops.dequantize: + if user.target == torch.dequantize: return True return False From 6199b5231fcc577caa8bd42e6ea4c413697eda91 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Mon, 14 Feb 2022 12:39:33 -0800 Subject: [PATCH 016/199] Add new tls snapshot feature (#72623) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72623 Test Plan: Imported from OSS Reviewed By: samdow Differential Revision: D34214953 Pulled By: albanD fbshipit-source-id: 7aa5d5e3540a45a0ae70c5af3a4495c755908aa9 (cherry picked from commit dc0a1ab54a459019e4cd91b30a34adbc2e4ac5a4) --- aten/src/ATen/core/PythonFallbackKernel.cpp | 24 +++++++++++++++++++++ aten/src/ATen/core/PythonModeTLS.cpp | 2 ++ c10/core/DispatchKey.cpp | 3 +++ c10/core/DispatchKey.h | 5 +++++ c10/core/DispatchKeySet.h | 5 ++++- c10/core/TensorImpl.cpp | 14 ++++++------ c10/core/impl/LocalDispatchKeySet.h | 14 ++++++++++++ test/test_python_dispatch.py | 15 +++++-------- torch/csrc/autograd/init.cpp | 5 +++-- torch/testing/_internal/logging_tensor.py | 1 - 10 files changed, 67 insertions(+), 21 deletions(-) diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 6b51aa53156..b5861253c1e 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -4,7 +4,14 @@ namespace { +// TLS saving the state of the include/exclude sets on entry to the dispatcher +// This is set in the pythonTLSSnapshot fallback and used by the Python fallback. +thread_local c10::optional tls_on_entry; + void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { + TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); + c10::impl::ForceDispatchKeyGuard guard(tls_on_entry.value()); + // If Python Mode is active, use its PyInterpreter for dispatch const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state(); if (maybe_python_mode_state) { @@ -42,8 +49,25 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)"); } +void pythonTLSSnapshotFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { + // It is ok for the tls to be already set here. 
+ // A CompositeImplicitAutograd function may have been called just before this and so the tls here were never cleared + // This is also why we don't need an RAII to ensure the tls is reset when exceptions happen + + tls_on_entry = c10::impl::tls_local_dispatch_key_set(); + + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::PythonTLSSnapshot), stack); + + tls_on_entry = c10::nullopt; +} + + } // anonymous namespace TORCH_LIBRARY_IMPL(_, Python, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>()); } + +TORCH_LIBRARY_IMPL(_, PythonTLSSnapshot, m) { + m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonTLSSnapshotFallback>()); +} diff --git a/aten/src/ATen/core/PythonModeTLS.cpp b/aten/src/ATen/core/PythonModeTLS.cpp index dd4b44bc5fe..97892fcf5d3 100644 --- a/aten/src/ATen/core/PythonModeTLS.cpp +++ b/aten/src/ATen/core/PythonModeTLS.cpp @@ -8,6 +8,7 @@ void PythonModeTLS::set_state(const std::shared_ptr& st pythonModeState = state; if (state) { c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); + c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, true); } else { PythonModeTLS::reset_state(); } @@ -20,6 +21,7 @@ const std::shared_ptr& PythonModeTLS::get_state() { void PythonModeTLS::reset_state() { pythonModeState.reset((TorchDispatchTypeObject*)nullptr); c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); + c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, false); } } // namespace impl diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index b95558563bf..ab9f41e58f3 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -100,6 +100,8 @@ const char* toString(DispatchKey t) { case DispatchKey::Python: return "Python"; + case DispatchKey::PythonTLSSnapshot: + return "PythonTLSSnapshot"; case DispatchKey::PrivateUse1: return "PrivateUse1"; @@ -251,6 +253,7 @@ c10::DispatchKey parseDispatchKey(const std::string& k) { {"SparseCsrCUDA", c10::DispatchKey::SparseCsrCUDA}, {"BackendSelect", c10::DispatchKey::BackendSelect}, {"Python", c10::DispatchKey::Python}, + {"PythonTLSSnapshot", c10::DispatchKey::PythonTLSSnapshot}, {"Named", c10::DispatchKey::Named}, {"Conjugate", c10::DispatchKey::Conjugate}, {"Negative", c10::DispatchKey::Negative}, diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index b5860bd608c..0260ab9a38f 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -354,6 +354,11 @@ enum class DispatchKey : uint16_t { Functionalize, FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype] + // Used by Python key logic to know the set of tls on entry to the dispatcher + // This kernel assumes it is at the very top of the dispatcher. If you add + // a key above, make sure to update the fallback implementation for this. + PythonTLSSnapshot, + // TESTING: This is intended to be a generic testing tensor type id. // Don't use it for anything real; its only acceptable use is within a single // process test. 
Use it by creating a TensorImpl with this DispatchKey, and diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 781df91767a..5f725ab245e 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -606,7 +606,10 @@ constexpr DispatchKeySet default_excluded_set = DispatchKeySet({ constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView); -constexpr DispatchKeySet python_ks = DispatchKeySet(DispatchKey::Python); +constexpr DispatchKeySet python_ks = DispatchKeySet({ + DispatchKey::Python, + DispatchKey::PythonTLSSnapshot, +}); constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse); diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 5a772be7a14..1d252016870 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -120,11 +120,11 @@ TensorImpl::TensorImpl( // [Note: Python key removal] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// In most constructors for TensorImpl, you will see Python key is removed from -// the passed in DispatchKeySet. Why? +// In most constructors for TensorImpl, you will see Python and PythonTLSSnapshot +// keys are removed from the passed in DispatchKeySet. Why? // -// INVARIANT: Python dispatch key is set iff PyObject for the Tensor has a -// nontrivial __torch_dispatch__ implementation. +// INVARIANT: Python and PythonTLSSnapshot dispatch keys are set iff PyObject for +// the Tensor has a nontrivial __torch_dispatch__ implementation. // // When a fresh TensorImpl is created, there is *no* PyObject (this only gets // initialized lazily at the first point in time the Tensor passes into Python). @@ -132,8 +132,8 @@ TensorImpl::TensorImpl( // // In practice, what will happen shortly afterwards is that the TensorImpl // will get its PyObject initialized by Tensor._make_subclass; at this point -// the Python dispatch key will be set and all is well. The point is to delay -// the dispatch key setting until that point. +// the Python and PythonTLSSnapshot dispatch keys will be set and all is well. +// The point is to delay the dispatch key setting until that point. 
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) TensorImpl::TensorImpl( @@ -552,7 +552,7 @@ void TensorImpl::copy_tensor_metadata_except_version_counter( dest_impl->storage_offset_ = src_impl->storage_offset_; dest_impl->data_type_ = src_impl->data_type_; dest_impl->device_opt_ = src_impl->device_opt_; - dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python); + dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); dest_impl->is_contiguous_ = src_impl->is_contiguous_; dest_impl->has_contiguity_ = src_impl->has_contiguity_; dest_impl->is_channels_last_contiguous_ = diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 050363fc7c1..5ee622d433a 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -117,6 +117,20 @@ class C10_API ExcludeDispatchKeyGuard { DispatchKeySet exclude_; }; +struct C10_API ForceDispatchKeyGuard { + public: + ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) : + saved_keyset_(c10::impl::tls_local_dispatch_key_set()) { + c10::impl::_force_tls_local_dispatch_key_set(key_set); + } + ~ForceDispatchKeyGuard() { + c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_); + } + + private: + c10::impl::LocalDispatchKeySet saved_keyset_; +}; + // Non-RAII API for manipulating the thread-local dispatch state. // Please prefer the RAII API. The non-RAII API may be useful when // the included/excluded state of a given DispatchKey must span diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index d127bacc616..a3e7e545799 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -551,21 +551,16 @@ $6 = torch._ops.aten.add_($1, $5)''') self.assertFalse(out.requires_grad) self.assertIsNone(out.grad_fn) - # TODO: this should be True - self.assertFalse(out.elem.requires_grad) - # TODO: this should be not None - self.assertIsNone(out.elem.grad_fn) + self.assertTrue(out.elem.requires_grad) + self.assertIsNotNone(out.elem.grad_fn) with self.assertRaisesRegex(RuntimeError, "does not require grad"): - out.backward() + out.sum().backward() - # TODO: this should not raise - with self.assertRaisesRegex(RuntimeError, "does not require grad"): - out.elem.backward() + out.elem.sum().backward() self.assertIsNone(t.grad) - # TODO: this should not be None - self.assertIsNone(t.elem.grad) + self.assertIsNotNone(t.elem.grad) if __name__ == '__main__': diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 890b7f715ea..3e352294df1 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -27,9 +27,10 @@ #include struct DisableTorchDispatch { - DisableTorchDispatch() : guard_(c10::DispatchKey::Python) { - } + DisableTorchDispatch() : guard_(c10::DispatchKey::Python), + guard_tls_snapshot_(c10::DispatchKey::PythonTLSSnapshot) {} c10::impl::ExcludeDispatchKeyGuard guard_; + c10::impl::ExcludeDispatchKeyGuard guard_tls_snapshot_; }; PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { diff --git a/torch/testing/_internal/logging_tensor.py b/torch/testing/_internal/logging_tensor.py index a368d453651..d553d781735 100644 --- a/torch/testing/_internal/logging_tensor.py +++ b/torch/testing/_internal/logging_tensor.py @@ -27,7 +27,6 @@ def no_dispatch() -> Iterator[None]: # can require gradients if the user asks for it as a constructor kwarg. # - The wrapped Tensor can require gradients. 
In that case autograd will be tracked # for the wrapped Tensor and the LoggingTensor itself cannot require gradients. -# Note that this second one is not possible today as dispatcher exclude keys are not properly reset # WARNING: We allow these two possibilities for testing purposes. You should NEVER use both in a single # test or you might get surprising behavior. From 831cb4b94d868201de91b4aee44d18d1dd93f6b8 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Mon, 14 Feb 2022 13:46:10 -0800 Subject: [PATCH 017/199] ci: Update macOS binary workflows with new templates (#72727) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72727 Updates the macos binary workflows to align with the new templates created to fix issues with ciflow, binary upload, etc. Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: janeyx99 Differential Revision: D34176435 Pulled By: seemethere fbshipit-source-id: bebe02b19aa2470212c9835a660c5a755c4f94d1 (cherry picked from commit 8a5f68e892d7045ce01ded5197e6c6a5cbcf9072) --- .../macos_binary_build_workflow.yml.j2 | 83 +-------- .github/templates/upload.yml.j2 | 10 +- .../generated-macos-arm64-binary-conda.yml | 121 ++++++++---- .../generated-macos-arm64-binary-wheel.yml | 161 +++++++++++----- .../generated-macos-binary-conda.yml | 161 +++++++++++----- ...erated-macos-binary-libtorch-cxx11-abi.yml | 173 ++++++++++++------ ...erated-macos-binary-libtorch-pre-cxx11.yml | 173 ++++++++++++------ .../generated-macos-binary-wheel.yml | 161 +++++++++++----- 8 files changed, 664 insertions(+), 379 deletions(-) diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 604d8251bc9..926b7e37740 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -1,4 +1,5 @@ {% import 'common.yml.j2' as common %} +{% import 'upload.yml.j2' as upload %} {%- block name -%} # Template is at: .github/templates/macos_binary_build_workflow.yml.j2 @@ -6,24 +7,6 @@ name: !{{ build_environment }} {%- endblock %} -{%- macro binary_env(config) -%} - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: !{{ config["package_type"] }} - SKIP_ALL_TESTS: 1 - DESIRED_CUDA: cpu -{%- if config["package_type"] == "libtorch" %} - LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }} - DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }} - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" -{%- else %} - DESIRED_PYTHON: "!{{ config["python_version"] }}" -{%- endif %} -{%- endmacro %} - {%- macro set_runner_specific_vars() -%} # NOTE: These environment variables are put here so that they can be applied on every job equally # They are also here because setting them at a workflow level doesn't give us access to the @@ -83,7 +66,7 @@ jobs: {%- else %} timeout-minutes: !{{ common.timeout_minutes }} {%- endif %} - !{{ binary_env(config) }} + !{{ upload.binary_env(config, true) }} # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -96,16 +79,8 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - 
uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder") }} - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -129,53 +104,5 @@ jobs: retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - !{{ config["build_name"] }}-upload: # Uploading - runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts - if: ${{ github.repository_owner == 'pytorch' }} - needs: !{{ config["build_name"] }}-build - !{{ binary_env(config) }} - steps: - !{{ common.setup_ec2_linux() }} - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - - uses: actions/download-artifact@v2 - name: Download Build Artifacts - with: - name: !{{ config["build_name"] }} - path: "${{ runner.temp }}/artifacts/" - - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Upload binaries - env: - PKG_DIR: "${{ runner.temp }}/artifacts" - UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_SECRET_KEY }} - ANACONDA_API_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - docker run --rm -i \ - -e ANACONDA_API_TOKEN \ - -e AWS_ACCESS_KEY_ID \ - -e AWS_SECRET_ACCESS_KEY \ - -e DRY_RUN \ - -e PACKAGE_TYPE \ - -e PKG_DIR=/artifacts \ - -e UPLOAD_CHANNEL \ - -e UPLOAD_SUBFOLDER \ - -v "${RUNNER_TEMP}/artifacts:/artifacts" \ - -v "${GITHUB_WORKSPACE}:/v" \ - -w /v \ - 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/miniconda3:4.10.3 \ - bash -c '.circleci/scripts/binary_upload.sh' - !{{ common.teardown_ec2_linux() }} + !{{ upload.upload_binaries(config, has_test=False, use_s3=False) }} {%- endfor %} diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 4dc13971da1..a7d89e5c743 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -32,17 +32,25 @@ {%- endmacro %} -{%- macro upload_binaries(config, is_windows=False) -%} +{%- macro upload_binaries(config, is_windows=False, has_test=True, use_s3=True) -%} !{{ config["build_name"] }}-upload: # Uploading runs-on: linux.2xlarge # self hosted runner to download ec2 artifacts if: ${{ github.repository_owner == 'pytorch' }} +{%- if has_test %} needs: !{{ config["build_name"] }}-test +{%- else %} + needs: !{{ config["build_name"] }}-build +{%- endif %} !{{ binary_env(config, is_windows) }} steps: !{{ common.setup_ec2_linux() }} - name: Clone pytorch/pytorch uses: actions/checkout@v2 +{%- if use_s3 %} - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b +{%- else %} + - uses: actions/download-artifact@v2 +{%- endif %} name: Download 
Build Artifacts with: name: !{{ config["build_name"] }} diff --git a/.github/workflows/generated-macos-arm64-binary-conda.yml b/.github/workflows/generated-macos-arm64-binary-conda.yml index 40383e51bee..6c2dc4e5879 100644 --- a/.github/workflows/generated-macos-arm64-binary-conda.yml +++ b/.github/workflows/generated-macos-arm64-binary-conda.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/macos_binary_build_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: macos-arm64-binary-conda @@ -43,8 +44,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -69,16 +73,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -107,11 +122,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: conda-py3_8-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" steps: - name: Display EC2 information @@ -165,11 +182,11 @@ jobs: name: conda-py3_8-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -222,8 +239,11 @@ jobs: 
PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -248,16 +268,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -286,11 +317,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: conda-py3_9-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information @@ -344,11 +377,11 @@ jobs: name: conda-py3_9-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -401,8 +434,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -427,16 +463,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo 
"${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -465,11 +512,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: conda-py3_10-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information @@ -523,11 +572,11 @@ jobs: name: conda-py3_10-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then diff --git a/.github/workflows/generated-macos-arm64-binary-wheel.yml b/.github/workflows/generated-macos-arm64-binary-wheel.yml index cb407a31342..1333425238e 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/macos_binary_build_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: macos-arm64-binary-wheel @@ -43,8 +44,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -69,16 +73,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch 
- uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -107,11 +122,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: wheel-py3_7-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information @@ -165,11 +182,11 @@ jobs: name: wheel-py3_7-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -222,8 +239,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -248,16 +268,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ 
env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -286,11 +317,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: wheel-py3_8-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" steps: - name: Display EC2 information @@ -344,11 +377,11 @@ jobs: name: wheel-py3_8-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -401,8 +434,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -427,16 +463,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -465,11 +512,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: wheel-py3_9-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder 
PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information @@ -523,11 +572,11 @@ jobs: name: wheel-py3_9-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -580,8 +629,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -606,16 +658,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -644,11 +707,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: wheel-py3_10-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information @@ -702,11 +767,11 @@ jobs: name: wheel-py3_10-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && 
(github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then diff --git a/.github/workflows/generated-macos-binary-conda.yml b/.github/workflows/generated-macos-binary-conda.yml index db148ed0e02..3e43727760a 100644 --- a/.github/workflows/generated-macos-binary-conda.yml +++ b/.github/workflows/generated-macos-binary-conda.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/macos_binary_build_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: macos-binary-conda @@ -41,8 +42,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -67,16 +71,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -105,11 +120,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: conda-py3_7-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information @@ -163,11 +180,11 @@ jobs: name: conda-py3_7-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && 
!startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -220,8 +237,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -246,16 +266,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -284,11 +315,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: conda-py3_8-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" steps: - name: Display EC2 information @@ -342,11 +375,11 @@ jobs: name: conda-py3_8-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -399,8 +432,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder 
PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -425,16 +461,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -463,11 +510,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: conda-py3_9-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information @@ -521,11 +570,11 @@ jobs: name: conda-py3_9-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -578,8 +627,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -604,16 +656,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - 
name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -642,11 +705,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: conda-py3_10-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/conda-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information @@ -700,11 +765,11 @@ jobs: name: conda-py3_10-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml index 5f9ea6396f6..4b03b779454 100644 --- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml +++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/macos_binary_build_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: macos-binary-libtorch-cxx11-abi @@ -42,8 +43,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi # This is a dummy value for libtorch to work correctly with our batch scripts @@ -72,16 +76,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -110,16 +125,15 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: libtorch-cpu-shared-with-deps-cxx11-abi-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information shell: bash @@ -172,11 +186,11 @@ jobs: name: libtorch-cpu-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -230,8 +244,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: shared-without-deps DESIRED_DEVTOOLSET: cxx11-abi # This is a dummy value for libtorch to work correctly with our batch scripts @@ -260,16 +277,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous 
checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -298,16 +326,15 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: libtorch-cpu-shared-without-deps-cxx11-abi-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: shared-without-deps DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information shell: bash @@ -360,11 +387,11 @@ jobs: name: libtorch-cpu-shared-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -418,8 +445,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: static-with-deps DESIRED_DEVTOOLSET: cxx11-abi # This is a dummy value for libtorch to work correctly with our batch scripts @@ -448,16 +478,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean 
pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -486,16 +527,15 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: libtorch-cpu-static-with-deps-cxx11-abi-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: static-with-deps DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information shell: bash @@ -548,11 +588,11 @@ jobs: name: libtorch-cpu-static-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -606,8 +646,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: static-without-deps DESIRED_DEVTOOLSET: cxx11-abi # This is a dummy value for libtorch to work correctly with our batch scripts @@ -636,16 +679,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name 
== github.repository }} run: | @@ -674,16 +728,15 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: libtorch-cpu-static-without-deps-cxx11-abi-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: static-without-deps DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information shell: bash @@ -736,11 +789,11 @@ jobs: name: libtorch-cpu-static-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml index 0cac68d7291..2006f81d394 100644 --- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml +++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/macos_binary_build_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: macos-binary-libtorch-pre-cxx11 @@ -42,8 +43,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 # This is a dummy value for libtorch to work correctly with our batch scripts @@ -72,16 +76,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: 
| + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -110,16 +125,15 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: libtorch-cpu-shared-with-deps-pre-cxx11-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information shell: bash @@ -172,11 +186,11 @@ jobs: name: libtorch-cpu-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -230,8 +244,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: shared-without-deps DESIRED_DEVTOOLSET: pre-cxx11 # This is a dummy value for libtorch to work correctly with our batch scripts @@ -260,16 +277,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ 
-298,16 +326,15 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: libtorch-cpu-shared-without-deps-pre-cxx11-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: shared-without-deps DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information shell: bash @@ -360,11 +387,11 @@ jobs: name: libtorch-cpu-shared-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -418,8 +445,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: static-with-deps DESIRED_DEVTOOLSET: pre-cxx11 # This is a dummy value for libtorch to work correctly with our batch scripts @@ -448,16 +478,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -486,16 +527,15 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: libtorch-cpu-static-with-deps-pre-cxx11-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is 
a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: static-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information shell: bash @@ -548,11 +588,11 @@ jobs: name: libtorch-cpu-static-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -606,8 +646,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: static-without-deps DESIRED_DEVTOOLSET: pre-cxx11 # This is a dummy value for libtorch to work correctly with our batch scripts @@ -636,16 +679,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -674,16 +728,15 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: libtorch-cpu-static-without-deps-pre-cxx11-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: libtorch - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 LIBTORCH_VARIANT: static-without-deps DESIRED_DEVTOOLSET: pre-cxx11 - # This is a dummy value for 
libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information shell: bash @@ -736,11 +789,11 @@ jobs: name: libtorch-cpu-static-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then diff --git a/.github/workflows/generated-macos-binary-wheel.yml b/.github/workflows/generated-macos-binary-wheel.yml index 2a97b166dd7..0aa84d70d9e 100644 --- a/.github/workflows/generated-macos-binary-wheel.yml +++ b/.github/workflows/generated-macos-binary-wheel.yml @@ -1,4 +1,5 @@ # @generated DO NOT EDIT MANUALLY + # Template is at: .github/templates/macos_binary_build_workflow.yml.j2 # Generation script: .github/scripts/generate_ci_workflows.py name: macos-binary-wheel @@ -41,8 +42,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -67,16 +71,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -105,11 +120,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: wheel-py3_7-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: 
pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.7" steps: - name: Display EC2 information @@ -163,11 +180,11 @@ jobs: name: wheel-py3_7-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -220,8 +237,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -246,16 +266,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -284,11 +315,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: wheel-py3_8-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" steps: - name: Display EC2 information @@ -342,11 +375,11 @@ jobs: name: wheel-py3_8-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set 
UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -399,8 +432,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -425,16 +461,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -463,11 +510,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: wheel-py3_9-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" steps: - name: Display EC2 information @@ -521,11 +570,11 @@ jobs: name: wheel-py3_9-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then @@ -578,8 +627,11 @@ jobs: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid 
of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} @@ -604,16 +656,27 @@ jobs: chmod +x "${RUNNER_TEMP}/conda.sh" /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | @@ -642,11 +705,13 @@ jobs: if: ${{ github.repository_owner == 'pytorch' }} needs: wheel-py3_10-cpu-build env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel - SKIP_ALL_TESTS: 1 + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DOCKER_IMAGE: pytorch/manylinux-builder:cpu + SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: - name: Display EC2 information @@ -700,11 +765,11 @@ jobs: name: wheel-py3_10-cpu path: "${{ runner.temp }}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/'))) }} run: | echo "DRY_RUN=disabled" >> "$GITHUB_ENV" - name: Set UPLOAD_CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/')}} + if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/') }} run: | # reference ends with an RC suffix if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then From 801abc0cddbb4b72610bdea279a08664dc96152e Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Mon, 14 Feb 2022 13:49:07 -0800 Subject: [PATCH 018/199] MAINT, DOC: Trivial spellings and warnings (#72745) Summary: Fixes N/A. Just minor annoyances. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72745 Reviewed By: samdow Differential Revision: D34216016 Pulled By: albanD fbshipit-source-id: b65600b50e41a1dd7bf7d076b0dd3e2d1c99caf9 (cherry picked from commit b959392a5fcb9a22583b49738b989f93d6b17e1f) --- c10/util/Optional.h | 2 +- docs/source/notes/extending.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/c10/util/Optional.h b/c10/util/Optional.h index e81911296bc..17f4d5a8007 100644 --- a/c10/util/Optional.h +++ b/c10/util/Optional.h @@ -12,7 +12,7 @@ // C10 // - Move file to `c10` namespace. // - Remove macro use in line 478 because the nvcc device compiler cannot handle -// it it. +// it. // - Revise constructor logic so that it is 1) consistent with c++ 17 standard // documented here in (8): // https://en.cppreference.com/w/cpp/utility/optional/optional, and 2) able to diff --git a/docs/source/notes/extending.rst b/docs/source/notes/extending.rst index 95b34e50178..dbeb135d6e2 100644 --- a/docs/source/notes/extending.rst +++ b/docs/source/notes/extending.rst @@ -355,7 +355,7 @@ Extending :mod:`torch` with a :class:`Tensor`-like type .. note:: This functionality is inspired by the NumPy ``__array_function__`` protocol. See `the NumPy documentation - `_ + `_ and `NEP-0018 `_ for more details. From 5bb851dcaaa46f4e7b0fc9243b17f2929b52da22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hendrik=20Schr=C3=B6ter?= Date: Mon, 14 Feb 2022 14:48:21 -0800 Subject: [PATCH 019/199] Fix ConvTranspose2D dilation type annotation (#72789) Summary: The ConvTranspose2D dilation parameter allows a 2-tuple, as specified in the documentation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/72789 Reviewed By: albanD Differential Revision: D34213968 Pulled By: jbschlosser fbshipit-source-id: 2b7f06996f5bdaa3997b9e430c3bfbdf5d988dfc (cherry picked from commit 4f7338d4f407356d585bd451f94f61ebe489b775) --- torch/nn/modules/conv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index dcd1c9b961f..ee1abe2e6e3 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -897,7 +897,7 @@ class ConvTranspose2d(_ConvTransposeNd): output_padding: _size_2_t = 0, groups: int = 1, bias: bool = True, - dilation: int = 1, + dilation: _size_2_t = 1, padding_mode: str = 'zeros', device=None, dtype=None From f1a9650e4f4032aa336cd286036f9f650ebe2a9c Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Mon, 14 Feb 2022 15:09:10 -0800 Subject: [PATCH 020/199] Revert D34214953: Add new tls snapshot feature Test Plan: revert-hammer Differential Revision: D34214953 (https://github.com/pytorch/pytorch/commit/6199b5231fcc577caa8bd42e6ea4c413697eda91) Original commit changeset: 7aa5d5e3540a Original Phabricator Diff: D34214953 (https://github.com/pytorch/pytorch/commit/6199b5231fcc577caa8bd42e6ea4c413697eda91) fbshipit-source-id: 5d271e9a5ab021b8202402630dbf917b43c55421 (cherry picked from commit a12c630198d391e05b413ee2ff5155ab1aee282f) --- aten/src/ATen/core/PythonFallbackKernel.cpp | 24 --------------------- aten/src/ATen/core/PythonModeTLS.cpp | 2 -- c10/core/DispatchKey.cpp | 3 --- c10/core/DispatchKey.h | 5 ----- c10/core/DispatchKeySet.h | 5 +---- c10/core/TensorImpl.cpp | 14 ++++++------ c10/core/impl/LocalDispatchKeySet.h | 14 ------------ test/test_python_dispatch.py | 15 ++++++++----- torch/csrc/autograd/init.cpp | 5 ++--- torch/testing/_internal/logging_tensor.py | 1 + 10 files changed, 21 insertions(+), 67 deletions(-) diff --git 
a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index b5861253c1e..6b51aa53156 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -4,14 +4,7 @@ namespace { -// TLS saving the state of the include/exclude sets on entry to the dispatcher -// This is set in the pythonTLSSnapshot fallback and used by the Python fallback. -thread_local c10::optional tls_on_entry; - void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { - TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); - c10::impl::ForceDispatchKeyGuard guard(tls_on_entry.value()); - // If Python Mode is active, use its PyInterpreter for dispatch const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state(); if (maybe_python_mode_state) { @@ -49,25 +42,8 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)"); } -void pythonTLSSnapshotFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { - // It is ok for the tls to be already set here. - // A CompositeImplicitAutograd function may have been called just before this and so the tls here were never cleared - // This is also why we don't need an RAII to ensure the tls is reset when exceptions happen - - tls_on_entry = c10::impl::tls_local_dispatch_key_set(); - - op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::PythonTLSSnapshot), stack); - - tls_on_entry = c10::nullopt; -} - - } // anonymous namespace TORCH_LIBRARY_IMPL(_, Python, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>()); } - -TORCH_LIBRARY_IMPL(_, PythonTLSSnapshot, m) { - m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonTLSSnapshotFallback>()); -} diff --git a/aten/src/ATen/core/PythonModeTLS.cpp b/aten/src/ATen/core/PythonModeTLS.cpp index 97892fcf5d3..dd4b44bc5fe 100644 --- a/aten/src/ATen/core/PythonModeTLS.cpp +++ b/aten/src/ATen/core/PythonModeTLS.cpp @@ -8,7 +8,6 @@ void PythonModeTLS::set_state(const std::shared_ptr& st pythonModeState = state; if (state) { c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); - c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, true); } else { PythonModeTLS::reset_state(); } @@ -21,7 +20,6 @@ const std::shared_ptr& PythonModeTLS::get_state() { void PythonModeTLS::reset_state() { pythonModeState.reset((TorchDispatchTypeObject*)nullptr); c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); - c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, false); } } // namespace impl diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index ab9f41e58f3..b95558563bf 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -100,8 +100,6 @@ const char* toString(DispatchKey t) { case DispatchKey::Python: return "Python"; - case DispatchKey::PythonTLSSnapshot: - return "PythonTLSSnapshot"; case DispatchKey::PrivateUse1: return "PrivateUse1"; @@ -253,7 +251,6 @@ c10::DispatchKey parseDispatchKey(const std::string& k) { {"SparseCsrCUDA", c10::DispatchKey::SparseCsrCUDA}, {"BackendSelect", c10::DispatchKey::BackendSelect}, {"Python", c10::DispatchKey::Python}, - {"PythonTLSSnapshot", c10::DispatchKey::PythonTLSSnapshot}, {"Named", c10::DispatchKey::Named}, {"Conjugate", c10::DispatchKey::Conjugate}, 
{"Negative", c10::DispatchKey::Negative}, diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index 0260ab9a38f..b5860bd608c 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -354,11 +354,6 @@ enum class DispatchKey : uint16_t { Functionalize, FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype] - // Used by Python key logic to know the set of tls on entry to the dispatcher - // This kernel assumes it is at the very top of the dispatcher. If you add - // a key above, make sure to update the fallback implementation for this. - PythonTLSSnapshot, - // TESTING: This is intended to be a generic testing tensor type id. // Don't use it for anything real; its only acceptable use is within a single // process test. Use it by creating a TensorImpl with this DispatchKey, and diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 5f725ab245e..781df91767a 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -606,10 +606,7 @@ constexpr DispatchKeySet default_excluded_set = DispatchKeySet({ constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView); -constexpr DispatchKeySet python_ks = DispatchKeySet({ - DispatchKey::Python, - DispatchKey::PythonTLSSnapshot, -}); +constexpr DispatchKeySet python_ks = DispatchKeySet(DispatchKey::Python); constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse); diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 1d252016870..5a772be7a14 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -120,11 +120,11 @@ TensorImpl::TensorImpl( // [Note: Python key removal] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// In most constructors for TensorImpl, you will see Python and PythonTLSSnapshot -// keys are removed from the passed in DispatchKeySet. Why? +// In most constructors for TensorImpl, you will see Python key is removed from +// the passed in DispatchKeySet. Why? // -// INVARIANT: Python and PythonTLSSnapshot dispatch keys are set iff PyObject for -// the Tensor has a nontrivial __torch_dispatch__ implementation. +// INVARIANT: Python dispatch key is set iff PyObject for the Tensor has a +// nontrivial __torch_dispatch__ implementation. // // When a fresh TensorImpl is created, there is *no* PyObject (this only gets // initialized lazily at the first point in time the Tensor passes into Python). @@ -132,8 +132,8 @@ TensorImpl::TensorImpl( // // In practice, what will happen shortly afterwards is that the TensorImpl // will get its PyObject initialized by Tensor._make_subclass; at this point -// the Python and PythonTLSSnapshot dispatch keys will be set and all is well. -// The point is to delay the dispatch key setting until that point. +// the Python dispatch key will be set and all is well. The point is to delay +// the dispatch key setting until that point. 
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) TensorImpl::TensorImpl( @@ -552,7 +552,7 @@ void TensorImpl::copy_tensor_metadata_except_version_counter( dest_impl->storage_offset_ = src_impl->storage_offset_; dest_impl->data_type_ = src_impl->data_type_; dest_impl->device_opt_ = src_impl->device_opt_; - dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); + dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python); dest_impl->is_contiguous_ = src_impl->is_contiguous_; dest_impl->has_contiguity_ = src_impl->has_contiguity_; dest_impl->is_channels_last_contiguous_ = diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 5ee622d433a..050363fc7c1 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -117,20 +117,6 @@ class C10_API ExcludeDispatchKeyGuard { DispatchKeySet exclude_; }; -struct C10_API ForceDispatchKeyGuard { - public: - ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) : - saved_keyset_(c10::impl::tls_local_dispatch_key_set()) { - c10::impl::_force_tls_local_dispatch_key_set(key_set); - } - ~ForceDispatchKeyGuard() { - c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_); - } - - private: - c10::impl::LocalDispatchKeySet saved_keyset_; -}; - // Non-RAII API for manipulating the thread-local dispatch state. // Please prefer the RAII API. The non-RAII API may be useful when // the included/excluded state of a given DispatchKey must span diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index a3e7e545799..d127bacc616 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -551,16 +551,21 @@ $6 = torch._ops.aten.add_($1, $5)''') self.assertFalse(out.requires_grad) self.assertIsNone(out.grad_fn) - self.assertTrue(out.elem.requires_grad) - self.assertIsNotNone(out.elem.grad_fn) + # TODO: this should be True + self.assertFalse(out.elem.requires_grad) + # TODO: this should be not None + self.assertIsNone(out.elem.grad_fn) with self.assertRaisesRegex(RuntimeError, "does not require grad"): - out.sum().backward() + out.backward() - out.elem.sum().backward() + # TODO: this should not raise + with self.assertRaisesRegex(RuntimeError, "does not require grad"): + out.elem.backward() self.assertIsNone(t.grad) - self.assertIsNotNone(t.elem.grad) + # TODO: this should not be None + self.assertIsNone(t.elem.grad) if __name__ == '__main__': diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 3e352294df1..890b7f715ea 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -27,10 +27,9 @@ #include struct DisableTorchDispatch { - DisableTorchDispatch() : guard_(c10::DispatchKey::Python), - guard_tls_snapshot_(c10::DispatchKey::PythonTLSSnapshot) {} + DisableTorchDispatch() : guard_(c10::DispatchKey::Python) { + } c10::impl::ExcludeDispatchKeyGuard guard_; - c10::impl::ExcludeDispatchKeyGuard guard_tls_snapshot_; }; PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { diff --git a/torch/testing/_internal/logging_tensor.py b/torch/testing/_internal/logging_tensor.py index d553d781735..a368d453651 100644 --- a/torch/testing/_internal/logging_tensor.py +++ b/torch/testing/_internal/logging_tensor.py @@ -27,6 +27,7 @@ def no_dispatch() -> Iterator[None]: # can require gradients if the user asks for it as a constructor kwarg. # - The wrapped Tensor can require gradients. 
In that case autograd will be tracked # for the wrapped Tensor and the LoggingTensor itself cannot require gradients. +# Note that this second one is not possible today as dispatcher exclude keys are not properly reset # WARNING: We allow these two possibilities for testing purposes. You should NEVER use both in a single # test or you might get surprising behavior. From 7f560fb3e07f8fa11de10bfdc6c8190141a905cd Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Mon, 14 Feb 2022 15:23:25 -0800 Subject: [PATCH 021/199] Revert D34034847: DispatchKeySet perf improvements Test Plan: revert-hammer Differential Revision: D34034847 (https://github.com/pytorch/pytorch/commit/8aa3620d73d9afcdbcf3d691cdd45ae05ec5e7d5) Original commit changeset: a930e44513a7 Original Phabricator Diff: D34034847 (https://github.com/pytorch/pytorch/commit/8aa3620d73d9afcdbcf3d691cdd45ae05ec5e7d5) fbshipit-source-id: 57b8b7dee252bb8d10316189a034517a28c42199 (cherry picked from commit c3151d4e73836cdfcca37f222a086d2e00755601) --- .../native/quantized/cpu/fbgemm_utils.cpp | 3 +- c10/core/DispatchKeySet.h | 9 -- c10/core/TensorImpl.cpp | 7 +- c10/core/TensorImpl.h | 89 +++++++------------ 4 files changed, 38 insertions(+), 70 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index 2cb25f360ba..ab6df06f7b7 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -160,10 +160,9 @@ Tensor MakeStridedQTensorCPU( allocator->allocate(size_bytes), allocator, /* resizable = */ true); - constexpr auto quantized_cpu_ks = at::DispatchKeySet(at::DispatchKey::QuantizedCPU); auto tensor = detail::make_tensor( storage, - quantized_cpu_ks, + at::DispatchKeySet(at::DispatchKey::QuantizedCPU), dtype, quantizer); get_qtensorimpl(tensor)->set_sizes_and_strides(sizes, strides); diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 781df91767a..1834ca0aa96 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -606,15 +606,6 @@ constexpr DispatchKeySet default_excluded_set = DispatchKeySet({ constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = autograd_dispatch_keyset | DispatchKeySet(DispatchKey::ADInplaceOrView); -constexpr DispatchKeySet python_ks = DispatchKeySet(DispatchKey::Python); - -constexpr DispatchKeySet sparse_ks = DispatchKeySet(DispatchKey::Sparse); - -constexpr DispatchKeySet sparse_csr_ks = - DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA}); - -constexpr DispatchKeySet mkldnn_ks = DispatchKeySet(DispatchKey::MkldnnCPU); - // backend dispatch keys that map to DispatchKey::AutogradOther // NB: keys in this set also get associated with CompositeImplicitAutograd constexpr DispatchKeySet autogradother_backends = diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 5a772be7a14..379807df0c7 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -148,7 +148,8 @@ TensorImpl::TensorImpl( numel_(0), data_type_(data_type), device_opt_(storage_.device()), - key_set_(key_set - c10::python_ks) { // See [Note: Python key removal] + key_set_(key_set.remove( + DispatchKey::Python)) { // See [Note: Python key removal] init_bitfields(); // Inference tensor doesn't have version counter. 
if (!is_inference()) { @@ -193,8 +194,8 @@ TensorImpl::TensorImpl( key_set = key_set | getAutocastRelatedKeySetFromBackend(k); - // See [Note: Python key removal] - key_set = key_set - c10::python_ks; + key_set = + key_set.remove(DispatchKey::Python); // See [Note: Python key removal] // Inference tensor doesn't have autograd related keys. if (inference_mode) { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 737ba18f96e..d703cb2abb8 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -838,103 +838,91 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_sparse() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has_all(c10::sparse_ks); + return key_set_.has(DispatchKey::Sparse); } // Whether a tensor is sparse COO or not. Use is_sparse_csr for checking CSR // format. bool is_sparse_csr() const { - return key_set_.has_any(c10::sparse_csr_ks); + return key_set_.has(DispatchKey::SparseCsrCPU) || + key_set_.has(DispatchKey::SparseCsrCUDA); } bool is_quantized() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - constexpr auto quantized_ks = DispatchKeySet(DispatchKey::Quantized); - return key_set_.has_all(quantized_ks); + return key_set_.has(DispatchKey::Quantized); } bool is_meta() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - constexpr auto meta_ks = DispatchKeySet(DispatchKey::Meta); - return key_set_.has_all(meta_ks); + return key_set_.has(DispatchKey::Meta); } bool is_cpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - constexpr auto cpu_bits_ks = DispatchKeySet(BackendComponent::CPUBit) | - DispatchKeySet({DispatchKey::SparseCsrCPU, DispatchKey::MkldnnCPU}); - return key_set_.has_any(cpu_bits_ks); + return key_set_.has_backend(BackendComponent::CPUBit) || + key_set_.has(DispatchKey::SparseCsrCPU) || + key_set_.has(DispatchKey::MkldnnCPU); } bool is_cuda() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - constexpr auto cuda_bits_ks = DispatchKeySet(BackendComponent::CUDABit) | - DispatchKeySet(DispatchKey::SparseCsrCUDA); - return key_set_.has_any(cuda_bits_ks); + return key_set_.has_backend(BackendComponent::CUDABit) || + key_set_.has(DispatchKey::SparseCsrCUDA); } bool is_xpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - constexpr auto xpu_ks = DispatchKeySet(BackendComponent::XPUBit); - return key_set_.has_all(xpu_ks); + return key_set_.has_backend(BackendComponent::XPUBit); } bool is_xla() const { - constexpr auto xla_ks = DispatchKeySet(BackendComponent::XLABit); - return key_set_.has_all(xla_ks); + return key_set_.has_backend(BackendComponent::XLABit); } bool is_hpu() const { - constexpr auto hpu_ks = DispatchKeySet(BackendComponent::HPUBit); - return key_set_.has_all(hpu_ks); + return key_set_.has_backend(BackendComponent::HPUBit); } bool is_lazy() const { - constexpr auto lazy_ks = DispatchKeySet(BackendComponent::LazyBit); - return key_set_.has_all(lazy_ks); + return key_set_.has_backend(BackendComponent::LazyBit); } bool is_hip() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. 
- constexpr auto hip_ks = DispatchKeySet(BackendComponent::HIPBit); - return key_set_.has_all(hip_ks); + return key_set_.has_backend(BackendComponent::HIPBit); } bool is_ve() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - constexpr auto ve_ks = DispatchKeySet(BackendComponent::VEBit); - return key_set_.has_all(ve_ks); + return key_set_.has_backend(BackendComponent::VEBit); } bool is_mkldnn() const { - return key_set_.has_all(c10::mkldnn_ks); + return key_set_.has(DispatchKey::MkldnnCPU); } bool is_vulkan() const { - constexpr auto vulkan_ks = DispatchKeySet(DispatchKey::Vulkan); - return key_set_.has_all(vulkan_ks); + return key_set_.has(DispatchKey::Vulkan); } bool is_metal() const { - constexpr auto metal_ks = DispatchKeySet(DispatchKey::Metal); - return key_set_.has_all(metal_ks); + return key_set_.has(DispatchKey::Metal); } bool is_mlc() const { - constexpr auto mls_ks = DispatchKeySet(DispatchKey::MLC); - return key_set_.has_all(mls_ks); + return key_set_.has(DispatchKey::MLC); } bool is_ort() const { - constexpr auto ort_ks = DispatchKeySet(DispatchKey::ORT); - return key_set_.has_all(ort_ks); + return key_set_.has(DispatchKey::ORT); } // TODO: remove this once we don't automatically enabled Autograd dispatch @@ -950,8 +938,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // Invariant: // Inference tensor has version_counter_.enabled() == false bool is_inference() { - bool no_ADInplaceOrView = !key_set_.has_any(c10::inplace_or_view_ks); - bool no_Autograd = !key_set_.has_any(c10::autograd_dispatch_keyset); + bool no_ADInplaceOrView = !key_set_.has(c10::DispatchKey::ADInplaceOrView); + bool no_Autograd = (key_set_ & c10::autograd_dispatch_keyset).empty(); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( no_ADInplaceOrView == no_Autograd, "ADInplaceOrView and Autograd keys must be on/off at the same time."); @@ -972,22 +960,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { Layout layout() const { // NB: This method is not virtual and avoid dispatches for perf. - // strided is also the most common layout type, so we check for - // strided case first. 
- // This keyset must also be kept in sync with the logic in - // is_sparse() / is_sparse_csr() / is_mkldnn() - constexpr auto sparse_and_sparsecsr_and_mkldnn_ks = - c10::sparse_ks | c10::sparse_csr_ks | c10::mkldnn_ks; - if (!key_set_.has_any(sparse_and_sparsecsr_and_mkldnn_ks)) { - return kStrided; - } else if (is_sparse()) { + if (is_sparse()) { return kSparse; } else if (is_sparse_csr()) { return kSparseCsr; - } else { - TORCH_INTERNAL_ASSERT( - is_mkldnn(), "There is an error in the layout calculation logic."); + } else if (is_mkldnn()) { return kMkldnn; + } else { + return kStrided; } } @@ -1073,8 +1053,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the imaginary part of the tensor should be negated */ inline bool is_conj() const { - constexpr auto conjugate_ks = DispatchKeySet(DispatchKey::Conjugate); - return key_set_.has_all(conjugate_ks); + return key_set_.has(DispatchKey::Conjugate); } /** @@ -1094,8 +1073,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the tensor is a zerotensor */ inline bool _is_zerotensor() const { - constexpr auto zerotensor_ks = DispatchKeySet(DispatchKey::ZeroTensor); - return key_set_.has_all(zerotensor_ks); + return key_set_.has(DispatchKey::ZeroTensor); } /** @@ -1115,8 +1093,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { * Whether or not the tensor should be negated */ inline bool is_neg() const { - constexpr auto negative_ks = DispatchKeySet(DispatchKey::Negative); - return key_set_.has_all(negative_ks); + return key_set_.has(DispatchKey::Negative); } /** @@ -1487,14 +1464,14 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_python_dispatch(bool k) { if (k) { - key_set_ = key_set_.add(c10::python_ks); + key_set_ = key_set_.add(DispatchKey::Python); } else { - key_set_ = key_set_ - c10::python_ks; + key_set_ = key_set_.remove(DispatchKey::Python); } } bool is_python_dispatch() const { - return key_set_.has_all(c10::python_ks); + return key_set_.has(DispatchKey::Python); } /** From 22ccf448e8d87e78dd002424bab2d937d9e2db0d Mon Sep 17 00:00:00 2001 From: Brian Hirsh Date: Mon, 14 Feb 2022 15:23:25 -0800 Subject: [PATCH 022/199] Revert D34034848: free up dispatch key space (in C++) Test Plan: revert-hammer Differential Revision: D34034848 (https://github.com/pytorch/pytorch/commit/66902560216740db6f38cb7a8458bd29c213f296) Original commit changeset: 9677ee2c0a1a Original Phabricator Diff: D34034848 (https://github.com/pytorch/pytorch/commit/66902560216740db6f38cb7a8458bd29c213f296) fbshipit-source-id: fd50943d915ef813bb9f9ab278fb582429eea3b1 (cherry picked from commit 3acefee1cdb89bc051d1ef2e9deb5698d2bd85c3) --- aten/src/ATen/TensorSubclassLikeUtils.h | 3 +- aten/src/ATen/core/TensorBase.h | 1 + .../core/dispatch/DispatchKeyExtractor.cpp | 41 -- .../ATen/core/dispatch/DispatchKeyExtractor.h | 29 +- aten/src/ATen/core/dispatch/Dispatcher.cpp | 10 +- aten/src/ATen/core/dispatch/Dispatcher.h | 11 +- aten/src/ATen/core/dispatch/OperatorEntry.cpp | 36 +- aten/src/ATen/core/dispatch/OperatorEntry.h | 11 +- .../op_registration/op_registration_test.cpp | 42 +- c10/core/DispatchKey.cpp | 163 ++--- c10/core/DispatchKey.h | 485 +++---------- c10/core/DispatchKeySet.cpp | 225 ++---- c10/core/DispatchKeySet.h | 676 +++--------------- c10/core/TensorImpl.cpp | 2 +- c10/core/TensorImpl.h | 53 +- c10/test/core/DispatchKeySet_test.cpp | 373 ++-------- test/test_dispatch.py | 26 +- test/test_sparse.py | 6 +- tools/codegen/model.py | 62 +- 
torch/_python_dispatcher.py | 6 +- 20 files changed, 514 insertions(+), 1747 deletions(-) diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index e9f5e7d26e1..7f5517bc081 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -28,7 +28,8 @@ constexpr auto kFunctorchWrappedTensors = DispatchKeySet({ constexpr auto kTensorSubclassLike = kFunctorchWrappedTensors | DispatchKeySet({ DispatchKey::Batched, - DispatchKey::Sparse, + DispatchKey::SparseCPU, + DispatchKey::SparseCUDA, DispatchKey::SparseCsrCPU, DispatchKey::SparseCsrCUDA, DispatchKey::Meta, diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 225b6c934c0..b05f74259dc 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -43,6 +43,7 @@ inline bool variable_excluded_from_dispatch() { // Please read the comment in `VariableFallbackKernel.cpp` about the background of this change. return true; #else + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::impl::tls_local_dispatch_key_set().excluded_.has(DispatchKey::Autograd)); return c10::impl::tls_local_dispatch_key_set().excluded_.isSupersetOf(c10::autograd_dispatch_keyset); #endif } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp index 9180d0d19e6..a930edc2db6 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.cpp @@ -6,52 +6,11 @@ namespace c10 { void DispatchKeyExtractor::setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough) { - // (1) update nonFallthroughKeys_ if (has_fallthrough) { nonFallthroughKeys_ = nonFallthroughKeys_.remove(k); } else { nonFallthroughKeys_ = nonFallthroughKeys_.add(k); } - // (2) update nonFallthroughKeysPerBackend_ - if (isPerBackendFunctionalityKey(toFunctionalityKey(k))) { - // This is a per-backend functionality key. - // We need to figure out what the current backend is, - // and only update the bitset for that backend. - // subtracting 1 because the first backend should have index 0 (CPU), - // But the enum starts with BackendComponent::InvalidBit. - auto backend_idx = static_cast(toBackendComponent(k)) - 1; - TORCH_INTERNAL_ASSERT(backend_idx >= 0 && static_cast(backend_idx) < nonFallthroughKeysPerBackend_.size()); - if (has_fallthrough) { - nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].remove(k); - } else { - nonFallthroughKeysPerBackend_[backend_idx] = nonFallthroughKeysPerBackend_[backend_idx].add(k); - } - - // Set requiresBitsetPerBackend_ accordingly - for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size() - 1)) { - if (nonFallthroughKeysPerBackend_[i] != nonFallthroughKeysPerBackend_[i+1]) { - requiresBitsetPerBackend_ = true; - return; - } - } - requiresBitsetPerBackend_ = false; - return; - } else { - // Otherwise, if a fallthrough is set for a functionality that isn't per backend, - // Then we update the fallthrough bitset for EVERY backend. 
- // TODO: we could probably optimize this by only lazily updating these values - // the first time that we see requiresBitsetPerBackend_ = true - // (which should almost never happen) - if (has_fallthrough) { - for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { - nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].remove(k); - } - } else { - for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { - nonFallthroughKeysPerBackend_[i] = nonFallthroughKeysPerBackend_[i].add(k); - } - } - } } std::string DispatchKeyExtractor::dumpState() const { diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 79ea44396bd..4d2e7d0d4bd 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -156,24 +156,14 @@ public: } }); // Keys that are fallthrough should be skipped - if (requiresBitsetPerBackend_) { - auto backend_idx = ks.getBackendIndex(); - return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]); - } else { - return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); - } + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); } template DispatchKeySet getDispatchKeySetUnboxed(const Args&... args) const { auto ks = detail::multi_dispatch_key_set(args...); // Keys that are fallthrough should be skipped - if (requiresBitsetPerBackend_) { - auto backend_idx = ks.getBackendIndex(); - return impl::computeDispatchKeySet(ks, nonFallthroughKeysPerBackend_[backend_idx]); - } else { - return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); - } + return impl::computeDispatchKeySet(ks, nonFallthroughKeys_); } void setOperatorHasFallthroughForKey(DispatchKey k, bool has_fallthrough); @@ -203,12 +193,7 @@ private: explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse) : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse) - , nonFallthroughKeys_(DispatchKeySet::FULL) - , requiresBitsetPerBackend_(false) { - for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) { - nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL; - } - } + , nonFallthroughKeys_(DispatchKeySet::FULL) {} // this is a bitset that has ones for each argument index which has to be // considered for dispatch. This avoids having to iterate over the stack @@ -220,14 +205,8 @@ private: // fallthrough c10::utils::bitset dispatch_arg_indices_reverse_; - // Set of functionality keys for which the operator does NOT have fallthrough kernel. + // Set of keys for which the operator does NOT have fallthrough kernel. DispatchKeySet nonFallthroughKeys_; - // Set of functionality keys for which the operator does NOT have fallthrough kernel, defined PER BACKEND. - // This is only needed if we know that the operator has a different set of fallthroughs defined for some backends. 
- std::array nonFallthroughKeysPerBackend_; - // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast path), - // or if we need to fall back to the slower path and check nonFallthroughKeysPerBackend_ - bool requiresBitsetPerBackend_; }; } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index f2426f6bb1f..3dccc4645a8 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -267,15 +267,14 @@ void Dispatcher::cleanup(const OperatorHandle& op, const OperatorName& op_name) RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, KernelFunction kernel, std::string debug) { std::lock_guard lock(mutex_); - auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); TORCH_CHECK( - !backendFallbackKernels_[idx].kernel.isValid(), + !backendFallbackKernels_[static_cast(dispatchKey)].kernel.isValid(), "Tried to register multiple backend fallbacks for the same dispatch key ", dispatchKey, "; previous registration ", - backendFallbackKernels_[idx].debug, ", new registration ", debug + backendFallbackKernels_[static_cast(dispatchKey)].debug, ", new registration ", debug ); // NB: inferred function schema is always nullptr for fallbacks, as fallbacks // cannot be unobxed - backendFallbackKernels_[idx] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); + backendFallbackKernels_[static_cast(dispatchKey)] = impl::AnnotatedKernel(std::move(kernel), nullptr, std::move(debug)); for (auto& op : operators_) { op.op.updateFallback(*this, dispatchKey); @@ -289,8 +288,7 @@ RegistrationHandleRAII Dispatcher::registerFallback(DispatchKey dispatchKey, Ker void Dispatcher::deregisterFallback_(DispatchKey dispatchKey) { std::lock_guard lock(mutex_); - auto idx = getDispatchTableIndexForDispatchKey(dispatchKey); - backendFallbackKernels_[idx] = {}; + backendFallbackKernels_[static_cast(dispatchKey)] = {}; for (auto& op : operators_) { op.op.updateFallback(*this, dispatchKey); diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index 8108c3c1928..14ffa2f94c9 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -291,7 +291,7 @@ private: // Map from namespace to debug string (saying, e.g., where the library was defined) ska::flat_hash_map libraries_; - std::array backendFallbackKernels_; + std::array(DispatchKey::NumDispatchKeys)> backendFallbackKernels_; std::unique_ptr listeners_; std::mutex mutex_; @@ -531,7 +531,8 @@ C10_DISPATCHER_INLINE_UNLESS_MOBILE Return Dispatcher::call(const TypedOperatorH detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5 auto dispatchKeySet = op.operatorDef_->op.dispatchKeyExtractor() .template getDispatchKeySetUnboxed(args...); - const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::isAliasDispatchKey(dispatchKeySet.highestPriorityTypeId())); + const KernelFunction& kernel = op.operatorDef_->op.lookup(dispatchKeySet.highestPriorityTypeId()); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING // By default, when there're no high-frequency or non-sampled callbacks, // RecordFunction is pre-sampled as a perf optimization; @@ -552,7 +553,7 @@ template inline Return Dispatcher::redispatch(const TypedOperatorHandle& op, DispatchKeySet currentDispatchKeySet, Args... 
args) const { detail::unused_arg_(args...); // workaround for a false-positive warning about unused parameters in gcc 5 // do not use RecordFunction on redispatch - const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet); + const KernelFunction& kernel = op.operatorDef_->op.lookup(currentDispatchKeySet.highestPriorityTypeId()); return kernel.template call(op, currentDispatchKeySet, std::forward(args)...); } @@ -560,7 +561,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const // note: this doesn't need the mutex because write operations on the list keep iterators intact. const auto& entry = op.operatorDef_->op; auto dispatchKeySet = entry.dispatchKeyExtractor().getDispatchKeySetBoxed(stack); - const auto& kernel = entry.lookup(dispatchKeySet); + const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId()); #ifndef PYTORCH_DISABLE_PER_OP_PROFILING bool pre_sampled = false; if (C10_UNLIKELY(at::shouldRunRecordFunction(&pre_sampled))) { @@ -592,7 +593,7 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const inline void Dispatcher::redispatchBoxed(const OperatorHandle& op, DispatchKeySet dispatchKeySet, Stack* stack) const { // note: this doesn't need the mutex because write operations on the list keep iterators intact. const auto& entry = op.operatorDef_->op; - const auto& kernel = entry.lookup(dispatchKeySet); + const auto& kernel = entry.lookup(dispatchKeySet.highestPriorityTypeId()); return kernel.callBoxed(op, dispatchKeySet, stack); } diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 06165baf183..d4d997fde69 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -283,7 +283,7 @@ std::pair OperatorEntry::computeDispatchTab } // 3. Backend fallback - auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key); + auto dispatch_ix = static_cast(dispatch_key); if (dispatcher.backendFallbackKernels_[dispatch_ix].kernel.isValid()) { return {dispatcher.backendFallbackKernels_[dispatch_ix], "backend fallback"}; } @@ -299,7 +299,10 @@ std::pair OperatorEntry::computeDispatchTab // or alias keys and their associated keysets). // This function should be considered a private helper for updateDispatchTable_() void OperatorEntry::updateDispatchTableEntry_(const c10::Dispatcher& dispatcher, DispatchKey dispatch_key) { - const auto dispatch_ix = getDispatchTableIndexForDispatchKey(dispatch_key); + const auto dispatch_ix = c10::getDispatchTableIndexForDispatchKey(dispatch_key); + if (C10_UNLIKELY(dispatch_ix == -1)) { + return; + } dispatchTable_[dispatch_ix] = computeDispatchTableEntry(dispatcher, dispatch_key); dispatchKeyExtractor_.setOperatorHasFallthroughForKey(dispatch_key, dispatchTable_[dispatch_ix].isFallthrough()); } @@ -326,12 +329,8 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp } // Note [Refresh Runtime Autograd entries in dispatchTable_] // Registering to backend key might affect computed entry at its Autograd backend key due to (2.1) & (2.3). - // In theory, we should only have to check if the given runtime key has "dense" functionality, - // e.g. DispatchKey::CPU (which is composed of DispatchKey::Dense and BackendComponent::CPUBit). - // However, there are some backends that should be included in this set that don't have the dense key set. - // E.g. DispatchKey::Meta, DispatchKey::ORT. 
if (c10::isBackendDispatchKey(dispatch_key)) { - DispatchKey autograd_key = getAutogradKeyFromBackend(toBackendComponent(dispatch_key)); + DispatchKey autograd_key = getAutogradKeyFromBackend(dispatch_key); updateDispatchTableEntry_(dispatcher, autograd_key); } } @@ -358,9 +357,8 @@ void OperatorEntry::updateDispatchTableFull_(const c10::Dispatcher& dispatcher) // catchAll. After catchAllKernel_ is removed, Undefined now can get a kernel from either CompositeExplicitAutograd // or CompositeImplicitAutograd alias key so that we don't break the support. Ideally isIncludedInAlias(Undefined, CompositeImplicitAutograd) // should return true, it returns false because Undefined cannot be represented in a DispatchKeySet. - updateDispatchTable_(dispatcher, DispatchKey::Undefined); - for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { - updateDispatchTable_(dispatcher, k); + for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { + updateDispatchTable_(dispatcher, static_cast(iter)); } } @@ -373,10 +371,9 @@ void OperatorEntry::checkInvariants() const { for (const auto& kv : kernels_) { TORCH_INTERNAL_ASSERT(kv.second.size() > 0, dumpState()); } - for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { - auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), k); - auto idx = getDispatchTableIndexForDispatchKey(k); - TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[idx]), + for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { + auto expected_k = computeDispatchTableEntry(c10::Dispatcher::singleton(), static_cast(iter)); + TORCH_INTERNAL_ASSERT(expected_k._equalsBoxedAndUnboxed(dispatchTable_[iter]), "Canonical state\n~~~~~~~~~~~\n", dumpState(), "\n\n" "Computed table:\n~~~~~~~~~~~\n", dumpComputedTable()); } @@ -387,8 +384,7 @@ std::string OperatorEntry::listAllDispatchKeys() const { str << "["; bool has_kernels = false; - for (auto k : DispatchKeySet(DispatchKeySet::FULL)) { - auto iter = getDispatchTableIndexForDispatchKey(k); + for (uint8_t iter = 0; iter != static_cast(DispatchKey::NumDispatchKeys); ++iter) { if (!dispatchTable_[iter].isValid()) { continue; } @@ -447,12 +443,8 @@ void OperatorEntry::reportError(DispatchKey dispatchKey) const { // updateDispatchTableFull_ would update the dispatch table to be) std::string OperatorEntry::dumpComputedTable() const { std::ostringstream oss; - // Need to handle Undefined separately, because its a runtime key that can't be represented - // in a DispatchKeySet. 
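Several hunks here replace iteration over `DispatchKeySet(DispatchKeySet::FULL)` with a plain integer loop up to `DispatchKey::NumDispatchKeys`. A compilable toy version of that idiom, using an invented `Key` enum rather than the real one:

```cpp
#include <cstdint>
#include <iostream>

enum class Key : uint8_t { Undefined, CPU, CUDA, AutogradCPU, NumKeys };

const char* toString(Key k) {
  switch (k) {
    case Key::Undefined:   return "Undefined";
    case Key::CPU:         return "CPU";
    case Key::CUDA:        return "CUDA";
    case Key::AutogradCPU: return "AutogradCPU";
    default:               return "?";
  }
}

int main() {
  // Every runtime key, Undefined included, is visited exactly once by
  // iterating the underlying integer and casting back to the enum.
  for (uint8_t i = 0; i != static_cast<uint8_t>(Key::NumKeys); ++i) {
    std::cout << int(i) << " -> " << toString(static_cast<Key>(i)) << "\n";
  }
  return 0;
}
```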
- std::vector runtime_keys = {DispatchKey::Undefined}; - for (auto k : DispatchKeySet(DispatchKeySet::FULL)) runtime_keys.push_back(k); - - for (auto k : runtime_keys) { + for (uint8_t i = 0; i < static_cast(DispatchKey::NumDispatchKeys); i++) { + auto k = static_cast(i); auto kernel_prov = computeDispatchTableEntryWithDebug(c10::Dispatcher::singleton(), k); if (kernel_prov.first.kernel.isValid()) { oss << toString(k) << ": " diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.h b/aten/src/ATen/core/dispatch/OperatorEntry.h index d86f0cfef3e..d98bd6bc690 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.h +++ b/aten/src/ATen/core/dispatch/OperatorEntry.h @@ -173,8 +173,11 @@ public: [[noreturn]] void reportError(DispatchKey dispatchKey) const; - const KernelFunction& lookup(DispatchKeySet ks) const { - const auto idx = ks.getDispatchTableIndexForDispatchKeySet(); + const KernelFunction& lookup(DispatchKey k) const { + const auto idx = getDispatchTableIndexForDispatchKey(k); + if (C10_UNLIKELY(idx == -1)) { + reportError(k); + } const auto& kernel = dispatchTable_[idx]; // A valid kernel *always* has a boxed kernel and *may* have an // unboxed kernel. However, we typically do unboxed calls in at:: @@ -184,7 +187,7 @@ public: // in the common case. if (C10_UNLIKELY(!kernel.isValidUnboxed())) { if (!kernel.isValid()) { - reportError(ks.highestPriorityTypeId()); + reportError(k); } } return kernel; @@ -208,7 +211,7 @@ private: OperatorName name_; c10::optional schema_; - std::array dispatchTable_; + std::array dispatchTable_; DispatchKeyExtractor dispatchKeyExtractor_; // kernels_ stores all registered kernels for the corresponding dispatch key diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index 970e7949131..0a3f9236b75 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -591,7 +591,7 @@ TEST(OperatorRegistrationTest, AutogradBackendOverridesAutogradKernel) { void LazyBackendsAutogradOverridesAutogradKernel(DispatchKey key) { auto registrar = c10::RegisterOperators().op("_test::dummy(Tensor dummy) -> ()", c10::RegisterOperators::options() - .kernel(c10::getAutogradKeyFromBackend(toBackendComponent(key))) + .kernel(c10::getAutogradKeyFromBackend(key)) .kernel(DispatchKey::Autograd)); auto op = Dispatcher::singleton().findSchema({"_test::dummy", ""}); @@ -1791,22 +1791,22 @@ TEST(NewOperatorRegistrationTest, dispatchAutogradPrecedence) { TEST(NewOperatorRegistrationTest, throwsWhenRegisterToBackendMapsToAutogradOther) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - bool fpga_called, math_called = false; + bool sparsecpu_called, math_called = false; auto m = MAKE_TORCH_LIBRARY(test); - m.def("fn", torch::dispatch(c10::DispatchKey::FPGA, [&](const Tensor& x) { fpga_called = true; return x; })); + m.def("fn", torch::dispatch(c10::DispatchKey::SparseCPU, [&](const Tensor& x) { sparsecpu_called = true; return x; })); m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); auto op = Dispatcher::singleton().findSchema({"test::fn", ""}); ASSERT_TRUE(op.has_value()); { - callOp(*op, dummyTensor(c10::DispatchKey::FPGA)); - ASSERT_TRUE(fpga_called); + callOp(*op, dummyTensor(c10::DispatchKey::SparseCPU)); + ASSERT_TRUE(sparsecpu_called); } { expectThrows([&] { - callOp(*op, dummyTensor(c10::DispatchKey::FPGA, /*requires_grad=*/true)); + callOp(*op, 
dummyTensor(c10::DispatchKey::SparseCPU, /*requires_grad=*/true)); }, "test::fn has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther."); } } @@ -1849,15 +1849,18 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) { } { + // TODO(#43908): currently this will fallthrough AutogradPrivateUse1 then call catchall kernel + // at AutogradCPU, while backend extenders are indeed expecting to call PrivateUse1 kernel. + // This confusing behavior is caused by we registering fallthrough as backend fallback for + // Autograd keys. Note users could always work around this by registering the same kernel to + // AutogradPrivateUse1 as shown below until we support it. auto op = Dispatcher::singleton().findOp({"test::fn", ""}); ASSERT_TRUE(op.has_value()); catchall_called = false; - privateuse1_called = false; callOp(*op, dummyTensor(c10::DispatchKey::PrivateUse1, /*requires_grad=*/true), dummyTensor(c10::DispatchKey::CPU, /*requires_grad=*/true)); - ASSERT_FALSE(catchall_called); - ASSERT_TRUE(privateuse1_called); + ASSERT_TRUE(catchall_called); } m.impl("fn", c10::DispatchKey::AutogradPrivateUse1, [&](const Tensor& x, const Tensor& y) { privateuse1_called = true; return x; }); @@ -1873,27 +1876,6 @@ TEST(NewOperatorRegistrationTest, dispatchMultipleTensors) { } } -TEST(NewOperatorRegistrationTest, registerCompositeImplicitAutogradWithCPUKernel_andCallAutogradOtherKernel_callsComposite) { - bool math_called = false; - bool cpu_called = false; - auto m = MAKE_TORCH_LIBRARY(test); - m.def("fn(Tensor dummy) -> Tensor"); - m.impl("fn", c10::DispatchKey::CPU, [&](const Tensor& x) { cpu_called = true; return x; }); - m.impl("fn", c10::DispatchKey::CompositeImplicitAutograd, [&](const Tensor& x) { math_called = true; return x; }); - - auto op = Dispatcher::singleton().findSchema({"test::fn", ""}); - ASSERT_TRUE(op.has_value()); - - { - math_called = cpu_called = false; - // Meta should redispatch to the AutogradOther backend, - // which the composite kernel should be registered to. 
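The `OperatorEntry::lookup` change a few hunks up takes a `DispatchKey` directly, converts it to a table index, and reports an error both for keys without a slot and for slots without a kernel. Below is a self-contained sketch of that shape; `OperatorEntry`, `Key`, and `tableIndex` here are simplified stand-ins, and exceptions stand in for the dispatcher's `reportError`.

```cpp
#include <array>
#include <functional>
#include <iostream>
#include <stdexcept>

enum class Key : int { Undefined, CPU, CUDA, NumKeys };

constexpr int tableIndex(Key k) {
  return (k >= Key::Undefined && k < Key::NumKeys) ? static_cast<int>(k) : -1;
}

struct OperatorEntry {
  std::array<std::function<int(int)>, 3> table{};  // one slot per runtime key

  const std::function<int(int)>& lookup(Key k) const {
    const int idx = tableIndex(k);
    if (idx == -1) throw std::runtime_error("no table slot for this key");
    const auto& kernel = table[idx];
    if (!kernel) throw std::runtime_error("no kernel registered for this key");
    return kernel;
  }
};

int main() {
  OperatorEntry op;
  op.table[tableIndex(Key::CPU)] = [](int x) { return x + 1; };
  std::cout << op.lookup(Key::CPU)(41) << "\n";  // 42
  try {
    op.lookup(Key::CUDA);
  } catch (const std::exception& e) {
    std::cout << "error: " << e.what() << "\n";  // no kernel registered for this key
  }
  return 0;
}
```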
- callOp(*op, dummyTensor(c10::DispatchKey::Meta, /*requires_grad=*/true)); - ASSERT_TRUE(math_called); - ASSERT_FALSE(cpu_called); - } -} - TEST(NewOperatorRegistrationTest, dispatchMultiple) { bool cpu_called = false; bool cuda_called = false; diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index b95558563bf..7d2f9e7fcb6 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -1,47 +1,14 @@ #include -#include #include namespace c10 { -const char* toString(BackendComponent t) { - switch (t) { - case BackendComponent::CPUBit: - return "CPUBit"; - case BackendComponent::CUDABit: - return "CUDABit"; - case BackendComponent::HIPBit: - return "HIPBit"; - case BackendComponent::XLABit: - return "XLABit"; - case BackendComponent::LazyBit: - return "LazyBit"; - case BackendComponent::XPUBit: - return "XPUBit"; - case BackendComponent::MLCBit: - return "MLCBit"; - case BackendComponent::HPUBit: - return "HPUBit"; - case BackendComponent::VEBit: - return "VEBit"; - case BackendComponent::PrivateUse1Bit: - return "PrivateUse1Bit"; - case BackendComponent::PrivateUse2Bit: - return "PrivateUse2Bit"; - case BackendComponent::PrivateUse3Bit: - return "PrivateUse3Bit"; - case BackendComponent::InvalidBit: - return "InvalidBit"; - default: - return "UNKNOWN_BACKEND_BIT"; - } -} - const char* toString(DispatchKey t) { switch (t) { case DispatchKey::Undefined: return "Undefined"; + case DispatchKey::CPU: return "CPU"; case DispatchKey::CUDA: @@ -134,6 +101,8 @@ const char* toString(DispatchKey t) { return "AutogradMLC"; case DispatchKey::AutogradHPU: return "AutogradHPU"; + case DispatchKey::AutogradNestedTensor: + return "AutogradNestedTensor"; case DispatchKey::AutogradPrivateUse1: return "AutogradPrivateUse1"; case DispatchKey::AutogradPrivateUse2: @@ -142,8 +111,6 @@ const char* toString(DispatchKey t) { return "AutogradPrivateUse3"; case DispatchKey::AutogradOther: return "AutogradOther"; - case DispatchKey::AutogradNestedTensor: - return "AutogradNestedTensor"; case DispatchKey::ZeroTensor: return "ZeroTensor"; @@ -201,15 +168,6 @@ const char* toString(DispatchKey t) { case DispatchKey::FuncTorchBatched: return "FuncTorchBatched"; - case DispatchKey::Dense: - return "Dense"; - case DispatchKey::Quantized: - return "Quantized"; - case DispatchKey::Sparse: - return "Sparse"; - case DispatchKey::AutogradFunctionality: - return "AutogradFunctionality"; - default: return "UNKNOWN_TENSOR_TYPE_ID"; } @@ -218,37 +176,76 @@ const char* toString(DispatchKey t) { std::ostream& operator<<(std::ostream& str, DispatchKey rhs) { return str << toString(rhs); } -std::ostream& operator<<(std::ostream& str, BackendComponent rhs) { - return str << toString(rhs); -} -DispatchKey getAutogradKeyFromBackend(BackendComponent k) { - // We want this to return an autograd key. We're relying on the fact that - // getAutogradRelatedKeySetFromBackend returns an autograd key + - // ADInplaceOrView, and autograd has higher precedence. The core mapping from - // backend -> autograd key lives in `getAutogradRelatedKeySetFromBackend` - // instead of here for performance. `getAutogradRelatedKeySetFromBackend` is a - // hotpath function, and we want to make sure that it doesn't have to - // construct any DispatchKeySets at runtime. - return getAutogradRelatedKeySetFromBackend(k).highestPriorityTypeId(); +// for a given backend key, return the associated autograd key. +// for non-backend keys, return AutogradOther as a default. 
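The comment being added above introduces the restored `getAutogradKeyFromBackend(DispatchKey)`: a switch from backend keys to their autograd keys, with `AutogradOther` as the default for everything else. A minimal standalone sketch of that mapping, with a toy enum and only a few cases:

```cpp
#include <iostream>

enum class Key { CPU, CUDA, PrivateUse1, FPGA,
                 AutogradCPU, AutogradCUDA, AutogradPrivateUse1, AutogradOther };

Key autogradKeyFromBackend(Key backend) {
  switch (backend) {
    case Key::CPU:         return Key::AutogradCPU;
    case Key::CUDA:        return Key::AutogradCUDA;
    case Key::PrivateUse1: return Key::AutogradPrivateUse1;
    default:               return Key::AutogradOther;  // e.g. FPGA has no dedicated autograd key
  }
}

int main() {
  std::cout << (autogradKeyFromBackend(Key::CPU)  == Key::AutogradCPU)   << "\n";  // 1
  std::cout << (autogradKeyFromBackend(Key::FPGA) == Key::AutogradOther) << "\n";  // 1
  return 0;
}
```

As the restored comment notes, returning a default rather than throwing keeps the function cheap but shifts responsibility to callers to pass only backend keys, or to interpret `AutogradOther` carefully.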
+// Note: it's convenient and fast to return a default here rather than (say) +// returning an optional, or throwing. But it makes callers +// responsible for either a) enforcing the invariant that only backend keys +// be passed as arguments, or b) interpreting our return value carefully. +// +DispatchKey getAutogradKeyFromBackend(DispatchKey t) { + switch (t) { + case DispatchKey::CPU: + return DispatchKey::AutogradCPU; + case DispatchKey::XPU: + return DispatchKey::AutogradXPU; + case DispatchKey::CUDA: + return DispatchKey::AutogradCUDA; + case DispatchKey::XLA: + return DispatchKey::AutogradXLA; + case DispatchKey::Lazy: + return DispatchKey::AutogradLazy; + case DispatchKey::MLC: + return DispatchKey::AutogradMLC; + case DispatchKey::HPU: + return DispatchKey::AutogradHPU; + case DispatchKey::NestedTensor: + return DispatchKey::AutogradNestedTensor; + case DispatchKey::PrivateUse1: + return DispatchKey::AutogradPrivateUse1; + case DispatchKey::PrivateUse2: + return DispatchKey::AutogradPrivateUse2; + case DispatchKey::PrivateUse3: + return DispatchKey::AutogradPrivateUse3; + default: + return DispatchKey::AutogradOther; + } } c10::DispatchKey parseDispatchKey(const std::string& k) { static std::unordered_map key_map = { {"Undefined", c10::DispatchKey::Undefined}, - {"Dense", c10::DispatchKey::Dense}, + {"CPU", c10::DispatchKey::CPU}, + {"CUDA", c10::DispatchKey::CUDA}, + {"HIP", c10::DispatchKey::HIP}, {"FPGA", c10::DispatchKey::FPGA}, {"ORT", c10::DispatchKey::ORT}, + {"XLA", c10::DispatchKey::XLA}, + {"MLC", c10::DispatchKey::MLC}, {"Vulkan", c10::DispatchKey::Vulkan}, {"Metal", c10::DispatchKey::Metal}, + {"XPU", c10::DispatchKey::XPU}, + {"HPU", c10::DispatchKey::HPU}, {"VE", c10::DispatchKey::VE}, + {"Lazy", c10::DispatchKey::Lazy}, {"Meta", c10::DispatchKey::Meta}, - {"Quantized", c10::DispatchKey::Quantized}, + {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, + {"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA}, + {"QuantizedXPU", c10::DispatchKey::QuantizedXPU}, {"CustomRNGKeyId", c10::DispatchKey::CustomRNGKeyId}, {"MkldnnCPU", c10::DispatchKey::MkldnnCPU}, - {"Sparse", c10::DispatchKey::Sparse}, + {"SparseCPU", c10::DispatchKey::SparseCPU}, + {"SparseCUDA", c10::DispatchKey::SparseCUDA}, + {"SparseHIP", c10::DispatchKey::SparseHIP}, + {"SparseXPU", c10::DispatchKey::SparseXPU}, + {"SparseVE", c10::DispatchKey::SparseVE}, {"SparseCsrCPU", c10::DispatchKey::SparseCsrCPU}, {"SparseCsrCUDA", c10::DispatchKey::SparseCsrCUDA}, + {"NestedTensor", c10::DispatchKey::NestedTensor}, + {"PrivateUse1", c10::DispatchKey::PrivateUse1}, + {"PrivateUse2", c10::DispatchKey::PrivateUse2}, + {"PrivateUse3", c10::DispatchKey::PrivateUse3}, {"BackendSelect", c10::DispatchKey::BackendSelect}, {"Python", c10::DispatchKey::Python}, {"Named", c10::DispatchKey::Named}, @@ -259,8 +256,17 @@ c10::DispatchKey parseDispatchKey(const std::string& k) { c10::DispatchKey::FuncTorchDynamicLayerBackMode}, {"ADInplaceOrView", c10::DispatchKey::ADInplaceOrView}, {"AutogradOther", c10::DispatchKey::AutogradOther}, - {"AutogradFunctionality", c10::DispatchKey::AutogradFunctionality}, + {"AutogradCPU", c10::DispatchKey::AutogradCPU}, + {"AutogradCUDA", c10::DispatchKey::AutogradCUDA}, + {"AutogradXLA", c10::DispatchKey::AutogradXLA}, + {"AutogradLazy", c10::DispatchKey::AutogradLazy}, + {"AutogradXPU", c10::DispatchKey::AutogradXPU}, + {"AutogradMLC", c10::DispatchKey::AutogradMLC}, + {"AutogradHPU", c10::DispatchKey::AutogradHPU}, {"AutogradNestedTensor", c10::DispatchKey::AutogradNestedTensor}, + 
{"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1}, + {"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2}, + {"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3}, {"Tracer", c10::DispatchKey::Tracer}, {"AutocastCPU", c10::DispatchKey::AutocastCPU}, {"AutocastCUDA", c10::DispatchKey::AutocastCUDA}, @@ -274,41 +280,6 @@ c10::DispatchKey parseDispatchKey(const std::string& k) { {"TESTING_ONLY_GenericWrapper", c10::DispatchKey::TESTING_ONLY_GenericWrapper}, {"TESTING_ONLY_GenericMode", c10::DispatchKey::TESTING_ONLY_GenericMode}, - - {"CPU", c10::DispatchKey::CPU}, - {"CUDA", c10::DispatchKey::CUDA}, - {"HIP", c10::DispatchKey::HIP}, - {"XLA", c10::DispatchKey::XLA}, - {"MLC", c10::DispatchKey::MLC}, - {"XPU", c10::DispatchKey::XPU}, - {"HPU", c10::DispatchKey::HPU}, - {"Lazy", c10::DispatchKey::Lazy}, - {"NestedTensor", c10::DispatchKey::NestedTensor}, - {"PrivateUse1", c10::DispatchKey::PrivateUse1}, - {"PrivateUse2", c10::DispatchKey::PrivateUse2}, - {"PrivateUse3", c10::DispatchKey::PrivateUse3}, - - {"QuantizedCPU", c10::DispatchKey::QuantizedCPU}, - {"QuantizedCUDA", c10::DispatchKey::QuantizedCUDA}, - {"QuantizedXPU", c10::DispatchKey::QuantizedXPU}, - - {"SparseCPU", c10::DispatchKey::SparseCPU}, - {"SparseCUDA", c10::DispatchKey::SparseCUDA}, - {"SparseHIP", c10::DispatchKey::SparseHIP}, - {"SparseXPU", c10::DispatchKey::SparseXPU}, - {"SparseVE", c10::DispatchKey::SparseVE}, - - {"AutogradCPU", c10::DispatchKey::AutogradCPU}, - {"AutogradCUDA", c10::DispatchKey::AutogradCUDA}, - {"AutogradXLA", c10::DispatchKey::AutogradXLA}, - {"AutogradLazy", c10::DispatchKey::AutogradLazy}, - {"AutogradXPU", c10::DispatchKey::AutogradXPU}, - {"AutogradMLC", c10::DispatchKey::AutogradMLC}, - {"AutogradHPU", c10::DispatchKey::AutogradHPU}, - {"AutogradPrivateUse1", c10::DispatchKey::AutogradPrivateUse1}, - {"AutogradPrivateUse2", c10::DispatchKey::AutogradPrivateUse2}, - {"AutogradPrivateUse3", c10::DispatchKey::AutogradPrivateUse3}, - {"Autograd", c10::DispatchKey::Autograd}, {"CompositeImplicitAutograd", c10::DispatchKey::CompositeImplicitAutograd}, diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index b5860bd608c..1bb8268e2bd 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -9,98 +9,20 @@ namespace c10 { -// Semantically, each value of BackendComponent identifies a "backend" for our -// dispatch. Some functionalities that we may dispatch to are allowed to -// register different handlers for each backend. The BackendComponent is then -// used to figure out which backend implementation to dispatch to. - -// In implementation terms, the backend component identifies a specific "bit" in -// a DispatchKeySet. The bits in the DispatchKeySet are split between the bottom -// ~12 "BackendComponent" bits, while the remaining upper bits are assigned to -// functionalities. When we encounter a functionality bit that is known to be -// customizeable per-backend, then we also look at the lower BackendComponent -// bits and take the highest bit to determine which backend's implementation to -// use. - -enum class BackendComponent : uint8_t { - - // A "backend" is colloquially used to refer to handlers for dispatch - // which actually implement the numerics of an operation in question. - // - // Due to the nature of the enum, these backends are specified in - // an ordered way, but for most backends this order is not semantically - // meaningful (e.g., it's valid to reorder these backends without changing - // semantics). 
The only situation when backend ordering is meaningful - // is when the backend participates in multiple dispatch with another - // backend; e.g., CPU and CUDA (cuda must have higher priority). - - // These keys don't correspond to individual kernels. - // Instead, they represent the backends that are allowed to override specific - // pieces of functionality: - // - dense kernels (e.g. DispatchKey::CPU) - // - sparse kernels (e.g. DispatchKey::SparseCPU) - // - quantized kernels (e.g. DispatchKey::QuantizedCPU) - // - autograd kernels (e.g. DispatchKey::AutogradCPU) - // We reserve space in the runtime operator table for this full cross product - // of - // [backends in this enum] x [keys below that are explicitly marked as having - // per-backend functionality] - - InvalidBit = 0, - CPUBit, - CUDABit, - HIPBit, - XLABit, - MLCBit, - XPUBit, - HPUBit, - VEBit, - LazyBit, - PrivateUse1Bit, - PrivateUse2Bit, - PrivateUse3Bit, - // Define an alias to represent end of backend dispatch keys. - // If you add new backend keys after PrivateUse3, please also update it here. - // (But you shouldn't: private use keys should have higher precedence than - // all built-in keys) - EndOfBackendKeys = PrivateUse3Bit, -}; - // Semantically, a dispatch key identifies a possible "level" in our -// dispatch, for which a handler may be registered. Each handler corresponds -// to a type of functionality. +// dispatch, for which a handler may be registered. Traditional +// backends like CPU and CUDA get dispatch keys; however, so do +// "wrapping" layers like Variable (for autograd handling). // // In implementation terms, the dispatch key identifies a specific "bit" in a // DispatchKeySet. Higher bit indexes get handled by dispatching first (because // we "count leading zeros" when we extract the highest priority dispatch // key.) // -// Note [DispatchKey Classification] -// This enum actually contains several types of keys, which are explained -// in more detail further down: -// (1) non-customizable backends (e.g. FPGA) -// (2) non-customizable functionalities (e.g. Functionalize) -// (3) functionalized that are customizable per backend (e.g. Dense, Sparse, -// AutogradFunctionality) (4) per-backend instances of customizable -// functionalities (e.g. CPU, SparseCPU, AutogradCPU) (5) alias keys (e.g. -// CompositeImplicitAutograd) -// -// Of the categories above, it's important to note: -// (a) which keys are assigned individual bits in a DispatchKeySet -// (b) which keys are assigned individual slots in the runtime operator table -// ("Runtime keys") -// -// (1), (2) and (3) all get their own dedicated bits in the DispatchKeySet. -// (1), (2) and (4) all get their own dedicated slots in the runtime operator -// table. - -// See Note [DispatchKeySet Internal Representation] for more details. -// // NOTE: Keep the list in sync with `DispatchKey` in tools/codegen/model.py -enum class DispatchKey : uint16_t { - +enum class DispatchKey : uint8_t { // ~~~~~~~~~~~~~~~~~~~~~~~~~~ UNDEFINED ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // This is not a "real" functionality, but it exists to give us a "nullopt" + // This is not a "real" tensor id, but it exists to give us a "nullopt" // element we can return for cases when a DispatchKeySet contains no elements. 
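The restored comments above describe the flat representation this patch returns to: `DispatchKey` is a small integer, a `DispatchKeySet` is a 64-bit bitmask with one bit per key, and higher bits are dispatched to first via a leading-zero count. A standalone sketch of that encoding and of picking the highest-priority key; the enum is a toy, and the loop below stands in for the real count-leading-zeros extraction:

```cpp
#include <cstdint>
#include <iostream>

enum class Key : uint8_t { Undefined = 0, CPU, CUDA, SparseCPU, AutogradCPU };

// One bit per key: key n occupies bit (n - 1); Undefined occupies no bit.
uint64_t toBit(Key k) {
  return k == Key::Undefined ? 0 : 1ULL << (static_cast<uint8_t>(k) - 1);
}

// Highest set bit wins; a real implementation would use a leading-zero count.
Key highestPriority(uint64_t set) {
  for (int bit = 63; bit >= 0; --bit) {
    if (set & (1ULL << bit)) return static_cast<Key>(bit + 1);
  }
  return Key::Undefined;
}

int main() {
  const uint64_t ks = toBit(Key::CPU) | toBit(Key::AutogradCPU);
  // AutogradCPU sits above CPU in the enum, so it is dispatched to first.
  std::cout << (highestPriority(ks) == Key::AutogradCPU) << "\n";  // prints 1
  return 0;
}
```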
// You can think a more semantically accurate definition of DispatchKey is: // @@ -116,31 +38,24 @@ enum class DispatchKey : uint16_t { // this will get eliminated, but for now it's convenient) CatchAll = Undefined, - // ~~~~~~~~~~~~~~~~~~~~~~~~~~ Functionality Keys ~~~~~~~~~~~~~~~~~~~~~~ // - // Every value in the enum (up to EndOfFunctionalityKeys) - // corresponds to an individual "functionality" that can be dispatched to. - // This is represented in the DispatchKeySet by assigning each of these enum - // values - // to each of the remaining (64 - len(BackendComponent)) bits. + // ~~~~~~~~~~~~~~~~~~~~~~~~~~ BACKENDS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // + // A "backend" is colloquially used to refer to handlers for dispatch + // which actually implement the numerics of an operation in question. // - // Most of these functionalities have a single handler assigned to them, - // making them "runtime keys". - // That map to a single slot in the runtime operator table. - // - // A few functionalities are allowed to be customizable per backend. - // See [Note: Per-Backend Functionality Dispatch Keys] for details. - - // See [Note: Per-Backend Functionality Dispatch Keys] - Dense, - - // Below are non-extensible backends. - // These are backends that currently don't have their own overrides for - // Autograd/Sparse/Quantized kernels, - // and we therefore don't waste space in the runtime operator table allocating - // space for them. - // If any of these backends ever need to customize, e.g., Autograd, then we'll - // need to add a DispatchKey::*Bit for them. + // Due to the nature of the enum, these backends are specified in + // an ordered way, but for most backends this order is not semantically + // meaningful (e.g., it's valid to reorder these backends without changing + // semantics). The only situation when backend ordering is meaningful + // is when the backend participates in multiple dispatch with another + // backend; e.g., CPU and SparseCPU (sparse must have + // higher priority). + // Here are backends which you think of as traditionally specifying + // how to implement operations on some device. + CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp + CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp + HIP, // NB: I think this is not actually used, due to Note [Masquerading as + // CUDA] FPGA, // Xilinx support lives out of tree at // https://gitlab.com/pytorch-complex/vitis_kernels @@ -152,8 +67,14 @@ enum class DispatchKey : uint16_t { // - aten/src/ATen/test/extension_backend_test.cpp ORT, + XLA, // lives out of tree at https://github.com/pytorch/xla + MLC, // lives out of tree at https://github.com/pytorch/MLCompute Vulkan, Metal, + XPU, // For out of tree Intel's heterogeneous computing plug-in + HPU, // For out of tree & closed source integration of HPU / Habana + VE, // For out of tree & closed source integration of SX-Aurora / NEC + Lazy, // For lazy tensor backends // A meta tensor is a tensor without any data associated with it. (They // have also colloquially been referred to as tensors on the "null" device). @@ -162,8 +83,11 @@ enum class DispatchKey : uint16_t { // tensor with the output shape and dtype, but wouldn't actually add anything. Meta, - // See [Note: Per-Backend Functionality Dispatch Keys] - Quantized, + // Here are backends which specify more specialized operators + // based on the dtype of the tensor. 
+ QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp + QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp + QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in // This backend is to support custom RNGs; it lets you go // to a different kernel if you pass in a generator that is not a @@ -182,29 +106,31 @@ enum class DispatchKey : uint16_t { // the corresponding dense tensors, and must be handled before them. MkldnnCPU, // registered at build/aten/src/ATen/RegisterMkldnnCPU.cpp // NB: not to be confused with MKLDNN, which is Caffe2 only - - // See [Note: Per-Backend Functionality Dispatch Keys] - Sparse, + SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp + SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp + SparseHIP, // TODO: I think this is not actually used, due to Note + // [Masquerading as CUDA] + SparseXPU, // For out of tree Intel's heterogeneous computing plug-in + SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC SparseCsrCPU, SparseCsrCUDA, - // Note [Non-Customizable Backend Keys] - // Every key above here is considered a "non-customizable backend". - // These are backends that will work correctly with autograd, but - // but currently don't require separate implementations - // for autograd sparse or quantized kernels. - // Any new backends that don't need to be customized should go above here. - // If an existing backend needs to e.g. override autograd, then we can - // consider promoting it into the "BackendComponent" enum - // - // For all intents and purposes from the perspective of DispatchKeySet, - // "non-customizable backend" keys are treated the same way - // as other functionality keys - EndOfNonCustomizableBackends = SparseCsrCUDA, - NestedTensor, // lives out of tree at https://github.com/pytorch/nestedtensor + // Here are reserved backends for user-defined backends, see Note [Private use + // DispatchKey] + // To see some example about how to use this, check out ORT + PrivateUse1, + PrivateUse2, + PrivateUse3, + + // Define an alias key to represent end of backend dispatch keys. + // If you add new backend keys after PrivateUse3, please also update it here. + // (But you shouldn't: private use keys should have higher precedence than + // all built-in keys) + EndOfBackendKeys = PrivateUse3, + // In some situations, it is not immediately obvious what the correct // backend for function is, because the function in question doesn't // have any "tensor" arguments. In this case, a BackendSelect function @@ -307,18 +233,20 @@ enum class DispatchKey : uint16_t { // AutogradOther key. We can add specific autograd key for those backends // upon request. AutogradOther, - - // See [Note: Per-Backend Functionality Dispatch Keys] - AutogradFunctionality, - - // NestedTensor is an example of something that isn't a "real backend" - // (because it mostly consists of redispatching kernels) - // but it would like to override autograd functionality in C++. - // We can handle cases like this by adding an extra functionality key - // exclusively for handling autograd for NestedTensor. 
- // lives out of tree at + AutogradCPU, + AutogradCUDA, + AutogradXLA, + AutogradLazy, + AutogradXPU, + AutogradMLC, + AutogradHPU, + AutogradNestedTensor, // lives out of tree at // https://github.com/pytorch/nestedtensor - AutogradNestedTensor, + // Here are some reserved pre-autograd keys for user-defined backends, see + // Note [Private use DispatchKey] + AutogradPrivateUse1, + AutogradPrivateUse2, + AutogradPrivateUse3, Tracer, @@ -371,100 +299,9 @@ enum class DispatchKey : uint16_t { TESTING_ONLY_GenericMode, // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FIN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // - EndOfFunctionalityKeys, // End of functionality keys. - - // ~~~~~~~~~~~~~~ "Dense" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~~ // - // Here are backends which you think of as traditionally specifying - // how to implement operations on some device. - - // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] - StartOfDenseBackends, - CPU, // registered at build/aten/src/ATen/RegisterCPU.cpp - CUDA, // registered at build/aten/src/ATen/RegisterCUDA.cpp - HIP, // NB: I think this is not actually used, due to Note [Masquerading as - // CUDA] - XLA, // lives out of tree at https://github.com/pytorch/xla - MLC, // lives out of tree at https://github.com/pytorch/MLCompute - XPU, // For out of tree Intel's heterogeneous computing plug-in - HPU, // For out of tree & closed source integration of HPU / Habana - VE, // For out of tree & closed source integration of SX-Aurora / NEC - Lazy, // For lazy tensor backends - // Here are reserved backends for user-defined backends, see Note [Private use - // DispatchKey] - // To see some example about how to use this, check out ORT - PrivateUse1, - PrivateUse2, - PrivateUse3, - EndOfDenseBackends = PrivateUse3, - - // ~~~~~~~~~~~~~~ "Quantized" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~ // - // keys starting with an _ are not currently used, - // but are needed to ensure that every backend is indexed correctly. - - // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] - StartOfQuantizedBackends, - QuantizedCPU, // registered at build/aten/src/ATen/RegisterQuantizedCPU.cpp - QuantizedCUDA, // registered at build/aten/src/ATen/RegisterQuantizedCUDA.cpp - _QuantizedHIP, - _QuantizedXLA, - _QuantizedMLC, - QuantizedXPU, // For out of tree Intel's heterogeneous computing plug-in - _QuantizedHPU, - _QuantizedVE, - _QuantizedLazy, - _QuantizedPrivateUse1, - _QuantizedPrivateUse2, - _QuantizedPrivateUse3, - EndOfQuantizedBackends = _QuantizedPrivateUse3, - - // ~~~~~~~~~~~~~~ "Sparse" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~~~ // - // keys starting with an _ are not currently used, - // but are needed to ensure that every backend is indexed correctly. - - // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] 
- StartOfSparseBackends, - SparseCPU, // registered at build/aten/src/ATen/RegisterSparseCPU.cpp - SparseCUDA, // registered at build/aten/src/ATen/RegisterSparseCUDA.cpp - SparseHIP, // TODO: I think this is not actually used, due to Note - // [Masquerading as CUDA] - _SparseXLA, - _SparseMLC, - SparseXPU, // For out of tree Intel's heterogeneous computing plug-in - _SparseHPU, - SparseVE, // For out of tree & closed source integration of SX-Aurora / NEC - _SparseLazy, - _SparsePrivateUse1, - _SparsePrivateUse2, - _SparsePrivateUse3, - EndOfSparseBackends = _SparsePrivateUse3, - - // ~~~~~~~~~~~~~~ "Autograd" Per-Backend Dispatch keys ~~~~~~~~~~~~~~~~~ // - // keys starting with an _ are not currently used, - // but are needed to ensure that every backend is indexed correctly. - - // See Note [The Ordering of Per-Backend Dispatch Keys Matters!] - StartOfAutogradBackends, - AutogradCPU, - AutogradCUDA, - _AutogradHIP, - AutogradXLA, - AutogradMLC, - AutogradXPU, - AutogradHPU, - _AutogradVE, - AutogradLazy, - // Here are some reserved pre-autograd keys for user-defined backends, see - // Note [Private use DispatchKey] - AutogradPrivateUse1, - AutogradPrivateUse2, - AutogradPrivateUse3, - EndOfAutogradBackends = AutogradPrivateUse3, - // If we add a new per-backend functionality key that has higher priority - // than Autograd, then this key should be updated. - EndOfRuntimeBackendKeys = EndOfAutogradBackends, + NumDispatchKeys, // Sentinel, end of runtime keys. // ~~~~~~~~~~~~~~~~~~~~~~ Alias Dispatch Keys ~~~~~~~~~~~~~~~~~~~~~~~~~~ // - // Note [Alias Dispatch Keys] // Alias dispatch keys are synthetic dispatch keys which map to multiple // runtime dispatch keys. Alisa keys have precedence, but they are always // lower precedence than runtime keys. You can register a kernel to an @@ -484,7 +321,6 @@ enum class DispatchKey : uint16_t { // Define an alias key to represent end of alias dispatch keys. // If you add new alias keys after Autograd, please also update it here. - StartOfAliasKeys = Autograd, EndOfAliasKeys = CompositeExplicitAutograd, // // ~~~~~~~~~~~~~~~~~~~~~~~~~ BC ALIASES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // @@ -524,83 +360,54 @@ enum class DispatchKey : uint16_t { // built-in autograd formulas for operators are not appropriate. static_assert( - (static_cast(BackendComponent::EndOfBackendKeys) + - static_cast(DispatchKey::EndOfFunctionalityKeys)) <= 64, - "The BackendComponent and DispatchKey enums (below EndOfFunctionalityKeys)" - " both map to backend and functionality bits" - " into a 64-bit bitmask; you must have less than 64 total entries between them"); - -// Check if a DispatchKey is an alias mapping to other runtime keys. -constexpr bool isAliasDispatchKey(DispatchKey k) { - return k >= DispatchKey::StartOfAliasKeys && k <= DispatchKey::EndOfAliasKeys; -} - -// [Note: Per-Backend Functionality Dispatch Keys] -// Check if a DispatchKey is a per-backend functionality key -// Any functionalities that can be customized per-backend should be added here. -// These keys correspond to functionalities that can be customized indivually -// per backend. While they only take up one bit in the `DispatchKeySet` bitset, -// they map to (# backends) slots in the operator table. -// Each of these keys also has a separate set of "runtime keys" in the dispatch -// key enum, per backend, which *do* map to the individual operator table slots. 
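For context on what is being deleted here: the per-backend scheme sized the runtime operator table as the number of functionality keys plus, for each per-backend functionality, one extra slot per additional backend (`num_functionality_keys + numPerBackendFunctionalityKeys() * (num_backends - 1)` further down in this file). A toy calculation of that formula with invented counts, purely to show how the table grows under that design:

```cpp
#include <iostream>

int main() {
  // The counts below are invented for illustration, not the real enum sizes.
  const int num_functionality_keys = 40;           // Dense, Sparse, Autograd, ...
  const int num_per_backend_functionalities = 4;   // e.g. Dense, Quantized, Sparse, AutogradFunctionality
  const int num_backends = 13;

  // Each per-backend functionality already counts once in num_functionality_keys,
  // so it only adds (num_backends - 1) extra slots.
  const int num_runtime_entries =
      num_functionality_keys + num_per_backend_functionalities * (num_backends - 1);

  std::cout << "runtime table slots (per-backend scheme): " << num_runtime_entries << "\n";  // 88
  std::cout << "runtime table slots (flat enum, this patch): one per DispatchKey value\n";
  return 0;
}
```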
-// For example, the "Sparse" key maps to an individual bit in the -// DispatchKeySet, while `SparseCPU`, `SparseCUDA`, etc all map to individual -// slots in the runtime operator table. - -constexpr bool isPerBackendFunctionalityKey(DispatchKey k) { - if (k == DispatchKey::Dense || k == DispatchKey::Quantized || - k == DispatchKey::Sparse || k == DispatchKey::AutogradFunctionality) { - return true; - } else { - return false; - } -} - -// Note that this includes Undefined in the total count. -// BUT EndOfFunctionalityKeys is its own (placeholder) key. -// e.g. Undefined=0, Dense=1, Sparse=2, EndOfFunctionalityKeys=3. -// In the above example, there are 3 total functionality keys. -constexpr uint8_t num_functionality_keys = - static_cast(DispatchKey::EndOfFunctionalityKeys); - -// Note [No More Than 16 Backends] -// Search for this note to find places in the code where the "no more than 16 -// backends" invariant is baked in. -static_assert( - static_cast(BackendComponent::EndOfBackendKeys) <= 16, - "BackendComponent currently only supports <= 16 backends. If we really need to extend this, \ -there are a few places where this invariant is baked in"); - -constexpr uint8_t numPerBackendFunctionalityKeys() { - uint8_t count = 0; - for (uint8_t k = 0; k <= num_functionality_keys; ++k) { - if (isPerBackendFunctionalityKey(static_cast(k))) - ++count; - } - return count; -} + static_cast(DispatchKey::NumDispatchKeys) < 64, + "DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries"); #if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) -// See [Note: Trimmed Mobile Dispatch Keys] -constexpr uint8_t num_backends = 1; // Only CPU -constexpr uint16_t num_runtime_entries = 8; +/** + * The method below maps the dispatch key in the enum DispatchKey to an + * integer index in the dispatchTable_ array in OperatorEntry. The array + * is trimmed for mobile to reduce peak memory usage since it's + * unnecessary to reserve additional space for dispatch keys that will + * never be used on mobile. + */ +C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) { + switch (dk) { + case DispatchKey::Undefined: + return 0; + case DispatchKey::CPU: + return 1; + case DispatchKey::QuantizedCPU: + return 2; + case DispatchKey::SparseCPU: + return 3; + case DispatchKey::BackendSelect: + return 4; + case DispatchKey::ADInplaceOrView: + return 5; + case DispatchKey::AutogradOther: + return 6; + case DispatchKey::AutogradCPU: + return 7; + case DispatchKey::NumDispatchKeys: // Sentinel, end of runtime keys. + return 8; + default: + return -1; + } +} #else -constexpr uint8_t num_backends = - static_cast(BackendComponent::EndOfBackendKeys); -constexpr uint16_t num_runtime_entries = num_functionality_keys + - (numPerBackendFunctionalityKeys() * (num_backends - 1)); +/** + * For the server use-case, make this a simple pass-through. + */ +C10_API constexpr int getDispatchTableIndexForDispatchKey(DispatchKey dk) { + return static_cast(dk); +} #endif -// See Note [No More Than 16 Backends] -constexpr uint16_t full_backend_mask = - (static_cast(1) << num_backends) - 1; - C10_API const char* toString(DispatchKey); -C10_API const char* toString(BackendComponent); C10_API std::ostream& operator<<(std::ostream&, DispatchKey); -C10_API std::ostream& operator<<(std::ostream&, BackendComponent); -C10_API DispatchKey getAutogradKeyFromBackend(BackendComponent k); +C10_API DispatchKey getAutogradKeyFromBackend(DispatchKey t); // Parses a string into a dispatch key. 
// If the string cannot be correctly parsed, throws an exception. @@ -613,86 +420,10 @@ C10_API c10::DispatchKey parseDispatchKey(const std::string& k); // torch::dispatch(torch::kCPU, ...) is also valid. constexpr DispatchKey kAutograd = DispatchKey::Autograd; -// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] -// This function relies on the invariant that the dispatch keys between -// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend -// in the same order as `BackendComponent`. -constexpr BackendComponent toBackendComponent(DispatchKey k) { - if (k >= DispatchKey::StartOfDenseBackends && - k <= DispatchKey::EndOfDenseBackends) { - return static_cast( - static_cast(k) - - static_cast(DispatchKey::StartOfDenseBackends)); - } else if ( - k >= DispatchKey::StartOfQuantizedBackends && - k <= DispatchKey::EndOfQuantizedBackends) { - return static_cast( - static_cast(k) - - static_cast(DispatchKey::StartOfQuantizedBackends)); - } else if ( - k >= DispatchKey::StartOfSparseBackends && - k <= DispatchKey::EndOfSparseBackends) { - return static_cast( - static_cast(k) - - static_cast(DispatchKey::StartOfSparseBackends)); - } else if ( - k >= DispatchKey::StartOfAutogradBackends && - k <= DispatchKey::EndOfAutogradBackends) { - return static_cast( - static_cast(k) - - static_cast(DispatchKey::StartOfAutogradBackends)); - } else { - return BackendComponent::InvalidBit; - } +// Check if a DispatchKey is an alias mapping to other runtime keys. +inline bool isAliasDispatchKey(DispatchKey k) { + return k > DispatchKey::NumDispatchKeys && k <= DispatchKey::EndOfAliasKeys; } - -constexpr DispatchKey toFunctionalityKey(DispatchKey k) { - if (k <= DispatchKey::EndOfFunctionalityKeys) { - return k; - } else if (k <= DispatchKey::EndOfDenseBackends) { - return DispatchKey::Dense; - } else if (k <= DispatchKey::EndOfQuantizedBackends) { - return DispatchKey::Quantized; - } else if (k <= DispatchKey::EndOfSparseBackends) { - return DispatchKey::Sparse; - } else if (k <= DispatchKey::EndOfAutogradBackends) { - return DispatchKey::AutogradFunctionality; - } else { - return DispatchKey::Undefined; - } -} - -// Given (DispatchKey::Dense, DispatchKey::CUDABit), returns DispatchKey::CUDA -// See Note [The Ordering of Per-Backend Dispatch Keys Matters!] -// This function relies on the invariant that the dispatch keys between -// StartOfDenseBackends and EndOfRuntimeBackendKeys are ordered by backend -// in the same order as `BackendComponent`. 
-constexpr DispatchKey toRuntimePerBackendFunctionalityKey( - DispatchKey functionality_k, - BackendComponent backend_k) { - if (functionality_k == DispatchKey::Dense) { - return static_cast( - static_cast(DispatchKey::StartOfDenseBackends) + - static_cast(backend_k)); - } - if (functionality_k == DispatchKey::Sparse) { - return static_cast( - static_cast(DispatchKey::StartOfSparseBackends) + - static_cast(backend_k)); - } - if (functionality_k == DispatchKey::Quantized) { - return static_cast( - static_cast(DispatchKey::StartOfQuantizedBackends) + - static_cast(backend_k)); - } - if (functionality_k == DispatchKey::AutogradFunctionality) { - return static_cast( - static_cast(DispatchKey::StartOfAutogradBackends) + - static_cast(backend_k)); - } - return DispatchKey::Undefined; -} - } // namespace c10 namespace torch { diff --git a/c10/core/DispatchKeySet.cpp b/c10/core/DispatchKeySet.cpp index d5c11c02399..7f85567f886 100644 --- a/c10/core/DispatchKeySet.cpp +++ b/c10/core/DispatchKeySet.cpp @@ -1,29 +1,37 @@ #include -#include namespace c10 { -// backend_dispatch_keyset includes all dispatch keys that map to backends. +// backend_dispatch_keyset should include all runtime backend keys. // Alias key DispatchKey::CompositeExplicitAutograd maps to -// backend_dispatch_keyset -constexpr DispatchKeySet backend_dispatch_keyset = - autogradother_backends | DispatchKeySet(DispatchKey::Dense); +// backend_dispatch_keyset NestedTensor has been explicitly removed due to +// incompatibility with some kernels, such as structured kernels, that use the +// DefaultBackend key. +constexpr DispatchKeySet backend_dispatch_keyset = autogradother_backends | + DispatchKeySet({ + DispatchKey::CPU, + DispatchKey::CUDA, + DispatchKey::XLA, + DispatchKey::Lazy, + DispatchKey::XPU, + DispatchKey::PrivateUse1, + DispatchKey::PrivateUse2, + DispatchKey::PrivateUse3, + DispatchKey::MLC, + DispatchKey::HPU, + DispatchKey::ORT, + DispatchKey::Meta, + }); bool isBackendDispatchKey(DispatchKey t) { return t != DispatchKey::Undefined // See Note [No Alias Keys in DispatchKeySet] - && !isAliasDispatchKey(t) - // Note [NestedTensor Not Included in Backend Keys] - // NestedTensor has been explicitly removed from the "backend keyset" due - // to incompatibility with some kernels, so we don't want it to be - // included in CompositeImplicitAutograd or CompositeExplicitAutograd - // kernels. - && t != DispatchKey::NestedTensor && backend_dispatch_keyset.has(t); + && !isAliasDispatchKey(t) && backend_dispatch_keyset.has(t); } // math_dispatch_keyset contains all keys in backend_dispatch_keyset and // autograd_dispatch_keyset Alias key DispatchKey::CompositeImplicitAutograd -// maps to [math_dispatch_keyset x full_backend_mask] +// maps to math_dispatch_keyset. constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset | autograd_dispatch_keyset; @@ -31,12 +39,7 @@ DispatchKeySet getRuntimeDispatchKeySet(DispatchKey t) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { case DispatchKey::Autograd: - // See Note [autograd_dispatch_keyset Does Not Include Backend Bits] - // That's why we OR it with a mask of the backend bits here. - // getRuntimeDispatchKeySet() expects to return a keyset of runtime - // dispatch keys, like AutogradCPU, but that requires having backend bits. 
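The `getRuntimeDispatchKeySet` hunk above maps alias keys back to plain keysets: `Autograd` to the autograd keyset and `CompositeImplicitAutograd` to the union of backend and autograd keysets, with no extra backend-bit masking. A self-contained sketch of that expansion using a toy enum and hand-built bitmasks; the key lists are invented, not the real keysets:

```cpp
#include <cstdint>
#include <iostream>

enum class Key : uint8_t { Undefined, CPU, CUDA, AutogradCPU, AutogradCUDA, NumKeys,
                           Autograd /* alias */, CompositeImplicitAutograd /* alias */ };

uint64_t bit(Key k) { return 1ULL << (static_cast<uint8_t>(k) - 1); }

// Alias keys are never stored in a set themselves; they expand to the
// runtime keys they cover.
uint64_t runtimeKeySet(Key alias) {
  const uint64_t backend_keys  = bit(Key::CPU) | bit(Key::CUDA);
  const uint64_t autograd_keys = bit(Key::AutogradCPU) | bit(Key::AutogradCUDA);
  switch (alias) {
    case Key::Autograd:                  return autograd_keys;
    case Key::CompositeImplicitAutograd: return backend_keys | autograd_keys;  // "math" keyset
    default:                             return bit(alias);  // a runtime key maps to itself
  }
}

int main() {
  std::cout << std::hex << runtimeKeySet(Key::Autograd) << "\n";                   // c
  std::cout << std::hex << runtimeKeySet(Key::CompositeImplicitAutograd) << "\n";  // f
  return 0;
}
```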
- return autograd_dispatch_keyset | - DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); + return autograd_dispatch_keyset; case DispatchKey::CompositeImplicitAutograd: return math_dispatch_keyset; case DispatchKey::CompositeExplicitAutograd: @@ -50,13 +53,11 @@ bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k) { TORCH_INTERNAL_ASSERT(t != DispatchKey::Undefined); switch (t) { case DispatchKey::Autograd: - return autograd_dispatch_keyset.has(toFunctionalityKey(k)); + return autograd_dispatch_keyset.has(k); case DispatchKey::CompositeImplicitAutograd: - // See Note [NestedTensor Not Included in Backend Keys] - return k != DispatchKey::NestedTensor && math_dispatch_keyset.has(k); + return math_dispatch_keyset.has(k); case DispatchKey::CompositeExplicitAutograd: - // See Note [NestedTensor Not Included in Backend Keys] - return k != DispatchKey::NestedTensor && backend_dispatch_keyset.has(k); + return backend_dispatch_keyset.has(k); default: return t == k; } @@ -78,6 +79,8 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) { return DispatchKeySet(DispatchKey::MLC); case DispatchKey::AutogradHPU: return DispatchKeySet(DispatchKey::HPU); + case DispatchKey::AutogradNestedTensor: + return DispatchKeySet(DispatchKey::NestedTensor); case DispatchKey::AutogradXPU: return DispatchKeySet(DispatchKey::XPU); case DispatchKey::AutogradPrivateUse1: @@ -93,6 +96,23 @@ DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t) { } } +DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t) { + switch (t) { + case DispatchKey::CPU: + return DispatchKeySet(DispatchKey::AutocastCPU); + case DispatchKey::CUDA: + case DispatchKey::XLA: + return DispatchKeySet(DispatchKey::AutocastCUDA); + default: + return DispatchKeySet(); + } +} + +DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t) { + return DispatchKeySet( + {DispatchKey::ADInplaceOrView, getAutogradKeyFromBackend(t)}); +} + bool isIncludedInAlias(DispatchKey k, DispatchKey alias) { return k != DispatchKey::Undefined && runtimeDispatchKeySetHas(alias, k); } @@ -109,167 +129,18 @@ std::ostream& operator<<(std::ostream& os, DispatchKeySet ts) { return os; } os << "DispatchKeySet("; + DispatchKey tid; bool first = true; - for (auto k : ts) { + while ((tid = ts.highestPriorityTypeId()) != DispatchKey::Undefined) { if (!first) { os << ", "; } - os << k; + os << tid; + ts = ts.remove(tid); first = false; } os << ")"; return os; } -DispatchKeySet::iterator& DispatchKeySet::iterator::operator++() { - TORCH_INTERNAL_ASSERT(next_functionality_ >= num_backends); - TORCH_INTERNAL_ASSERT(next_functionality_ <= iterator::end_iter_mask_val); - TORCH_INTERNAL_ASSERT(next_backend_ <= num_backends); - - // Create a masked version of the set representation to ignore previous - // keys that we've iterated through. 
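The restored `operator<<` just above prints a keyset by repeatedly taking `highestPriorityTypeId()` and then `remove()`-ing it until the set is empty. The same peel-and-remove loop, written against the toy bitmask encoding from the earlier sketch:

```cpp
#include <cstdint>
#include <iostream>

enum class Key : uint8_t { Undefined, CPU, CUDA, AutogradCPU };

uint64_t bit(Key k) { return k == Key::Undefined ? 0 : 1ULL << (static_cast<uint8_t>(k) - 1); }

Key highestPriority(uint64_t set) {
  for (int b = 63; b >= 0; --b)
    if (set & (1ULL << b)) return static_cast<Key>(b + 1);
  return Key::Undefined;
}

const char* toString(Key k) {
  switch (k) {
    case Key::CPU:         return "CPU";
    case Key::CUDA:        return "CUDA";
    case Key::AutogradCPU: return "AutogradCPU";
    default:               return "Undefined";
  }
}

int main() {
  uint64_t ks = bit(Key::CPU) | bit(Key::AutogradCPU);
  Key k;
  bool first = true;
  std::cout << "DispatchKeySet(";
  while ((k = highestPriority(ks)) != Key::Undefined) {
    if (!first) std::cout << ", ";
    std::cout << toString(k);
    ks &= ~bit(k);  // remove(k)
    first = false;
  }
  std::cout << ")\n";  // DispatchKeySet(AutogradCPU, CPU)
  return 0;
}
```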
- uint64_t masked_functionality_bits = - llvm::maskTrailingZeros(next_functionality_) & *data_ptr_; - uint64_t masked_backend_bits = - llvm::maskTrailingZeros(next_backend_) & full_backend_mask & - *data_ptr_; - - uint64_t first_functionality_idx = - llvm::findFirstSet(masked_functionality_bits); - uint64_t first_backendcomponent_idx = llvm::findFirstSet(masked_backend_bits); - - // If there are no keys, set to end iterator value - if (first_functionality_idx == std::numeric_limits::max() || - next_functionality_ == iterator::end_iter_mask_val) { - // Set up state to be the same as end() - next_functionality_ = iterator::end_iter_mask_val; - current_dispatchkey_idx_ = iterator::end_iter_key_val; - next_backend_ = 0; - current_backendcomponent_idx_ = iterator::end_iter_key_val; - return *this; - } - - // The +1 is because of DispatchKey::Undefined and - // BackendComponent::InvalidBit - auto new_next_functionality = first_functionality_idx + 1; - auto new_backendcomponent_idx = first_backendcomponent_idx + 1; - // and the -num_backends is because the first bits in the - // keyset are not Dispatch Keys. - auto next_dispatchkey_idx = new_next_functionality - num_backends; - - // If the current functionality bit is a per-backend bit, we need special - // handling - if (isPerBackendFunctionalityKey( - static_cast(next_dispatchkey_idx))) { - // case 1: if the current backend is undefined, then there is no valid - // backend instance of this functionality key so we can skip it. - if (first_backendcomponent_idx == std::numeric_limits::max()) { - // increment the functionality mask so we skip the current functionality - // bit on the next increment. - next_functionality_ = new_next_functionality; - ++(*this); - return *this; - } - - // Otherwise, at this point we know what the current backend and - // functionality bits are. - current_dispatchkey_idx_ = next_dispatchkey_idx; - current_backendcomponent_idx_ = new_backendcomponent_idx; - - // Next, we need to set up the masks for the next increment. - uint64_t next_backendcomponent_bits = - llvm::maskTrailingZeros(first_backendcomponent_idx + 1) & - full_backend_mask & *data_ptr_; - uint64_t next_backendcomponent_idx = - llvm::findFirstSet(next_backendcomponent_bits); - if (next_backendcomponent_idx == std::numeric_limits::max()) { - // case 2: the current backend is valid, but there is not another backend - // in the keyset. In this case, we need to bump the functionality mask and - // reset the backend mask for the next increment - next_functionality_ = new_next_functionality; - next_backend_ = 0; - } else { - // case 3: we have another backend to iterate over. We want to iterate - // over the same functionality bit next time, but a different backend bit. - next_backend_ = first_backendcomponent_idx + 1; - } - } else { - // Functionality bits that aren't per backend are simpler to handle. We can - // ignore the backend bits. - TORCH_INTERNAL_ASSERT(next_backend_ == 0); - current_dispatchkey_idx_ = next_dispatchkey_idx; - next_functionality_ = new_next_functionality; - } - return *this; -} - -std::array -initializeFunctionalityOffsetsAndMasks() { - std::array - offsets_and_masks; - // manualy set the first entry, which corresponds to Undefined. - offsets_and_masks[0] = FunctionalityOffsetAndMask(0, 0); - // loop through every functionality key (aside from Undefined). - for (const auto functionality_idx : c10::irange(1, num_functionality_keys)) { - // functionality_idx should be Dense -> 1, ... 
- auto prev_offset_and_mask = offsets_and_masks[functionality_idx - 1]; - auto k = static_cast(functionality_idx); - -#if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) - // [Note: Trimmed Mobile Dispatch Keys] - uint16_t mask = 0; - uint16_t offset = 0; - switch (k) { - case DispatchKey::Undefined: - offset = 0; - case DispatchKey::CPU: - offset = 1; - case DispatchKey::QuantizedCPU: - offset = 2; - case DispatchKey::SparseCPU: - offset = 3; - case DispatchKey::BackendSelect: - offset = 4; - case DispatchKey::ADInplaceOrView: - offset = 5; - case DispatchKey::AutogradOther: - offset = 6; - case DispatchKey::AutogradCPU: - offset = 7; - default: - // All other keys which are unsupported on mobile will get sent - // to the undefined kernel, causing them to error. - offset = 0; - } - offsets_and_masks[functionality_idx] = - FunctionalityOffsetAndMask(offset, 0); - } -#else - // If the previous functionality was not per-backend, then we can just - // increment the previous offset. Otherwise, the next offset = - // previous_offset + num_backends. - auto next_offset = prev_offset_and_mask.offset + - (prev_offset_and_mask.mask == 0 ? 1 : num_backends); - // the mask is used in the runtime index calculation to find the offset of - // the backend. For non-per-backend functionalities, this offset should - // always be 0. Otherwise, we need to get the index of the backend (which we - // can do using a backend mask). - auto next_mask = isPerBackendFunctionalityKey(k) ? full_backend_mask : 0; - offsets_and_masks[functionality_idx] = - FunctionalityOffsetAndMask(next_offset, next_mask); - } - // Sanity check that the computed offset index of the last functionality key - // is correct. This assumes that the highest priority functionality key is not - // per backend. - TORCH_INTERNAL_ASSERT( - offsets_and_masks[num_functionality_keys - 1].offset == - (num_runtime_entries - 1), - "num_runtime_entries: ", - num_runtime_entries, - "last_offset: ", - offsets_and_masks[num_functionality_keys - 1].offset); -#endif - return offsets_and_masks; -} - } // namespace c10 diff --git a/c10/core/DispatchKeySet.h b/c10/core/DispatchKeySet.h index 1834ca0aa96..79d39652219 100644 --- a/c10/core/DispatchKeySet.h +++ b/c10/core/DispatchKeySet.h @@ -1,4 +1,5 @@ #pragma once + #include #include #include @@ -7,147 +8,29 @@ namespace c10 { -struct FunctionalityOffsetAndMask { - // empty constructor shouldn't be used; only needed to initialize - // the array before populating it. - FunctionalityOffsetAndMask() {} - FunctionalityOffsetAndMask(uint16_t offset, uint16_t mask) - : offset(offset), mask(mask) {} - // This needs to big enough to cover the size of the operator table. - uint16_t offset; - // See Note [No More Than 16 Backends] - // This mask needs to be big enough to mask all of the backend bits. - // We probably don't ever want to have more than 16 backend bits, so uint16_t - // should be enough. - uint16_t mask; -}; -static_assert( - c10::num_runtime_entries < 65536, - "The dispatcher currently only supports up to 2^16 runtime entries"); - -C10_API std::array -initializeFunctionalityOffsetsAndMasks(); - -C10_ALWAYS_INLINE static const std:: - array& - offsetsAndMasks() { - static auto offsets_and_masks_ = initializeFunctionalityOffsetsAndMasks(); - return offsets_and_masks_; -} - -// A representation of a set of DispatchKeys. A DispatchKeySet contains both -// "functionality" bits and "backend bits", and every tensor holds its own -// DispatchKeySet. 
The Dispatcher implements multiple dispatch by grabbing the -// keyset on every input tensor, or’ing them together, and dispatching to a -// specific piece of functionality. The functionality bits are *ordered*. When -// multiple functionality bits are set, we use the highest priority -// functionality. Similarly, multiple backend bits can theoretically be set if -// you call an operator with multiple tensors from difference devices (e.g. CPU -// and CUDA), although support for mixed device dispatch is limited (the only -// kernels that gracefully handle mixed device inputs for now are cuda kernels -// that take in a scalar cpu tensor). - // A representation of a set of DispatchKeys. A tensor may have multiple // tensor type ids, e.g., a Variable tensor can also be a CPU tensor; the // DispatchKeySet specifies what type ids apply. The internal representation is // as a 64-bit bit set (this means only 64 tensor type ids are supported). // -// As mentioned above, DispatchKeys are ordered; thus, we can ask questions like -// "what is the highest priority DispatchKey in the set"? (The set itself is -// not ordered; two sets with the same ids will always have the ids ordered in -// the same way.) +// Note that DispatchKeys are ordered; thus, we can ask questions like "what is +// the highest priority DispatchKey in the set"? (The set itself is not +// ordered; two sets with the same ids will always have the ids ordered in the +// same way.) // -// Note [DispatchKeySet Internal Representation] -// Internally, dispatch keys are packed into 64-bit DispatchKeySet objects -// that get passed around at runtime. -// However, there isn't necessarily a 1-to-1 mapping between bits in the keyset -// and individual dispatch keys. +// At the moment, there are no nontrivial uses of this set; tensors are always +// singletons. In the near future, this set will represent variable? + tensor +// type id. In the far future, it will be requires grad? + profiling? + +// tracing? + lazy? + tensor type id. // -// First: why do we have this distinction, and why not map every dispatch key -// directly to a bit? This is mostly because we have several types of -// functionalities that different backends would like to customize. For example, -// we have: -// - "Dense": CPU, CUDA, XLA, ... (~12 keys) -// - "Sparse": SparseCPU, SparseCUDA, ... -// - "Quantized": QuantizedCPU, QuantizedCUDA, QuantizedXLA, ... -// - "Autograd": AutogradCPU, AutogradCUDA, Autograd XLA, ... -// The problem is that total number of keys grows quadratically with [# -// backends] x [# functionalities], making it very difficult to map each key -// directly to a bit in a bitset without dramatically increasing the size of the -// bitset over time. +// (The difference between variable and requires grad, is that +// there are currently three states a tensor can be: +// 1. Not a variable +// 2. Variable with requires_grad=False +// 3. Variable with requires_grad=True +// Eventually, we want to kill state (1), and only dispatch to autograd +// handling code if one of the inputs requires grad.) // -// The two enums (BackendComponent and DispatchKey) can be divided roughly into -// 5 categories. -// -// (1) "Building block" keys -// (a) backends: jEverything in the BackendComponent enum (e.g. CPUBit, -// CUDABIt) (b) functionalities: (per-backend) functionality-bit DispatchKeys -// (e.g. AutogradFunctionality, Sparse, Dense) -// (2) "Runtime" keys -// (a) "non-customizable backends" (e.g. FPGA) -// (b) "non-customizable functionalities" (e.g. 
Functionalize) -// (c) "per-backend instances of customizable functionalities" (e.g. CPU, -// SparseCPU, AutogradCPU) -// (3) "Alias" DispatchKeys (see Note [Alias Dispatch Keys]) -// -// (1) Building block keys always correspond to individual bits in a -// DispatchKeySet. They can also be combined in a DispatchKeySet to form actual -// runtime keys. e.g. -// auto dense_cpu_ks = DispatchKeySet({DispatchKey::CPUBit, -// DispatchKey::Dense}); -// // The keyset has the runtime dense-cpu key. -// dense_cpu_ks.has(DispatchKey::CPU); -// // And it contains the building block keys too. -// dense_cpu_ks.has(DispatchKey::CPUBit); -// dense_cpu_ks.has(DispatchKey::Dense); -// -// Not every backend and not every functionality counts as a "building block -// key". This is mostly to give us more levers to pull in the design space. -// Backend keys and functionality keys that count as "building blocks" will -// contribute to a full cross product of functionality that can be overriden. -// -// For example, right now we have at least 12 "backend" building blocks (CPU, -// CUDA, XLA, ...) and at least 4 "functionality" building blocks (Dense, -// Sparse, Quantized, AutogradFunctionality, ...). These keys together allow -// every dispatcher operator to be customized in up to 12*4 different ways. Each -// of those requires a slot in the operator table of every dispatcher operator. -// Not every piece of functionality necessarily needs to be customizeable -// per-backend, and not every backend necessarily needs to be able to customize -// every type of functionality. -// -// -// (2) Every runtime key corresponds directly to a slot in an operator's runtime -// dispatch table, and you can directly register kernels to a runtime dispatch -// key. -// -// For per-backend functionalities like "Dense" or "AutogradFunctionality", -// you can think of the corresponding runtime dispatch keys as "instances" of -// that functionality, per backend. E.g. "CPU", "CUDA", "XLA", etc. are all -// runtime instances of the "Dense" building block key. - -// (2a) and (2b) are represented identically in the DispatchKeySet logic: -// - backend-agnostic functionalities (e.g. FuncTorchBatched) are NOT -// customizeable per backend. -// In order to do so, we'd need to promote it to a per-backend functionality -// "building block" key. -// - non-customizeable backends (e.g. FPGA) can NOT customize existing -// functionality like Sparse, Autograd, etc. -// In order to do so, we'd need to promote it to a backend "building block" -// key. -// -// In both cases, these keys directly correspond to runtime slots in the -// operator table. -// -// -// (3) "Alias" keys -// See Note [Alias Dispatch Keys] -// -// Final note: for anyone making future changes to the Dispatcher + -// DispatchKeySet internals, there's a closed PR with a basic -// python-implementation of the Dispatcher that might be useful in quickly -// testing out and validating changes. See it at -// https://github.com/pytorch/pytorch/pull/68743 - // An undefined tensor is one with an empty tensor type set. class DispatchKeySet final { public: @@ -158,146 +41,29 @@ class DispatchKeySet final { // NB: default constructor representation as zero is MANDATORY as // use of DispatchKeySet in TLS requires this. constexpr DispatchKeySet() : repr_(0) {} - constexpr DispatchKeySet(Full) - : repr_((1ULL << (num_backends + num_functionality_keys - 1)) - 1) {} - + : repr_(std::numeric_limits::max()) {} constexpr DispatchKeySet(FullAfter, DispatchKey t) // LSB after t are OK, but not t itself. 
- // "functionalities" have a notion of ordering (e.g. Autograd > Sparse > - // Quantized > Dense). But backends don't really have an ordering. - // Therefore, we're enforcing that FullAfter can only be used on - // "functionality" keys. - : repr_( - (1ULL - << (num_backends + static_cast(toFunctionalityKey(t)) - - 1)) - - 1) {} - + : repr_((1ULL << (static_cast(t) - 1)) - 1) {} // Public version of DispatchKeySet(uint64_t) API; external users // must be explicit when they do this! constexpr DispatchKeySet(Raw, uint64_t x) : repr_(x) {} - - constexpr explicit DispatchKeySet(BackendComponent k) { - if (k == BackendComponent::InvalidBit) { - repr_ = 0; - } else { - repr_ = 1ULL << (static_cast(k) - 1); - } - } - - constexpr explicit DispatchKeySet(DispatchKey k) { - if (k == DispatchKey::Undefined) { - // Case 1: handle Undefined specifically - repr_ = 0; - } else if (k <= DispatchKey::EndOfFunctionalityKeys) { - // Case 2: handle "functionality-only" keys - // These keys have a functionality bit set, but no backend bits - // These can technically be either: - // - valid runtime keys (e.g. DispatchKey::AutogradOther, - // DispatchKey::FuncTorchBatched, etc) - // - "building block" keys that aren't actual runtime keys (e.g. - // DispatchKey::Dense or Sparse) - uint64_t functionality_val = 1ULL - << (num_backends + static_cast(k) - 1); - repr_ = functionality_val; - } else if (k <= DispatchKey::EndOfRuntimeBackendKeys) { - // Case 3: "runtime" keys that have a functionality bit AND a backend bit. - // First compute which bit to flip for the functionality. - auto functionality_k = toFunctionalityKey(k); - // The - 1 is because Undefined is technically a "functionality" that - // doesn't show up in the bitset. So e.g. Dense is technically the second - // functionality, but the lowest functionality bit. - uint64_t functionality_val = 1ULL - << (num_backends + static_cast(functionality_k) - 1); - - // then compute which bit to flip for the backend - // Case 4a: handle the runtime instances of "per-backend functionality" - // keys For example, given DispatchKey::CPU, we should set: - // - the Dense functionality bit - // - the CPUBit backend bit - // first compute which bit to flip for the backend - auto backend_k = toBackendComponent(k); - uint64_t backend_val = backend_k == BackendComponent::InvalidBit - ? 0 - : 1ULL << (static_cast(backend_k) - 1); - repr_ = functionality_val + backend_val; - } else { - // At this point, we should have covered every case except for alias keys. - // Technically it would be possible to add alias dispatch keys to a - // DispatchKeySet, but the semantics are a little confusing and this - // currently isn't needed anywhere. - repr_ = 0; - } - } - - constexpr uint64_t keys_to_repr(std::initializer_list ks) { - uint64_t repr = 0; - for (auto k : ks) { - repr |= DispatchKeySet(k).repr_; - } - return repr; - } - - constexpr uint64_t backend_bits_to_repr( - std::initializer_list ks) { - uint64_t repr = 0; - for (auto k : ks) { - repr |= DispatchKeySet(k).repr_; - } - return repr; - } - + explicit constexpr DispatchKeySet(DispatchKey t) + : repr_( + t == DispatchKey::Undefined + ? 0 + : 1ULL << (static_cast(t) - 1)) {} explicit constexpr DispatchKeySet(std::initializer_list ks) - : repr_(keys_to_repr(ks)) {} - - explicit constexpr DispatchKeySet(std::initializer_list ks) - // Note: for some reason, putting this logic directly in the constructor - // appears to fail to compile on CUDA 10.1. 
- // See an example internal failure at - // https://www.internalfb.com/intern/skycastle/run/76561193669136035/artifact/actionlog.76561193742069401.stderr - : repr_(backend_bits_to_repr(ks)) {} - + : repr_(0) { + for (auto k : ks) { + repr_ |= DispatchKeySet(k).repr_; + } + } // Test if a DispatchKey is in the set - inline bool has(DispatchKey t) const { + bool inline has(DispatchKey t) const { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t != DispatchKey::Undefined); - return has_all(DispatchKeySet(t)); - } - constexpr bool has_backend(BackendComponent t) const { - return has_all(DispatchKeySet(t)); - } - - // Test if a DispatchKey is in the set - // Given a DispatchKeySet of functionality keys and (potentially) backend - // keys, tests if all of them are in the current set. - constexpr bool has_all(DispatchKeySet ks) const { - return static_cast((repr_ & ks.repr_) == ks.repr_); - } - - // Given a DispatchKeySet of functionality keys and (potentially) backend - // keys, tests if any of them are in the current set. This could technically - // be pretty easily implemented using has(). It is strictly a perf - // optimization though. There are many places in the code base where we want - // to test for multiple functionality keys together. HOWEVER, runtime - // per-backend functionality keys aren't allowed to be used with this - // function, because you can end up with weird results. e.g. - // DispatchKeySet(DispatchKey::AutogradCPU).has_any(DispatchKeySet(DispatchKey::CPU)) - // would return true. - inline bool has_any(DispatchKeySet ks) const { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - // Either there are no backend bits in the input keyset - ((ks.repr_ & full_backend_mask) == 0) || - // or there are no per-backend-functionality bits - // See [Note: Per-Backend Functionality Dispatch Keys] - ((ks & - DispatchKeySet({ - DispatchKey::Dense, - DispatchKey::Quantized, - DispatchKey::Sparse, - DispatchKey::AutogradFunctionality, - }) - .repr_) == 0)); - return static_cast((repr_ & ks.repr_) != 0); + return static_cast(repr_ & DispatchKeySet(t).repr_); } // Test if DispatchKeySet is a superset of ks. bool isSupersetOf(DispatchKeySet ks) const { @@ -308,64 +74,31 @@ class DispatchKeySet final { return DispatchKeySet(repr_ | other.repr_); } // Perform set intersection - constexpr DispatchKeySet operator&(DispatchKeySet other) const { + DispatchKeySet operator&(DispatchKeySet other) const { return DispatchKeySet(repr_ & other.repr_); } - // Compute the set difference self - other, - // but ONLY for the functionality keys. - // Any backend bits set on self will remain unchanged. - // See Note [Removing keys from DispatchKeySet Only Affects Functionality - // Keys] + // Compute the set difference self - other DispatchKeySet operator-(DispatchKeySet other) const { - return DispatchKeySet(repr_ & (full_backend_mask | ~other.repr_)); + return DispatchKeySet(repr_ & ~other.repr_); } - // Compute self ^ other constexpr DispatchKeySet operator^(DispatchKeySet other) const { return DispatchKeySet(repr_ ^ other.repr_); } + // Perform set equality bool operator==(DispatchKeySet other) const { return repr_ == other.repr_; } - bool operator!=(DispatchKeySet other) const { - return repr_ != other.repr_; - } // Add a DispatchKey to the DispatchKey set. Does NOT mutate, // returns the extended DispatchKeySet! 
C10_NODISCARD DispatchKeySet add(DispatchKey t) const { return *this | DispatchKeySet(t); } - C10_NODISCARD DispatchKeySet add(DispatchKeySet ks) const { - return *this | ks; - } - - // Remove a DispatchKey from the DispatchKey set. - // This is generally not an operation you should be doing - // (it's used to implement the printing overload, operator<<) - // - // Note [Removing keys from DispatchKeySet Only Affects Functionality Keys] - // Only functionality bits are allowed to be removed from a keyset. - // For now, we're only allowing removal of "functionality bits" from the - // keyset, which is specifically needed by the fallthrough key calculation - // logic. Why is removing backend bits problematic? Consider this example: - // - // DispatchKeySet([DispatchKey.CPU, DispatchKey.AutogradCUDA, - // DispatchKey.CUDA]).remove(DispatchKey.AutogradCUDA) - // DispatchKeySet([DispatchKey.CPU, - // DispatchKey.AutogradCUDA]).remove(DispatchKey.AutogradCUDA) - // - // What do we want to happen? - // Technically, we'd like it to be true that after removal, - // the first keyset still has the CUDA dispatch key while the second doesn't. - // Unfortunately there's no way to represent that, because the two keysets are - // represented the same way internally: functionality bits: Autograd, Dense - // backend bits: CPU, CUDA - // - // Instead, remove(DispatchKey.AutogradCPU) will only remove the "Autograd" - // bit from the bitset. - constexpr DispatchKeySet remove(DispatchKey t) const { - return DispatchKeySet( - repr_ & ~(DispatchKeySet(t).repr_ & ~full_backend_mask)); + // Remove a DispatchKey from the DispatchKey set. This is + // generally not an operation you should be doing (it's + // used to implement operator<<) + C10_NODISCARD constexpr DispatchKeySet remove(DispatchKey t) const { + return DispatchKeySet(repr_ & ~DispatchKeySet(t).repr_); } // Is the set empty? (AKA undefined tensor) bool empty() const { @@ -374,78 +107,22 @@ class DispatchKeySet final { uint64_t raw_repr() { return repr_; } - - DispatchKey highestFunctionalityKey() const { - auto functionality_idx = indexOfHighestBit(); - // This means that none of the functionality bits were set. - if (functionality_idx < num_backends) - return DispatchKey::Undefined; - // The first num_backend bits in the keyset don't correspond to real - // dispatch keys. - return static_cast(functionality_idx - num_backends); - } - - // This is similar like toBackendComponent(DispatchKey), but less restrictive. - // toBackendComponent() errors out if the key that it was passed has no - // backend bits, which is useful for error checking. We need a version of that - // here that can also handle "fake" backends like FPGA, because they need to - // map to the AutogradOther key. For those backends, we return - // BackendComponent::InvalidBit. - BackendComponent highestBackendKey() const { - // mask to mask out functionality bits - auto backend_idx = - DispatchKeySet(repr_ & full_backend_mask).indexOfHighestBit(); - // all zeros across the backend bits means that no backend bits are set. - if (backend_idx == 0) - return BackendComponent::InvalidBit; - return static_cast(backend_idx); - } - - // returns the DispatchKey of highest priority in the set. + // Return the type id in this set with the highest priority (i.e., + // is the largest in the DispatchKey enum). Intuitively, this + // type id is the one that should handle dispatch (assuming there + // aren't any further exclusions or inclusions). 
DispatchKey highestPriorityTypeId() const { - auto functionality_k = highestFunctionalityKey(); - if (isPerBackendFunctionalityKey(functionality_k)) { - return toRuntimePerBackendFunctionalityKey( - functionality_k, highestBackendKey()); - } - return functionality_k; + // TODO: If I put Undefined as entry 64 and then adjust the + // singleton constructor to shift from the right, we can get rid of the + // subtraction here. It's modestly more complicated to get right so I + // didn't do it for now. + return static_cast(64 - llvm::countLeadingZeros(repr_)); } - // Returns the index of the most-significant bit in the keyset. - // This is used to as part of the calculation into the operator table to get: - // - the highest "functionality" bit in the keyset. - // - the highest "backend" bit in the keyset. - uint8_t indexOfHighestBit() const { - return 64 - llvm::countLeadingZeros(repr_); - } - - // returns the index in the operator table of highest priority key in the the - // keyset Note that we could in theory implement this using - // highestPriorityTypeId(), but this code is very hotpath and we can do it - // faster without it. - uint64_t getDispatchTableIndexForDispatchKeySet() const { - auto functionality_idx = - DispatchKeySet(repr_ >> num_backends).indexOfHighestBit(); - auto offset_and_mask = offsetsAndMasks()[functionality_idx]; - // Mask the functionality bits out first, then right-shift by 1. - // right-shifting by 1 because everything is zero-indexed. - // E.g. 000001 (CPU) should give us an offset of 0, 000010 (CUDA) should - // give us an offset of 1, etc. - auto backend_idx = - DispatchKeySet((repr_ & offset_and_mask.mask) >> 1).indexOfHighestBit(); - return offset_and_mask.offset + backend_idx; - } - - // returns the "index" of the highest priority backend in the keyset. - // This is pretty similar to getBackendKey(), but: - // - It's hotpath code (part of the runtime bitset calculation) - // - I's returns an integer index, not an enum value - // - Everything is shifted to the right by 1. - // BackendComponent::InvalidBit is technically the lowest enum value, - // but it isn't included in the runtime table. So CPUBit = 1, CUDABit = 2, - // etc. - uint64_t getBackendIndex() const { - return DispatchKeySet((repr_ & full_backend_mask) >> 1).indexOfHighestBit(); + DispatchKey highestPriorityBackendTypeId() const { + return (*this & + ((1ULL << static_cast(DispatchKey::EndOfBackendKeys)) - 1)) + .highestPriorityTypeId(); } private: @@ -453,47 +130,42 @@ class DispatchKeySet final { uint64_t repr_ = 0; public: - // STL iterator for DispatchKeySet. Iterates through all runtime DispatchKeys - // in the set. The iterator is only invalidated by the destruction of the - // underlying DispatchKeySet as the iterator stores a pointer to the raw - // representation of the DispatchKeySet. Note: When we encounter a per-backend - // functionality (e.g. Dense or Sparse), we will iterate through EVERY backend - // in the keyset, for that functionality. For example, if the next - // functionality key to iterate over is Autograd, and the backend bits in the - // keyset correspond to [BackendComponent::CPUBit, BackendComponent::CUDABit], - // then the next two keys we return will be DispatchKey::AutogradCPU, - // DispatchKey::AutogradCUDA (CPU first because it has lower precedence than - // CUDA in DispatchKey.h). + // STL iterator for DispatchKeySet. Iterates through all DispatchKeys in the + // set. 
The iterator is only invalidated by the destruction of the underlying + // DispatchKeySet as the iterator stores a pointer to the raw representation + // of the DispatchKeySet. class iterator { public: using self_type = iterator; using iterator_category = std::input_iterator_tag; using value_type = DispatchKey; using difference_type = ptrdiff_t; - // final mask value should mask out the entire keyset - static const uint8_t end_iter_mask_val = - num_backends + num_functionality_keys; - // final key value should be the last DispatchKey - static const uint8_t end_iter_key_val = num_functionality_keys; - // current_dispatchkey_idx_ will iterate through all functionality bits. - // current_backendcomponent_idx_ will iterate through all backend bits. - explicit iterator( - const uint64_t* data_ptr, - uint8_t next_functionality = num_backends, - uint8_t next_backend = 0) - : data_ptr_(data_ptr), - next_functionality_(next_functionality), - next_backend_(next_backend), - // These are in an invalid state at construction time, and set by the - // first increment call - current_dispatchkey_idx_(end_iter_key_val), - current_backendcomponent_idx_(end_iter_key_val) { + explicit iterator(const uint64_t* data_ptr, uint8_t i = 0) + : data_ptr_(data_ptr), i_(i) { // Go to the first key in the set ++(*this); } - C10_API self_type& operator++(); + self_type& operator++() { + TORCH_INTERNAL_ASSERT( + i_ <= static_cast(DispatchKey::NumDispatchKeys)); + + // Create a masked version of the set representation to ignore previous + // keys that we've iterated through. + uint64_t masked_data = llvm::maskTrailingZeros(i_) & *data_ptr_; + uint64_t firstKeyIndex = llvm::findFirstSet(masked_data); + + // If there are no keys, set to end iterator value + if (firstKeyIndex == std::numeric_limits::max() || + i_ == static_cast(DispatchKey::NumDispatchKeys)) { + i_ = static_cast(DispatchKey::NumDispatchKeys); + return *this; + } + + i_ = static_cast(firstKeyIndex) + 1; + return *this; + } self_type operator++(int) { self_type previous_iterator = *this; @@ -502,50 +174,18 @@ class DispatchKeySet final { } bool operator==(const self_type& rhs) const { - return next_functionality_ == rhs.next_functionality_ && - current_dispatchkey_idx_ == rhs.current_dispatchkey_idx_ && - next_backend_ == rhs.next_backend_ && - current_backendcomponent_idx_ == rhs.current_backendcomponent_idx_; + return i_ == rhs.i_; } bool operator!=(const self_type& rhs) const { - return next_functionality_ != rhs.next_functionality_ || - current_dispatchkey_idx_ != rhs.current_dispatchkey_idx_ || - next_backend_ != rhs.next_backend_ || - current_backendcomponent_idx_ != rhs.current_backendcomponent_idx_; + return i_ != rhs.i_; } DispatchKey operator*() const { - auto functionality_key = - static_cast(current_dispatchkey_idx_); - if (isPerBackendFunctionalityKey(functionality_key)) { - auto next_key = toRuntimePerBackendFunctionalityKey( - functionality_key, - static_cast(current_backendcomponent_idx_)); - // We expect all of the Dense, Sparse, Quantized, and Autograd keys to - // be ordered the same way with respect to their backends - TORCH_INTERNAL_ASSERT( - toBackendComponent(next_key) == - static_cast(current_backendcomponent_idx_), - "Tried to map functionality key ", - toString(functionality_key), - " and backend bit ", - toString( - static_cast(current_backendcomponent_idx_)), - " to a runtime key, but ended up with ", - toString(next_key), - ". 
This can happen if the order of the backend dispatch keys in DispatchKey.h isn't consistent.", - " Please double check that enum for inconsistencies."); - return next_key; - } else { - return functionality_key; - } + return static_cast(i_); } private: const uint64_t* data_ptr_; - uint8_t next_functionality_; - uint8_t next_backend_; - uint8_t current_dispatchkey_idx_; - uint8_t current_backendcomponent_idx_; + uint8_t i_; }; public: @@ -555,35 +195,31 @@ class DispatchKeySet final { return iterator(&repr_); } - // We do not need to iterate beyond EndOfFunctionalityKeys so we will treat - // this as the end iterator. + // We do not need to iterate beyond NumDispatchKeys so we will treat this as + // the end iterator. NumDispatchKeys will always be strictly less than 64. iterator end() const { - return iterator(&repr_, iterator::end_iter_mask_val); + return iterator(&repr_, static_cast(DispatchKey::NumDispatchKeys)); } }; C10_API std::string toString(DispatchKeySet); C10_API std::ostream& operator<<(std::ostream&, DispatchKeySet); -C10_API inline uint64_t getDispatchTableIndexForDispatchKey(DispatchKey k) { - return DispatchKeySet(k).getDispatchTableIndexForDispatchKeySet(); -} - -// Alias key DispatchKey::Autograd maps to -// (autograd_dispatch_keyset x full_backend_mask) +// autograd_dispatch_keyset should include all runtime autograd keys. +// Alias key DispatchKey::Autograd maps to autograd_dispatch_keyset. // NB: keys in this set also get associated with CompositeImplicitAutograd -// -// Note [autograd_dispatch_keyset Does Not Include Backend Bits] -// We don't want to include any backend bits (BackendComponent::CPUBit, etc) -// directly in autograd_dispatch_keyset. -// Why? keysets like autograd_dispatch_keyset are commonly used to remove -// autograd keys from a DispatchKeySet throughout the code base. However, you -// are only allowed to remove functionality bits from a keyset, not backend -// bits. See Note [Removing keys from DispatchKeySet Only Affects Functionality -// Keys] for details. To be consistent and avoid confusion, we're explicitly -// setting up autograd_dispatch_keyset to not have any backend bits. constexpr DispatchKeySet autograd_dispatch_keyset = DispatchKeySet({ - DispatchKey::AutogradFunctionality, + DispatchKey::AutogradCPU, + DispatchKey::AutogradCUDA, + DispatchKey::AutogradXLA, + DispatchKey::AutogradLazy, + DispatchKey::AutogradNestedTensor, + DispatchKey::AutogradMLC, + DispatchKey::AutogradHPU, + DispatchKey::AutogradXPU, + DispatchKey::AutogradPrivateUse1, + DispatchKey::AutogradPrivateUse2, + DispatchKey::AutogradPrivateUse3, DispatchKey::AutogradOther, }); @@ -608,28 +244,25 @@ constexpr DispatchKeySet autograd_dispatch_keyset_with_ADInplaceOrView = // backend dispatch keys that map to DispatchKey::AutogradOther // NB: keys in this set also get associated with CompositeImplicitAutograd -constexpr DispatchKeySet autogradother_backends = - DispatchKeySet( - // HIP and VE aren't in this list: they now have their own backend bits - // which means that they can now have their own Autograd keys. - // Technically, HIP will now redispatch to its own custom AutogradHIP - // slot in the runtime table. - {DispatchKey::FPGA, - DispatchKey::ORT, - DispatchKey::Vulkan, - DispatchKey::Metal, - DispatchKey::SparseCsrCPU, - DispatchKey::SparseCsrCUDA, - DispatchKey::CustomRNGKeyId, - DispatchKey::MkldnnCPU, - DispatchKey::Meta, - // Sparse and Quantized backends also live here. 
- DispatchKey::Sparse, - DispatchKey::Quantized}) - // Including the backend bits because this keyset is used during op - // registration, which requires looping over all runtime autogradother - // backend keys. - | DispatchKeySet(DispatchKeySet::RAW, full_backend_mask); +constexpr DispatchKeySet autogradother_backends = DispatchKeySet( + {DispatchKey::HIP, + DispatchKey::VE, + DispatchKey::FPGA, + DispatchKey::ORT, + DispatchKey::Vulkan, + DispatchKey::Metal, + DispatchKey::QuantizedCPU, + DispatchKey::QuantizedCUDA, + DispatchKey::CustomRNGKeyId, + DispatchKey::MkldnnCPU, + DispatchKey::SparseCPU, + DispatchKey::SparseCUDA, + DispatchKey::SparseHIP, + DispatchKey::SparseVE, + DispatchKey::SparseXPU, + DispatchKey::SparseCsrCPU, + DispatchKey::SparseCsrCUDA, + DispatchKey::Meta}); // The set of dispatch keys that come after autograd // n.b. this relies on the fact that AutogradOther is currently the lowest @@ -659,36 +292,6 @@ constexpr DispatchKeySet after_func_keyset = // away with it by explicitly removing the key here. c10::DispatchKey::ADInplaceOrView); -constexpr DispatchKeySet backend_bitset_mask = - DispatchKeySet(DispatchKeySet::RAW, (1ULL << num_backends) - 1); - -constexpr auto inplace_or_view_ks = - DispatchKeySet(DispatchKey::ADInplaceOrView); -constexpr auto autograd_cpu_ks = DispatchKeySet(DispatchKey::AutogradCPU); -constexpr auto autograd_xpu_ks = DispatchKeySet(DispatchKey::AutogradXPU); -constexpr auto autograd_cuda_ks = DispatchKeySet(DispatchKey::AutogradCUDA); -constexpr auto autograd_xla_ks = DispatchKeySet(DispatchKey::AutogradXLA); -constexpr auto autograd_lazy_ks = DispatchKeySet(DispatchKey::AutogradLazy); -constexpr auto autograd_mlc_ks = DispatchKeySet(DispatchKey::AutogradMLC); -constexpr auto autograd_hpu_ks = DispatchKeySet(DispatchKey::AutogradHPU); -constexpr auto autograd_privateuse1_ks = - DispatchKeySet(DispatchKey::AutogradPrivateUse1); -constexpr auto autograd_privateuse2_ks = - DispatchKeySet(DispatchKey::AutogradPrivateUse2); -constexpr auto autograd_privateuse3_ks = - DispatchKeySet(DispatchKey::AutogradPrivateUse3); -constexpr auto autograd_other_ks = DispatchKeySet(DispatchKey::AutogradOther); - -struct OpTableOffsetAndMask { - uint16_t offset; - uint16_t backend_mask; -}; - -static_assert( - num_backends <= 16, - "Right now we expect the number of backends not to exceed 16. In the (unlikely) event" - " that this changes, the size of OpTableOffsetAndMask::backend_mask needs to be increased too."); - // true if t is a backend dispatch key C10_API bool isBackendDispatchKey(DispatchKey t); @@ -704,53 +307,10 @@ C10_API bool runtimeDispatchKeySetHas(DispatchKey t, DispatchKey k); C10_API DispatchKeySet getBackendKeySetFromAutograd(DispatchKey t); // Returns a DispatchKeySet of autograd related keys mapped to backend. -// for a given backend key, use the associated autograd key. -// for non-backend keys, use AutogradOther as a default. -// Note: it's convenient and fast to return a default here rather than (say) -// returning an optional, or throwing. But it makes callers -// responsible for either a) enforcing the invariant that only backend keys -// be passed as arguments, or b) interpreting our return value carefully. 
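The keysets being put back in this hunk (autograd_dispatch_keyset, autogradother_backends, and the after_* sets) all rely on the flat encoding restored above: every DispatchKey k other than Undefined occupies bit (k - 1) of a single 64-bit word, and the highest-priority key is simply the highest set bit (the real code recovers it with llvm::countLeadingZeros). Below is a minimal, dependency-free sketch of that encoding with a made-up ToyKey enum standing in for the real one; it is an illustration, not the c10 implementation.

    #include <cassert>
    #include <cstdint>

    // Standalone sketch of the restored encoding, not the c10 types: key k
    // (k >= 1) lives in bit (k - 1) of a 64-bit word; Undefined has no bit.
    enum class ToyKey : uint8_t { Undefined = 0, CPU = 1, SparseCPU = 2, AutogradCPU = 3 };

    constexpr uint64_t toBit(ToyKey k) {
      return k == ToyKey::Undefined ? 0 : (1ULL << (static_cast<uint8_t>(k) - 1));
    }

    // Highest-priority key == highest set bit. The real code uses
    // llvm::countLeadingZeros; a plain loop keeps this sketch dependency-free.
    constexpr ToyKey highestPriority(uint64_t repr) {
      uint8_t idx = 0;  // stays 0 (Undefined) if no bit is set
      for (uint8_t b = 0; b < 64; ++b) {
        if (repr & (1ULL << b)) {
          idx = static_cast<uint8_t>(b + 1);
        }
      }
      return static_cast<ToyKey>(idx);
    }

    int main() {
      uint64_t ks = toBit(ToyKey::CPU) | toBit(ToyKey::AutogradCPU);  // union is bitwise OR
      assert(highestPriority(ks) == ToyKey::AutogradCPU);  // larger enum value wins
      assert(highestPriority(0) == ToyKey::Undefined);
      return 0;
    }

Union of two sets is then just a bitwise OR of the two words, which is exactly what the operator| kept in this header does.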
-inline DispatchKeySet getAutogradRelatedKeySetFromBackend(BackendComponent t) { - switch (t) { - case BackendComponent::CPUBit: - return inplace_or_view_ks | autograd_cpu_ks; - case BackendComponent::XPUBit: - return inplace_or_view_ks | autograd_xpu_ks; - case BackendComponent::CUDABit: - return inplace_or_view_ks | autograd_cuda_ks; - case BackendComponent::XLABit: - return inplace_or_view_ks | autograd_xla_ks; - case BackendComponent::LazyBit: - return inplace_or_view_ks | autograd_lazy_ks; - case BackendComponent::MLCBit: - return inplace_or_view_ks | autograd_mlc_ks; - case BackendComponent::HPUBit: - return inplace_or_view_ks | autograd_hpu_ks; - case BackendComponent::PrivateUse1Bit: - return inplace_or_view_ks | autograd_privateuse1_ks; - case BackendComponent::PrivateUse2Bit: - return inplace_or_view_ks | autograd_privateuse2_ks; - case BackendComponent::PrivateUse3Bit: - return inplace_or_view_ks | autograd_privateuse3_ks; - default: - return inplace_or_view_ks | autograd_other_ks; - } -} +C10_API DispatchKeySet getAutogradRelatedKeySetFromBackend(DispatchKey t); // Returns a DispatchKeySet of autocast related keys mapped to backend. -inline DispatchKeySet getAutocastRelatedKeySetFromBackend(BackendComponent t) { - constexpr auto autocast_cpu_ks = DispatchKeySet(DispatchKey::AutocastCPU); - constexpr auto autocast_cuda_ks = DispatchKeySet(DispatchKey::AutocastCUDA); - switch (t) { - case BackendComponent::CPUBit: - return autocast_cpu_ks; - case BackendComponent::CUDABit: - case BackendComponent::XLABit: - return autocast_cuda_ks; - default: - return DispatchKeySet(); - } -} +C10_API DispatchKeySet getAutocastRelatedKeySetFromBackend(DispatchKey t); // This API exists because we have a use case for checking // getRuntimeDispatchKeySet(alias).has(DispatchKey::Undefined) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index 379807df0c7..b83ee395045 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -190,7 +190,7 @@ TensorImpl::TensorImpl( // TODO: be more explicit about the full key set at call sites so we // don't have to keep recomputing it here - auto k = key_set.highestBackendKey(); + DispatchKey k = key_set.highestPriorityBackendTypeId(); key_set = key_set | getAutocastRelatedKeySetFromBackend(k); diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index d703cb2abb8..86aca278c9d 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -838,7 +838,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_sparse() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::Sparse); + return key_set_.has(DispatchKey::SparseCPU) || + key_set_.has(DispatchKey::SparseCUDA) || + key_set_.has(DispatchKey::SparseHIP) || + key_set_.has(DispatchKey::SparseXPU); } // Whether a tensor is sparse COO or not. Use is_sparse_csr for checking CSR @@ -851,7 +854,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_quantized() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has(DispatchKey::Quantized); + return key_set_.has(DispatchKey::QuantizedCPU) || + key_set_.has(DispatchKey::QuantizedCUDA) || + key_set_.has(DispatchKey::QuantizedXPU); } bool is_meta() const { @@ -863,46 +868,53 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool is_cpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. 
- return key_set_.has_backend(BackendComponent::CPUBit) || + return key_set_.has(DispatchKey::CPU) || + key_set_.has(DispatchKey::SparseCPU) || key_set_.has(DispatchKey::SparseCsrCPU) || + key_set_.has(DispatchKey::QuantizedCPU) || key_set_.has(DispatchKey::MkldnnCPU); } bool is_cuda() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has_backend(BackendComponent::CUDABit) || - key_set_.has(DispatchKey::SparseCsrCUDA); + return key_set_.has(DispatchKey::CUDA) || + key_set_.has(DispatchKey::SparseCUDA) || + key_set_.has(DispatchKey::SparseCsrCUDA) || + key_set_.has(DispatchKey::QuantizedCUDA); } bool is_xpu() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has_backend(BackendComponent::XPUBit); + return key_set_.has(DispatchKey::XPU) || + key_set_.has(DispatchKey::SparseXPU) || + key_set_.has(DispatchKey::QuantizedXPU); } bool is_xla() const { - return key_set_.has_backend(BackendComponent::XLABit); + return key_set_.has(DispatchKey::XLA); } bool is_hpu() const { - return key_set_.has_backend(BackendComponent::HPUBit); + return key_set_.has(DispatchKey::HPU); } bool is_lazy() const { - return key_set_.has_backend(BackendComponent::LazyBit); + return key_set_.has(DispatchKey::Lazy); } bool is_hip() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has_backend(BackendComponent::HIPBit); + return key_set_.has(DispatchKey::HIP) || + key_set_.has(DispatchKey::SparseHIP); } bool is_ve() const { // NB: This method is not virtual and avoid dispatches for performance // reasons. - return key_set_.has_backend(BackendComponent::VEBit); + return key_set_.has(DispatchKey::VE) || key_set_.has(DispatchKey::SparseVE); } bool is_mkldnn() const { @@ -1536,22 +1548,13 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { */ inline bool has_compatible_shallow_copy_type(DispatchKeySet from) { auto is_dense = [](DispatchKeySet ts) { - constexpr auto dense_backends = DispatchKeySet( - {BackendComponent::CPUBit, - BackendComponent::CUDABit, - BackendComponent::HIPBit, - BackendComponent::XPUBit}); - constexpr auto dense_k = DispatchKeySet(DispatchKey::Dense); - return ts.has_any(dense_k) && ts.has_any(dense_backends); + return ts.has(DispatchKey::CPU) || ts.has(DispatchKey::CUDA) || + ts.has(DispatchKey::HIP) || ts.has(DispatchKey::XPU); }; auto is_sparse = [](DispatchKeySet ts) { - constexpr auto sparse_backends = DispatchKeySet( - {BackendComponent::CPUBit, - BackendComponent::CUDABit, - BackendComponent::HIPBit, - BackendComponent::XPUBit}); - constexpr auto sparse_k = DispatchKeySet(DispatchKey::Sparse); - return ts.has_any(sparse_k) && ts.has_any(sparse_backends); + return ts.has(DispatchKey::SparseCPU) || + ts.has(DispatchKey::SparseCUDA) || ts.has(DispatchKey::SparseHIP) || + ts.has(DispatchKey::SparseXPU); }; return (key_set_ == from) || (is_dense(key_set_) && is_dense(from)) || (is_sparse(key_set_) && is_sparse(from)); diff --git a/c10/test/core/DispatchKeySet_test.cpp b/c10/test/core/DispatchKeySet_test.cpp index 2c0de14405d..43b06c110e5 100644 --- a/c10/test/core/DispatchKeySet_test.cpp +++ b/c10/test/core/DispatchKeySet_test.cpp @@ -3,163 +3,25 @@ #include #include -#include using namespace c10; -// This test exists not to be comprehensive, but to more clearly show -// what the semantics of DispatchKeySet are. 
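Before the test-file changes below, note the shape of the check restored in the TensorImpl.h hunk above: has_compatible_shallow_copy_type() goes back to classifying a key set as dense or sparse by probing individual per-backend keys, and two tensors may share a shallow copy when their key sets are identical, both dense, or both sparse. A small standalone sketch of that rule, with stand-in key names rather than the real enum:

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    // Stand-in key names, not the real c10 enum.
    enum class K : uint8_t { CPU = 1, CUDA, SparseCPU, SparseCUDA };

    constexpr uint64_t bit(K k) { return 1ULL << (static_cast<uint8_t>(k) - 1); }

    constexpr uint64_t keySet(std::initializer_list<K> ks) {
      uint64_t r = 0;
      for (K k : ks) {
        r |= bit(k);
      }
      return r;
    }

    constexpr bool hasAny(uint64_t ts, uint64_t probe) { return (ts & probe) != 0; }

    // Shallow-copy compatibility: identical sets, or both dense, or both sparse.
    bool compatibleShallowCopy(uint64_t a, uint64_t b) {
      const uint64_t dense  = keySet({K::CPU, K::CUDA});
      const uint64_t sparse = keySet({K::SparseCPU, K::SparseCUDA});
      return a == b || (hasAny(a, dense) && hasAny(b, dense)) ||
             (hasAny(a, sparse) && hasAny(b, sparse));
    }

    int main() {
      assert(compatibleShallowCopy(keySet({K::CPU}), keySet({K::CUDA})));        // both dense
      assert(!compatibleShallowCopy(keySet({K::CPU}), keySet({K::SparseCUDA}))); // dense vs sparse
      return 0;
    }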
-TEST(DispatchKeySet, ShowSemantics) { - // the "CPU" dispatch key is an instance of a per-backend-functionality key. - // It corresponds to "dense" functionality, "CPU" backend. - // This means that it gets a dense functionality bit, and a cpu backend bit - // set. - auto undefined_set = DispatchKeySet(); - auto dense_cpu_set = DispatchKeySet(DispatchKey::CPU); - ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense)); - ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit)); - ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU)); - - auto dense_lazy_set = DispatchKeySet(DispatchKey::Lazy); - ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Dense)); - ASSERT_TRUE(dense_lazy_set.has_backend(BackendComponent::LazyBit)); - ASSERT_TRUE(dense_lazy_set.has(DispatchKey::Lazy)); - - // You can think of "Dense/Sparse", and "CPUBit/CUDABit", as "building block" - // dispatch keys. You are allowed to directly create keysets out of them! - auto dense_cpu_set_from_building_blocks = DispatchKeySet(DispatchKey::Dense) | - DispatchKeySet(BackendComponent::CPUBit); - ASSERT_TRUE(dense_cpu_set.has(DispatchKey::Dense)); - ASSERT_TRUE(dense_cpu_set.has_backend(BackendComponent::CPUBit)); - ASSERT_TRUE(dense_cpu_set.has(DispatchKey::CPU)); - ASSERT_EQ(dense_cpu_set, dense_cpu_set_from_building_blocks); - - // Similarly, the AutogradCUDA key gets 2 bits in the keyset: - // The "Autograd" functionality bit, and the "CUDA" backend bit - auto autograd_cuda = DispatchKeySet(DispatchKey::AutogradCUDA); - ASSERT_TRUE(autograd_cuda.has(DispatchKey::AutogradFunctionality)); - ASSERT_TRUE(autograd_cuda.has_backend(BackendComponent::CUDABit)); - - // Because DispatchKeySet uses a condensed internal representation, you cannot - // use it to represent the FULL cross product of backends and functionalities - // for example: - auto autograd_dense_cpu_cuda = DispatchKeySet( - {DispatchKey::AutogradFunctionality, - DispatchKey::Dense, - DispatchKey::CUDA, - DispatchKey::CPU}); - auto fpga = DispatchKeySet(DispatchKey::FPGA); - auto fpga_and_cpu = DispatchKeySet({DispatchKey::FPGA, DispatchKey::CPU}); - // this keyset has all of the building block keys: - ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradFunctionality)); - ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::Dense)); - ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CUDABit)); - ASSERT_TRUE(autograd_dense_cpu_cuda.has_backend(BackendComponent::CPUBit)); - - // and it also has the "runtime" keys that correspond to the full - // cross-product of functionality - ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU)); - ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::AutogradCPU)); - ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CPU)); - ASSERT_TRUE(autograd_dense_cpu_cuda.has(DispatchKey::CUDA)); - - // This means that there's no way to represent a keyset with, say, only - // Autograd CUDA + Dense CPU. Instead, you should think of a keyset as - // inheriting the full set of functionalities + backends of its keys. This - // means that the below keysets are all indistinguishable from each other. 
- ASSERT_EQ( - autograd_dense_cpu_cuda, - DispatchKeySet( - {DispatchKey::AutogradCUDA, - DispatchKey::AutogradCPU, - DispatchKey::CUDA, - DispatchKey::CPU})); - ASSERT_EQ( - autograd_dense_cpu_cuda, - DispatchKeySet({DispatchKey::AutogradCUDA, DispatchKey::CPU})); - ASSERT_EQ( - autograd_dense_cpu_cuda, - DispatchKeySet({DispatchKey::CUDA, DispatchKey::AutogradCPU})); - - // ~~~~~~~~~~ DispatchKeySet iterators ~~~~~~~~~~~ - - // Iterators allow you to iterate individually through the DispatchKey's in a - // DispatchKeySet - auto empty_set = DispatchKeySet(); - auto t1 = empty_set.begin(); - auto t2 = empty_set.end(); - ASSERT_EQ(*empty_set.begin(), *empty_set.end()); - - // However, only keys that correspond to actual runtime indices of kernels in - // the operator table show up when you iterate through a keyset. i.e. - // DispatchKey::Dense, and BackendComponent::CPUBit won't show up in an - // iterator. - auto dense_cpu_iter = dense_cpu_set.begin(); - ASSERT_EQ(*dense_cpu_iter++, DispatchKey::CPU); - ASSERT_EQ(*dense_cpu_iter, *dense_cpu_set.end()); - - auto autograd_dense_cpu_cuda_iter = autograd_dense_cpu_cuda.begin(); - ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CPU); - ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::CUDA); - ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCPU); - ASSERT_EQ(*autograd_dense_cpu_cuda_iter++, DispatchKey::AutogradCUDA); - ASSERT_EQ(*autograd_dense_cpu_cuda_iter, *autograd_dense_cpu_cuda.end()); - - // But other "functionality bits" that are not defined per-backend DO get - // their own slots in the operator table. - auto mixed_keyset = DispatchKeySet(BackendComponent::CPUBit) | - DispatchKeySet( - {DispatchKey::FPGA, // runtime key - DispatchKey::Functionalize, // runtime key - DispatchKey::Dense}); // NOT a runtime key - auto mixed_iter = mixed_keyset.begin(); - ASSERT_EQ(*mixed_iter++, DispatchKey::CPU); - ASSERT_EQ(*mixed_iter++, DispatchKey::FPGA); - ASSERT_EQ(*mixed_iter++, DispatchKey::Functionalize); - ASSERT_EQ(*mixed_iter, *mixed_keyset.end()); -} - TEST(DispatchKeySet, Empty) { DispatchKeySet empty_set; - for (uint8_t i = 0; - i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); + for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); i++) { auto tid = static_cast(i); - if (tid == DispatchKey::Undefined) - continue; ASSERT_FALSE(empty_set.has(tid)); } ASSERT_TRUE(empty_set.empty()); DispatchKeySet empty_set2; ASSERT_TRUE(empty_set == empty_set2); + ASSERT_EQ(empty_set.highestPriorityTypeId(), DispatchKey::Undefined); } -// This covers all keys that correspond to a single backend bit, e.g. -// BackendComponent::CPUBit. Even though these are NOT runtime keys, we still -// allow adding them directly to a keyset -TEST(DispatchKeySet, SingletonBackendComponent) { - for (const auto i : c10::irange(1, num_backends)) { - auto tid = static_cast(i); - DispatchKeySet sing(tid); - ASSERT_EQ(sing, sing); - ASSERT_EQ(sing, DispatchKeySet().add(tid)); - ASSERT_EQ(sing, sing.add(tid)); - ASSERT_EQ(sing, sing | sing); - ASSERT_FALSE(sing.empty()); - ASSERT_TRUE(sing.has(tid)); - } -} - -// This covers all keys that correspond to a single functionality bit: -// - runtime, not-per-backend functionality keys, e.g. -// DispatchKey::FuncTorchBatched -// - runtime, "fake backend" keys, e.g. DispatchKey::FPGA -// - NOT-runtime, per-backend functionality keys, e.g. DispatchKey::Dense -// Even though it's not a runtime key, we still allow adding it directly to a -// keyset. 
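The iterator restored in this file (see the operator++ hunk earlier, which masks off already-visited bits and then takes the lowest remaining set bit) always yields keys in increasing enum order. A dependency-free sketch of that walk over the raw 64-bit representation, using plain integers for key indices; the values 4, 10 and 15 mirror the SpecificKeys test added further below:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Walk the 64-bit representation from the lowest set bit upward, yielding
    // each key index (bit position + 1) in increasing order. Plain integers
    // stand in for the real DispatchKey enum.
    std::vector<uint8_t> iterateKeys(uint64_t repr) {
      std::vector<uint8_t> out;
      for (uint8_t i = 0; i < 64; ++i) {
        if (repr & (1ULL << i)) {
          out.push_back(static_cast<uint8_t>(i + 1));
        }
      }
      return out;
    }

    int main() {
      // Keys 4, 10 and 15 set: they come back in ascending order, and an
      // empty set yields nothing.
      uint64_t repr = (1ULL << 3) | (1ULL << 9) | (1ULL << 14);
      assert((iterateKeys(repr) == std::vector<uint8_t>{4, 10, 15}));
      assert(iterateKeys(0).empty());
      return 0;
    }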
-// DispatchKey:: -TEST(DispatchKeySet, SingletonFunctionalityKeys) { - for (const auto i : c10::irange(1, num_functionality_keys)) { +TEST(DispatchKeySet, Singleton) { + for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); + i++) { auto tid = static_cast(i); DispatchKeySet sing(tid); ASSERT_EQ(sing, sing); @@ -168,145 +30,47 @@ TEST(DispatchKeySet, SingletonFunctionalityKeys) { ASSERT_EQ(sing, sing | sing); ASSERT_FALSE(sing.empty()); ASSERT_TRUE(sing.has(tid)); + ASSERT_EQ(sing.highestPriorityTypeId(), tid); ASSERT_EQ(sing.remove(tid), DispatchKeySet()); } } -// This covers runtime keys that are per-backend, -// and take up more than one bit in a DispatchKeySet. They take up one -// functionality bit + one backend bit. e.g. CPU, CUDA, SparseCPU, SparseCUDA, -// AutogradCPU, AutogradCUDA -TEST(DispatchKeySet, SingletonPerBackendFunctionalityKeys) { - for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); - i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); - i++) { - auto tid = static_cast(i); - // Skip these because they aren't real keys. - if (tid == DispatchKey::StartOfDenseBackends || - tid == DispatchKey::StartOfSparseBackends || - tid == DispatchKey::StartOfQuantizedBackends || - tid == DispatchKey::StartOfAutogradBackends) { - continue; - } - DispatchKeySet sing(tid); - ASSERT_EQ(sing, sing); - ASSERT_EQ(sing, DispatchKeySet().add(tid)); - ASSERT_EQ(sing, sing.add(tid)); - ASSERT_EQ(sing, sing | sing); - ASSERT_FALSE(sing.empty()); - ASSERT_TRUE(sing.has(tid)); - - auto functionality_key = toFunctionalityKey(tid); - auto backend_key = toBackendComponent(tid); - // These two sets should be equivalent: - // DispatchKeySet(DispatchKey::CPU) - // DispatchKeySet({DispatchKey::Dense, BackendComponent::CPUBit}) - auto expected_ks = - DispatchKeySet(functionality_key) | DispatchKeySet(backend_key); - ASSERT_EQ(sing, expected_ks); - // These two sets should be equivalent: - // DispatchKeySet(DispatchKey::CPU).remove(DispatchKey::Dense) - // DispatchKeySet(BackendComponent::CPUBit) - expected_ks = DispatchKeySet(toBackendComponent(tid)); - ASSERT_EQ(sing.remove(tid), expected_ks); - } -} - -TEST(DispatchKeySet, DoubletonPerBackend) { - for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); - i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); +TEST(DispatchKeySet, Doubleton) { + for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); i++) { for (uint8_t j = i + 1; - j <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); + j < static_cast(DispatchKey::NumDispatchKeys); j++) { ASSERT_LT(i, j); auto tid1 = static_cast(i); auto tid2 = static_cast(j); - - // Skip these because they aren't real keys. 
- if (tid1 == DispatchKey::StartOfDenseBackends || - tid1 == DispatchKey::StartOfSparseBackends || - tid1 == DispatchKey::StartOfQuantizedBackends || - tid1 == DispatchKey::StartOfAutogradBackends) - continue; - if (tid2 == DispatchKey::StartOfDenseBackends || - tid2 == DispatchKey::StartOfSparseBackends || - tid2 == DispatchKey::StartOfQuantizedBackends || - tid2 == DispatchKey::StartOfAutogradBackends) - continue; - - auto backend1 = toBackendComponent(tid1); - auto backend2 = toBackendComponent(tid2); - auto functionality1 = toFunctionalityKey(tid1); - auto functionality2 = toFunctionalityKey(tid2); - - auto combined = DispatchKeySet({tid1, tid2}); - // The combined set has the backend bits - ASSERT_TRUE(combined.has_backend(backend1)); - ASSERT_TRUE(combined.has_backend(backend2)); - // and it has the backend bits - ASSERT_TRUE(combined.has(functionality1)); - ASSERT_TRUE(combined.has(functionality2)); - // and it has the original two runtime keys - ASSERT_TRUE(combined.has(tid1)); - ASSERT_TRUE(combined.has(tid2)); - - // Add all of the keys in the keyset to a real set - std::unordered_set visited_keys; - auto iter = combined.begin(); - while (*iter != *combined.end()) { - visited_keys.insert(*iter); - ++iter; - } - std::unordered_set expected_keys; - expected_keys.insert( - toRuntimePerBackendFunctionalityKey(functionality1, backend1)); - expected_keys.insert( - toRuntimePerBackendFunctionalityKey(functionality1, backend2)); - expected_keys.insert( - toRuntimePerBackendFunctionalityKey(functionality2, backend1)); - expected_keys.insert( - toRuntimePerBackendFunctionalityKey(functionality2, backend2)); - ASSERT_EQ(expected_keys, visited_keys); - - if (backend1 == backend2 || functionality1 == functionality2) { - // We have two runtime keys, with either the same backend or the same - // per-backend functionalities. E.g. {AutogradCUDA, CUDA} or - // {AutogradCPU, AutogradCUDA} There should be 2 total runtime keys in - // this set. - ASSERT_EQ(2, visited_keys.size()); - } else { - // since i and j are different keys, they should not have the same - // functionality and backend - ASSERT_TRUE(backend1 != backend2 && functionality1 != functionality2); - // We have two runtime keys, that have different backends + per-backend - // functionalities. So we should expect the full cross product of - // runtime keys to be in the set. e.g. 
if i = AutogradCUDA, and j = CPU, - // then combined = {AutogradCUDA, AutogradCPU, CUDA, CPU} - ASSERT_EQ(4, visited_keys.size()); - } + auto doub = DispatchKeySet(tid1).add(tid2); + ASSERT_EQ(doub, DispatchKeySet(tid1) | DispatchKeySet(tid2)); + ASSERT_TRUE(doub.has(tid1)); + ASSERT_TRUE(doub.has(tid2)); + ASSERT_EQ(doub.highestPriorityTypeId(), tid2); // relies on i < j } } } TEST(DispatchKeySet, Full) { DispatchKeySet full(DispatchKeySet::FULL); - for (const auto i : c10::irange(1, num_functionality_keys)) { + for (uint8_t i = 1; i < static_cast(DispatchKey::NumDispatchKeys); + i++) { auto tid = static_cast(i); ASSERT_TRUE(full.has(tid)); } - ASSERT_FALSE(full.has(DispatchKey::EndOfFunctionalityKeys)); } TEST(DispatchKeySet, IteratorBasicOps) { DispatchKeySet empty_set; DispatchKeySet full_set(DispatchKeySet::FULL); - DispatchKeySet mutated_set = empty_set.add(DispatchKey::CPU); + DispatchKeySet mutated_set = empty_set.add(static_cast(1)); // Constructor + Comparison - ASSERT_EQ(*empty_set.begin(), DispatchKey::EndOfFunctionalityKeys); - ASSERT_EQ(*empty_set.end(), DispatchKey::EndOfFunctionalityKeys); - ASSERT_EQ(*mutated_set.begin(), DispatchKey::CPU); + ASSERT_EQ(*empty_set.begin(), DispatchKey::NumDispatchKeys); + ASSERT_EQ(*empty_set.end(), DispatchKey::NumDispatchKeys); + ASSERT_EQ(*mutated_set.begin(), static_cast(1)); ASSERT_TRUE(empty_set.begin() == empty_set.end()); ASSERT_TRUE(full_set.begin() != full_set.end()); @@ -326,37 +90,16 @@ TEST(DispatchKeySet, IteratorEmpty) { ASSERT_EQ(i, 0); } -TEST(DispatchKeySet, IteratorCrossProduct) { - // The iterator should return all runtime keys in the set, - // including the cross product of {backends} x {functionalities} - auto ks = - DispatchKeySet({BackendComponent::CPUBit, BackendComponent::CUDABit}) | - DispatchKeySet( - {DispatchKey::Dense, - DispatchKey::FPGA, - DispatchKey::AutogradFunctionality}); - - auto iter = ks.begin(); - // iterate through dense backends first. - ASSERT_EQ(DispatchKey::CPU, *(iter++)); - ASSERT_EQ(DispatchKey::CUDA, *(iter++)); - // FPGA doesn't have a backend bit, so it isn't included in the cross product. - ASSERT_EQ(DispatchKey::FPGA, *(iter++)); - // iterate through the autograd keys laster. - ASSERT_EQ(DispatchKey::AutogradCPU, *(iter++)); - ASSERT_EQ(DispatchKey::AutogradCUDA, *(iter++)); -} - TEST(DispatchKeySet, IteratorFull) { DispatchKeySet full_set(DispatchKeySet::FULL); uint8_t i = 0; for (const auto& it : full_set) { i++; + ASSERT_TRUE(it == static_cast(i)); + ASSERT_TRUE(it != DispatchKey::NumDispatchKeys); } - // Total # of runtime entries includes an entry for DispatchKey::Undefined, - // which is not included when iterating through the DispatchKeySet. - ASSERT_EQ(i, num_runtime_entries - 1); + ASSERT_EQ(i, static_cast(DispatchKey::NumDispatchKeys) - 1); } TEST(DispatchKeySet, IteratorRangeFull) { @@ -365,61 +108,41 @@ TEST(DispatchKeySet, IteratorRangeFull) { for (DispatchKey dispatch_key : full_set) { i++; + ASSERT_TRUE(dispatch_key == static_cast(i)); } - // Total # of runtime entries includes an entry for DispatchKey::Undefined, - // which is not included when iterating through the DispatchKeySet. 
- ASSERT_EQ(i, num_runtime_entries - 1); + ASSERT_EQ(i, static_cast(DispatchKey::NumDispatchKeys) - 1); +} + +TEST(DispatchKeySet, SpecificKeys) { + DispatchKeySet keyset({ + static_cast(0), // Undefined should be ignored + static_cast(4), + static_cast(10), + static_cast(15), + }); + std::unordered_set visited_keys; + + for (DispatchKey key : keyset) { + visited_keys.insert(key); + } + + ASSERT_EQ(visited_keys.size(), 3); + ASSERT_TRUE( + visited_keys.find(static_cast(4)) != visited_keys.end()); + ASSERT_TRUE( + visited_keys.find(static_cast(10)) != visited_keys.end()); + ASSERT_TRUE( + visited_keys.find(static_cast(15)) != visited_keys.end()); } TEST(DispatchKeySet, FailAtEndIterator) { DispatchKeySet full_set(DispatchKeySet::FULL); uint64_t raw_repr = full_set.raw_repr(); - // doesn't throw - DispatchKeySet::iterator(&raw_repr, num_backends + num_functionality_keys); // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) EXPECT_THROW( DispatchKeySet::iterator( - &raw_repr, num_backends + num_functionality_keys + 1), + &raw_repr, static_cast(DispatchKey::NumDispatchKeys) + 1), c10::Error); } - -TEST(DispatchKeySet, TestKeyOrderingInvariants) { - for (uint8_t i = static_cast(DispatchKey::StartOfDenseBackends); - i <= static_cast(DispatchKey::EndOfRuntimeBackendKeys); - i++) { - auto k = static_cast(i); - // Note [The Ordering of Per-Backend Dispatch Keys Matters!] - // The DispatchKey enum includes all of the runtime keys for - // Dense/Sparse/Quantized/Autograd, (e.g. CPU, CUDA, SparseCPU, SparseCUDA, - // AutogradCPU, AutogradCUDA, etc). And we expect the ordering of those keys - // to be the same as the ordering of the backends in the `BackendComponent` - // enum. This makes several utilities in `DispatchKey.h` and - // `DispatchKeySet.h` significantly easier to implement. The purpose of the - // test is to assert (through CI) that this invariant is maintained. - // - // The only way that we can really check this invariant is by - // comparing the string names of each enum. - // We only really care about the ordering for "real" keys that are actually - // used, which we expect to be able to print properly. This saves us from - // having to enumerate the full set of possible runtime keys in - // DispatchKey::toString(). It also relies on toString() being implemented - // correctly. - auto functionality_str = std::string(toString(k)); - if (functionality_str == "UNKNOWN_TENSOR_TYPE_ID") - continue; - - auto computed_backend_k = toBackendComponent(k); - auto computed_backend_str = std::string(toString(computed_backend_k)); - // Skip, e.g., the "Bit" from "CPUBit" - computed_backend_str = - computed_backend_str.substr(0, computed_backend_str.size() - 3); - - ASSERT_TRUE( - functionality_str.find(computed_backend_str) != std::string::npos) - << "DispatchKey invariant broken! Found a key that is not ordered correctly" - << " with its backend bit. 
key = " << toString(k) << ", " << k - << ", computed backend = " << toString(computed_backend_k); - } -} diff --git a/test/test_dispatch.py b/test/test_dispatch.py index c97e9e382fc..37a6054f915 100644 --- a/test/test_dispatch.py +++ b/test/test_dispatch.py @@ -532,8 +532,8 @@ AutogradXLA: fn_math [math kernel] lambda m: m.def_("foo(Tensor x) -> Tensor"), # m.impl("foo", torch::kCompositeImplicitAutograd, [](const Tensor & x) { return x }) lambda m: m.impl_t_t("foo", "CompositeImplicitAutograd", debug="fn_math"), - # m.impl("foo", torch::kFPGA, [](const Tensor & x) { return x }) - lambda m: m.impl_t_t("foo", "FPGA", debug="fn_fpga"), + # m.impl("foo", torch::kQuantizedCPU, [](const Tensor & x) { return x }) + lambda m: m.impl_t_t("foo", "QuantizedCPU", debug="fn_quantizedcpu"), ]) state, table = result.state, result.table self.assertExpectedInline(state, '''\ @@ -541,12 +541,12 @@ name: test::foo schema: test::foo(Tensor x) -> (Tensor) debug: registered at /dev/null:0 alias analysis kind: FROM_SCHEMA -FPGA: fn_fpga :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] +QuantizedCPU: fn_quantizedcpu :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] CompositeImplicitAutograd[alias]: fn_math :: (Tensor _0) -> (Tensor _0) [ boxed unboxed ] ''') # computed dispatch table is too big, so we only check on a few entries we're interested in. - extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('FPGA',)) + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('QuantizedCPU',)) self.assertExpectedInline(extracted_table, '''\ Undefined: fn_math [math kernel] @@ -557,7 +557,7 @@ AutogradOther: ambiguous_autogradother [ambiguous autogradother] AutogradCPU: fn_math [math kernel] AutogradCUDA: fn_math [math kernel] AutogradXLA: fn_math [math kernel] -FPGA: fn_fpga [kernel] +QuantizedCPU: fn_quantizedcpu [kernel] ''') def test_computed_table_with_cpu_defaultbackend(self): @@ -616,7 +616,7 @@ CompositeExplicitAutograd[alias]: fn_defaultbackend :: (Tensor _0) -> (Tensor _0 ''') # computed dispatch table is too big, so we only check on a few entries we're interested in. 
- extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('FPGA',)) + extracted_table = extract_dispatch_table_with_keys(table, dispatch_keys_to_check + ('QuantizedCPU',)) self.assertExpectedInline(extracted_table, '''\ Undefined: fn_defaultbackend [default backend kernel] @@ -627,7 +627,7 @@ AutogradOther: fn_autograd [autograd kernel] AutogradCPU: fn_autograd [autograd kernel] AutogradCUDA: fn_autograd [autograd kernel] AutogradXLA: fn_autograd [autograd kernel] -FPGA: fn_defaultbackend [default backend kernel] +QuantizedCPU: fn_defaultbackend [default backend kernel] ''') def test_computed_table_with_cpu_autograd_math_defaultbackend(self): @@ -808,7 +808,7 @@ key kernel CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -FPGA fn_CompositeImplicitAutograd [math kernel] +QuantizedCPU fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] AutogradCPU fallthrough [backend fallback] AutogradXLA fallthrough [backend fallback] @@ -829,7 +829,7 @@ key kernel CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -FPGA fn_CompositeImplicitAutograd [math kernel] +QuantizedCPU fn_CompositeImplicitAutograd [math kernel] AutogradOther fn_CompositeImplicitAutograd [math kernel] AutogradCPU fn_AutogradCPU [kernel] AutogradXLA fallthrough [backend fallback] @@ -864,7 +864,7 @@ key kernel CPU fn_CPU [kernel] XLA fn_XLA [kernel] Lazy fn_Lazy [kernel] -FPGA fn_CompositeExplicitAutograd [default backend kernel] +QuantizedCPU fn_CompositeExplicitAutograd [default backend kernel] AutogradOther fallthrough [backend fallback] AutogradCPU fn_AutogradCPU [kernel] AutogradXLA fallthrough [backend fallback] @@ -889,7 +889,7 @@ CompositeExplicitAutograd[alias] fn_CompositeExplicitAutograd def test_autogradother(self): dispatcher = PythonDispatcher() - dispatcher.register(["CPU", "FPGA", "CompositeImplicitAutograd"]) + dispatcher.register(["CPU", "QuantizedCPU", "CompositeImplicitAutograd"]) self.assertExpectedInline( dispatcher.dispatchTable(), '''\ @@ -900,7 +900,7 @@ key kernel CPU fn_CPU [kernel] XLA fn_CompositeImplicitAutograd [math kernel] Lazy fn_CompositeImplicitAutograd [math kernel] -FPGA fn_FPGA [kernel] +QuantizedCPU fn_QuantizedCPU [kernel] AutogradOther ambiguous_autogradother [ambiguous autogradother] AutogradCPU fallthrough [backend fallback] AutogradXLA fn_CompositeImplicitAutograd [math kernel] @@ -915,8 +915,8 @@ AutogradLazy fn_CompositeImplicitAutograd [math kernel] Registered Kernels key kernel --------------------------- -FPGA fn_FPGA CPU fn_CPU +QuantizedCPU fn_QuantizedCPU CompositeImplicitAutograd[alias] fn_CompositeImplicitAutograd ''' ) diff --git a/test/test_sparse.py b/test/test_sparse.py index 34d5155bfa8..cbc98f572bd 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -3410,21 +3410,21 @@ class TestSparseOneOff(TestCase): def test_cuda_from_cpu(self): with self.assertRaisesRegex( RuntimeError, - "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4), [3, 4, 4]) with self.assertRaisesRegex( RuntimeError, - "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): torch.sparse.FloatTensor(torch.zeros(1, 4).long().cuda(), torch.randn(4, 4, 4, 0), [3, 4, 4, 0]) with 
self.assertRaisesRegex( RuntimeError, - "Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!"): + "backend of indices \\(CUDA\\) must match backend of values \\(CPU\\)"): torch.sparse.FloatTensor(torch.LongTensor(1, 0).cuda(), torch.randn(0, 4, 4, 0), [0, 4, 4, 0]) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 5abbc3487e8..6bc0d7df100 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -48,66 +48,58 @@ class DispatchKey(Enum): Undefined = 0 CatchAll = Undefined - Dense = auto() + CPU = auto() + CUDA = auto() + HIP = auto() FPGA = auto() ORT = auto() + XLA = auto() + Lazy = auto() Vulkan = auto() Metal = auto() + XPU = auto() MKLDNN = auto() OpenGL = auto() OpenCL = auto() IDEEP = auto() - Quantized = auto() + QuantizedCPU = auto() + QuantizedCUDA = auto() + QuantizedXPU = auto() CustomRNGKeyId = auto() MkldnnCPU = auto() - Sparse = auto() + SparseCPU = auto() + SparseCUDA = auto() SparseCsrCPU = auto() SparseCsrCUDA = auto() + SparseHIP = auto() + SparseXPU = auto() + NestedTensor = auto() + PrivateUse1 = auto() + PrivateUse2 = auto() + PrivateUse3 = auto() + EndOfBackendKeys = PrivateUse3 ZeroTensor = auto() Meta = auto() BackendSelect = auto() Named = auto() AutogradOther = auto() - AutogradFunctionality = auto() + AutogradCPU = auto() + AutogradCUDA = auto() + AutogradXLA = auto() + AutogradLazy = auto() AutogradNestedTensor = auto() + AutogradXPU = auto() + AutogradPrivateUse1 = auto() + AutogradPrivateUse2 = auto() + AutogradPrivateUse3 = auto() Tracer = auto() Autocast = auto() Batched = auto() VmapMode = auto() TESTING_ONLY_GenericWrapper = auto() TESTING_ONLY_GenericMode = auto() - EndOfFunctionalityKeys = TESTING_ONLY_GenericMode - - CPU = auto() - CUDA = auto() - HIP = auto() - XLA = auto() - Lazy = auto() - XPU = auto() - NestedTensor = auto() - PrivateUse1 = auto() - PrivateUse2 = auto() - PrivateUse3 = auto() - - QuantizedCPU = auto() - QuantizedCUDA = auto() - QuantizedXPU = auto() - - SparseCPU = auto() - SparseCUDA = auto() - SparseHIP = auto() - SparseXPU = auto() - - AutogradCPU = auto() - AutogradCUDA = auto() - AutogradXLA = auto() - AutogradLazy = auto() - AutogradXPU = auto() - AutogradPrivateUse1 = auto() - AutogradPrivateUse2 = auto() - AutogradPrivateUse3 = auto() - + NumDispatchKeys = auto() Autograd = auto() CompositeImplicitAutograd = auto() CompositeExplicitAutograd = auto() diff --git a/torch/_python_dispatcher.py b/torch/_python_dispatcher.py index fe0c6253fdd..aa19a18efb3 100644 --- a/torch/_python_dispatcher.py +++ b/torch/_python_dispatcher.py @@ -15,9 +15,9 @@ keys for a single example of each use case. These use cases are listed below: - CPU/AutogradCPU: represents in-tree backends which we usually have dedicated inference & autograd kernel in pytorch core library. E.g. CPU, CUDA -- FPGA/AutogradOther: represents in-tree backends which we usually have backend specific +- QuantizedCPU/AutogradOther: represents in-tree backends which we usually have backend specific inference kernels, but they share the same autograd kernel specified in AutogradOther. - E.g. FPGA, SparseCsrCPU + E.g. QuantizedCPU, QuantizedCUDA - XLA/AutogradXLA: represents out-of-tree backends which we don't have either inference or autograd kernel defined in pytorch core library. Backend owner is responsible for registering both inference & autograd kernels in their extensions(e.g. torch-xla) for the operators they support. 
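For readers following the dispatcher docstring above, here is a minimal sketch (not part of the patch) of exercising the toy PythonDispatcher interactively; the register()/dispatchTable() calls mirror the updated test_autogradother test, and the expected table entries in the comment are illustrative rather than exact output.

from torch._python_dispatcher import PythonDispatcher

dispatcher = PythonDispatcher()
# QuantizedCPU keeps its own inference kernel but shares the AutogradOther
# autograd kernel; registering a CompositeImplicitAutograd kernel alongside it
# makes AutogradOther ambiguous, which is exactly what test_autogradother checks.
dispatcher.register(["CPU", "QuantizedCPU", "CompositeImplicitAutograd"])
print(dispatcher.dispatchTable())  # expect QuantizedCPU -> fn_QuantizedCPU, AutogradOther -> ambiguous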
@@ -53,7 +53,7 @@ class PythonDispatcher: name = "foo" runtime_keys = [ "CPU", "AutogradCPU", - "FPGA", "AutogradOther", + "QuantizedCPU", "AutogradOther", "XLA", "AutogradXLA", "Lazy", "AutogradLazy", ] From fb4504da2f064a26a3443dd5501cac56ebae5579 Mon Sep 17 00:00:00 2001 From: mattip Date: Mon, 14 Feb 2022 15:24:34 -0800 Subject: [PATCH 023/199] DOC: release documentation version should be major.minor (#72706) Summary: Fixes pytorch/pytorch.github.io#929 The pytorch doc team would like to move to only major.minor documentation at https://pytorch.org/docs/versions.html, not major.minor.patch. This has been done in the CI scripts, but the generated documentation still has the patch version. Remove it when building RELEASE documentation. This allows simplifying the logic, using `'.'.join(torch_version.split('.')[:2])` since we no longer care about trimming off the HASH: it automatically gets removed. holly1238, brianjo Pull Request resolved: https://github.com/pytorch/pytorch/pull/72706 Reviewed By: samdow Differential Revision: D34215815 Pulled By: albanD fbshipit-source-id: 8437036cc6636674d9ab8b1666f37b561d0527e1 (cherry picked from commit d8caf988f958656f357a497372a782ff69829f9e) --- docs/source/conf.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 0b1343145bc..8cf8459614d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -326,14 +326,11 @@ release = 'master' # Customized html_title here. # Default is " ".join(project, release, "documentation") if not set if RELEASE: - # remove hash (start with 'a') from version number if any - version_end = torch_version.find('a') - if version_end == -1: - html_title = " ".join((project, torch_version, "documentation")) - version = torch_version - else: - html_title = " ".join((project, torch_version[:version_end], "documentation")) - version = torch_version[:version_end] + # Turn 1.11.0aHASH into 1.11 + # Note: the release candidates should no longer have the aHASH suffix, but in any + # case we wish to leave only major.minor, even for rc builds. + version = '.'.join(torch_version.split('.')[:2]) + html_title = " ".join((project, version, "documentation")) release = version # The language for content autogenerated by Sphinx. Refer to documentation From 52c516ecb8ad048b8131394dbcae768621821160 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Mon, 14 Feb 2022 15:39:56 -0800 Subject: [PATCH 024/199] [Pytorch Edge] Minor improve documentation in test_backend_with_compiler Summary: Went through all these files and the design doc to understand the to_backend api. Figured I could add some comments to these files to make the apis a little clearer for those that come after. (Note: this ignores all push blocking failures!) 
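As a quick worked check of the version-trimming expression introduced in docs/source/conf.py above (a sketch, not part of the patch; the version string below is a hypothetical nightly-style example):

torch_version = "1.11.0a0+git1234abc"          # made-up full version string
version = ".".join(torch_version.split(".")[:2])
print(version)                                  # -> "1.11": the patch level and the 'aHASH' suffix both drop out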
Test Plan: na Reviewed By: raziel, larryliu0820 Differential Revision: D34221989 fbshipit-source-id: 699fcbd8714bfb6b58c6c0bf0e5fbc019d2ef6f8 (cherry picked from commit 0b3f5d73e8de216b4402aed2f2b80be8781f145b) --- test/cpp/jit/test_backend_compiler_lib.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/test/cpp/jit/test_backend_compiler_lib.cpp b/test/cpp/jit/test_backend_compiler_lib.cpp index 0db8bd428e9..1bdf255a0d5 100644 --- a/test/cpp/jit/test_backend_compiler_lib.cpp +++ b/test/cpp/jit/test_backend_compiler_lib.cpp @@ -72,7 +72,12 @@ class BackendWithCompiler : public PyTorchBackendInterface { return true; } - // Since the actual compilation is done AOT, + // Since the actual compilation is done AOT for this backend, compile just + // forwards everything along. In a non toy setup this could grab information + // from that runtime that might be relevant to execute, such as build flags + // the resolution of the devices camera, or basically any runtime specific + // information that wouldnt be available server side where preprocess is + // called. c10::impl::GenericDict compile( c10::IValue processed, c10::impl::GenericDict method_compile_spec) override { @@ -86,8 +91,14 @@ class BackendWithCompiler : public PyTorchBackendInterface { return c10::impl::toGenericDict(handles); } + // Function that actually executes the model in the backend. Here there is + // nothing to dispatch to, so the backend is implemented locally within + // execute and it only supports add, subtract, and constant. In a non toy + // backend you can imagine how this function could be used to actually + // dispatch the inputs to the relevant backend/device. c10::impl::GenericList execute( - c10::IValue handle, + c10::IValue + handle, // example: [('prim::Constant#1', 14), ('aten::add', 15)] c10::impl::GenericList inputs) override { TORCH_INTERNAL_ASSERT(inputs.size() == 2); c10::IValue val0 = inputs[0]; @@ -107,6 +118,7 @@ class BackendWithCompiler : public PyTorchBackendInterface { auto start_time_us = torch::profiler::impl::getTime() / 1000; try { if (instruction.rfind("prim::Constant", 0) == 0) { + // 15 is the length of 'prim::Constant#' the constant val comes after TORCH_CHECK( instruction.size() > 15, "Constant value is expected in ", From 2d110d514f9611dd00bf63ae5ef7d5ce017c900f Mon Sep 17 00:00:00 2001 From: jiej Date: Mon, 14 Feb 2022 16:37:45 -0800 Subject: [PATCH 025/199] Nvfuser code bump 2_1_2022 (#72127) Summary: Things changed in this PR that requires review: 1. aten/src/ATen/core/interned_strings.h 2. torch/csrc/jit/ir/alias_analysis.h : exposing createValue to allow efficient mutation 3. torch/csrc/jit/runtime/symbolic_shape_registry.cpp : added gelu/tanh/erf in registry 4. torch/jit/_script.py : throws scripting model sees autocast as decorator since it's not supported nvfuser code update: 1. codegen improvements and performance tuning 2. integration bug fixes for shape expression logic 3. kernel segmentation update to address perf regression from horizontal fusion 4. 
scalar cpu tensor promotion to support inter-device operation between cpu scalar tensor and cuda tensor Things reverted from local changes: aten::gelu with approximation (tracked in PR: https://github.com/pytorch/pytorch/pull/61439) Pull Request resolved: https://github.com/pytorch/pytorch/pull/72127 Reviewed By: HamidShojanazeri Differential Revision: D34113233 Pulled By: jbschlosser fbshipit-source-id: b82cde32b71e324eca0ea57cb8c9f9647278ca74 (cherry picked from commit e009bc5c4e943211c4953e6fdf7c9913fa66b3c9) --- aten/src/ATen/core/interned_strings.h | 4 + benchmarks/cpp/nvfuser/batch_norm.cpp | 5 +- .../cpp/nvfuser/batch_norm_backward.cpp | 3 +- benchmarks/cpp/nvfuser/bert.cpp | 18 +- benchmarks/cpp/nvfuser/gelu_backward.cpp | 19 +- benchmarks/cpp/nvfuser/heuristic_cache.cpp | 3 +- benchmarks/cpp/nvfuser/heuristic_lookup.cpp | 3 +- benchmarks/cpp/nvfuser/instance_norm.cpp | 5 +- benchmarks/cpp/nvfuser/layer_norm.cpp | 3 +- .../cpp/nvfuser/layer_norm_backward.cpp | 3 +- benchmarks/cpp/nvfuser/shape_inference.cpp | 3 +- benchmarks/cpp/nvfuser/softmax_dropout.cpp | 7 +- benchmarks/cpp/nvfuser/utils.cpp | 10 +- test/cpp/jit/test_gpu.cpp | 4127 ++++++++++------- test/cpp/jit/test_gpu_shift.cpp | 1918 +++++--- test/cpp/jit/test_gpu_validator.h | 26 +- test/test_jit_cuda_fuser.py | 554 ++- tools/build_variables.bzl | 12 +- torch/csrc/jit/codegen/cuda/arith.cpp | 396 +- torch/csrc/jit/codegen/cuda/arith.h | 76 +- torch/csrc/jit/codegen/cuda/codegen.cpp | 653 +-- torch/csrc/jit/codegen/cuda/codegen.h | 2 +- torch/csrc/jit/codegen/cuda/compute_at.cpp | 60 +- torch/csrc/jit/codegen/cuda/compute_at.h | 10 +- .../csrc/jit/codegen/cuda/compute_at_map.cpp | 99 - torch/csrc/jit/codegen/cuda/compute_at_map.h | 27 - torch/csrc/jit/codegen/cuda/dispatch.cpp | 496 +- torch/csrc/jit/codegen/cuda/dispatch.h | 449 +- .../jit/codegen/cuda/evaluator_common.cpp | 113 +- .../csrc/jit/codegen/cuda/evaluator_common.h | 55 +- torch/csrc/jit/codegen/cuda/executor.cpp | 146 +- torch/csrc/jit/codegen/cuda/executor.h | 19 +- .../jit/codegen/cuda/executor_kernel_arg.cpp | 76 +- .../jit/codegen/cuda/executor_kernel_arg.h | 35 +- .../csrc/jit/codegen/cuda/executor_utils.cpp | 373 +- torch/csrc/jit/codegen/cuda/executor_utils.h | 41 +- torch/csrc/jit/codegen/cuda/expr_evaluator.h | 2 +- torch/csrc/jit/codegen/cuda/fusion.cpp | 293 +- torch/csrc/jit/codegen/cuda/fusion.h | 95 +- .../jit/codegen/cuda/fusion_segmenter.cpp | 66 +- .../csrc/jit/codegen/cuda/fusion_segmenter.h | 8 +- torch/csrc/jit/codegen/cuda/graph_fuser.cpp | 338 +- torch/csrc/jit/codegen/cuda/index_compute.cpp | 1333 +++--- torch/csrc/jit/codegen/cuda/index_compute.h | 113 +- .../codegen/cuda/index_reference_replay.cpp | 82 +- .../jit/codegen/cuda/index_reference_replay.h | 15 +- .../csrc/jit/codegen/cuda/instrumentation.cpp | 2 +- torch/csrc/jit/codegen/cuda/interface.cpp | 374 +- torch/csrc/jit/codegen/cuda/interface.h | 2 +- torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp | 136 +- torch/csrc/jit/codegen/cuda/ir_base_nodes.h | 148 +- .../{kernel_ir_builder.cpp => ir_builder.cpp} | 220 +- torch/csrc/jit/codegen/cuda/ir_builder.h | 127 + torch/csrc/jit/codegen/cuda/ir_cloner.cpp | 49 +- torch/csrc/jit/codegen/cuda/ir_cloner.h | 21 +- torch/csrc/jit/codegen/cuda/ir_container.cpp | 279 ++ torch/csrc/jit/codegen/cuda/ir_container.h | 174 + torch/csrc/jit/codegen/cuda/ir_graphviz.cpp | 5 +- torch/csrc/jit/codegen/cuda/ir_graphviz.h | 2 +- .../jit/codegen/cuda/ir_interface_nodes.h | 86 +- .../csrc/jit/codegen/cuda/ir_internal_nodes.h | 124 +- 
torch/csrc/jit/codegen/cuda/ir_iostream.cpp | 419 +- torch/csrc/jit/codegen/cuda/ir_iostream.h | 75 +- torch/csrc/jit/codegen/cuda/ir_nodes.cpp | 355 +- torch/csrc/jit/codegen/cuda/ir_printer.h | 2 +- torch/csrc/jit/codegen/cuda/ir_utils.cpp | 88 +- torch/csrc/jit/codegen/cuda/ir_utils.h | 9 +- torch/csrc/jit/codegen/cuda/iter_visitor.cpp | 136 +- torch/csrc/jit/codegen/cuda/iter_visitor.h | 57 +- torch/csrc/jit/codegen/cuda/kernel.cpp | 201 +- torch/csrc/jit/codegen/cuda/kernel.h | 120 +- torch/csrc/jit/codegen/cuda/kernel_cache.cpp | 47 +- torch/csrc/jit/codegen/cuda/kernel_cache.h | 2 +- .../codegen/cuda/kernel_expr_evaluator.cpp | 44 +- .../jit/codegen/cuda/kernel_expr_evaluator.h | 15 +- torch/csrc/jit/codegen/cuda/kernel_ir.cpp | 531 +-- torch/csrc/jit/codegen/cuda/kernel_ir.h | 1196 +---- .../csrc/jit/codegen/cuda/kernel_ir_builder.h | 131 - .../jit/codegen/cuda/kernel_ir_dispatch.cpp | 180 + .../jit/codegen/cuda/kernel_ir_dispatch.h | 118 + .../jit/codegen/cuda/kernel_ir_printer.cpp | 451 -- .../csrc/jit/codegen/cuda/kernel_ir_printer.h | 129 - torch/csrc/jit/codegen/cuda/lower2device.cpp | 539 +-- torch/csrc/jit/codegen/cuda/lower2device.h | 45 +- .../jit/codegen/cuda/lower_alias_memory.cpp | 119 +- .../jit/codegen/cuda/lower_alias_memory.h | 5 +- .../jit/codegen/cuda/lower_allocation.cpp | 428 +- .../csrc/jit/codegen/cuda/lower_allocation.h | 7 +- .../jit/codegen/cuda/lower_double_buffer.cpp | 508 ++ .../jit/codegen/cuda/lower_double_buffer.h | 142 + .../csrc/jit/codegen/cuda/lower_expr_sort.cpp | 4 +- .../codegen/cuda/lower_fusion_simplifier.cpp | 119 + .../codegen/cuda/lower_fusion_simplifier.h | 26 + torch/csrc/jit/codegen/cuda/lower_index.cpp | 215 +- torch/csrc/jit/codegen/cuda/lower_index.h | 49 +- .../jit/codegen/cuda/lower_insert_syncs.cpp | 631 ++- .../jit/codegen/cuda/lower_insert_syncs.h | 40 +- torch/csrc/jit/codegen/cuda/lower_loops.cpp | 62 +- torch/csrc/jit/codegen/cuda/lower_loops.h | 11 +- .../jit/codegen/cuda/lower_magic_zero.cpp | 113 +- .../csrc/jit/codegen/cuda/lower_magic_zero.h | 6 +- .../cuda/lower_misaligned_vectorization.cpp | 330 +- .../cuda/lower_misaligned_vectorization.h | 7 +- .../csrc/jit/codegen/cuda/lower_predicate.cpp | 145 +- torch/csrc/jit/codegen/cuda/lower_predicate.h | 16 +- .../jit/codegen/cuda/lower_replace_size.cpp | 288 ++ .../jit/codegen/cuda/lower_replace_size.h | 25 + torch/csrc/jit/codegen/cuda/lower_shift.cpp | 270 +- torch/csrc/jit/codegen/cuda/lower_shift.h | 36 +- .../codegen/cuda/lower_thread_predicate.cpp | 48 +- .../jit/codegen/cuda/lower_thread_predicate.h | 6 +- .../codegen/cuda/lower_trivial_broadcast.cpp | 119 + .../codegen/cuda/lower_trivial_broadcast.h | 51 + .../codegen/cuda/lower_trivial_reductions.cpp | 25 +- .../codegen/cuda/lower_trivial_reductions.h | 16 +- torch/csrc/jit/codegen/cuda/lower_unroll.cpp | 101 +- torch/csrc/jit/codegen/cuda/lower_unroll.h | 24 +- torch/csrc/jit/codegen/cuda/lower_utils.cpp | 382 +- torch/csrc/jit/codegen/cuda/lower_utils.h | 135 +- .../jit/codegen/cuda/lower_validation.cpp | 23 +- .../csrc/jit/codegen/cuda/lower_validation.h | 2 +- .../jit/codegen/cuda/lower_warp_reduce.cpp | 184 +- .../csrc/jit/codegen/cuda/lower_warp_reduce.h | 2 +- torch/csrc/jit/codegen/cuda/manager.cpp | 20 + torch/csrc/jit/codegen/cuda/manager.h | 2 +- torch/csrc/jit/codegen/cuda/mutator.cpp | 416 +- torch/csrc/jit/codegen/cuda/mutator.h | 2 +- .../jit/codegen/cuda/non_divisible_split.h | 2 +- torch/csrc/jit/codegen/cuda/ops/alias.cpp | 115 + torch/csrc/jit/codegen/cuda/ops/alias.h | 38 + 
torch/csrc/jit/codegen/cuda/ops/all_ops.h | 1 + torch/csrc/jit/codegen/cuda/ops/composite.cpp | 114 +- torch/csrc/jit/codegen/cuda/ops/composite.h | 8 +- .../jit/codegen/cuda/ops/normalization.cpp | 67 +- .../csrc/jit/codegen/cuda/ops/normalization.h | 2 +- .../codegen/cuda/parallel_dimension_map.cpp | 51 +- .../jit/codegen/cuda/parallel_dimension_map.h | 6 +- .../jit/codegen/cuda/parallel_type_bitmap.h | 2 +- torch/csrc/jit/codegen/cuda/parser.cpp | 302 +- torch/csrc/jit/codegen/cuda/parser.h | 2 +- .../jit/codegen/cuda/partial_split_map.cpp | 28 +- .../csrc/jit/codegen/cuda/partial_split_map.h | 6 +- torch/csrc/jit/codegen/cuda/partition.cpp | 69 +- torch/csrc/jit/codegen/cuda/partition.h | 2 +- .../jit/codegen/cuda/predicate_compute.cpp | 258 +- .../csrc/jit/codegen/cuda/predicate_compute.h | 39 +- .../csrc/jit/codegen/cuda/reference_tensor.h | 2 +- .../csrc/jit/codegen/cuda/root_domain_map.cpp | 64 +- torch/csrc/jit/codegen/cuda/root_domain_map.h | 19 +- .../codegen/cuda/runtime/block_sync_atomic.cu | 6 +- .../codegen/cuda/runtime/grid_reduction.cu | 2 +- .../jit/codegen/cuda/runtime/grid_sync.cu | 6 +- .../csrc/jit/codegen/cuda/runtime/helpers.cu | 16 + torch/csrc/jit/codegen/cuda/runtime/tensor.cu | 10 + .../csrc/jit/codegen/cuda/runtime/welford.cu | 16 +- .../codegen/cuda/scheduler/normalization.cpp | 52 +- .../jit/codegen/cuda/scheduler/pointwise.cpp | 16 +- .../jit/codegen/cuda/scheduler/pointwise.h | 5 + .../jit/codegen/cuda/scheduler/reduction.cpp | 121 +- .../cuda/scheduler/reduction_heuristic.h | 34 +- .../cuda/scheduler/reduction_utils.cpp | 395 +- .../jit/codegen/cuda/scheduler/registry.cpp | 119 +- .../csrc/jit/codegen/cuda/scheduler/utils.cpp | 38 +- torch/csrc/jit/codegen/cuda/tensor_view.cpp | 194 +- .../csrc/jit/codegen/cuda/transform_iter.cpp | 8 +- torch/csrc/jit/codegen/cuda/transform_iter.h | 2 +- .../jit/codegen/cuda/transform_replay.cpp | 38 +- .../csrc/jit/codegen/cuda/transform_replay.h | 2 +- .../jit/codegen/cuda/transform_rfactor.cpp | 38 +- .../csrc/jit/codegen/cuda/transform_rfactor.h | 2 +- .../csrc/jit/codegen/cuda/transform_view.cpp | 164 +- torch/csrc/jit/codegen/cuda/transform_view.h | 12 +- torch/csrc/jit/codegen/cuda/type.cpp | 37 +- torch/csrc/jit/codegen/cuda/type.h | 16 +- .../csrc/jit/codegen/cuda/type_inference.cpp | 16 +- .../csrc/jit/codegen/cuda/type_promotion.cpp | 10 +- torch/csrc/jit/codegen/cuda/utils.cpp | 21 + torch/csrc/jit/codegen/cuda/utils.h | 8 +- torch/csrc/jit/ir/alias_analysis.h | 6 +- torch/jit/_script.py | 4 + 180 files changed, 14997 insertions(+), 11607 deletions(-) rename torch/csrc/jit/codegen/cuda/{kernel_ir_builder.cpp => ir_builder.cpp} (50%) create mode 100644 torch/csrc/jit/codegen/cuda/ir_builder.h create mode 100644 torch/csrc/jit/codegen/cuda/ir_container.cpp create mode 100644 torch/csrc/jit/codegen/cuda/ir_container.h delete mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_builder.h create mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp create mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h delete mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp delete mode 100644 torch/csrc/jit/codegen/cuda/kernel_ir_printer.h create mode 100644 torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp create mode 100644 torch/csrc/jit/codegen/cuda/lower_double_buffer.h create mode 100644 torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp create mode 100644 torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h create mode 100644 torch/csrc/jit/codegen/cuda/lower_replace_size.cpp create 
mode 100644 torch/csrc/jit/codegen/cuda/lower_replace_size.h create mode 100644 torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp create mode 100644 torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h create mode 100644 torch/csrc/jit/codegen/cuda/ops/alias.cpp create mode 100644 torch/csrc/jit/codegen/cuda/ops/alias.h diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 36fb0f91e4c..b2d6a43731f 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -45,6 +45,10 @@ namespace c10 { _(prim, CudaFusionGuard) \ _(prim, FunctionalGraph) \ _(prim, add_optional) \ + _(prim, view_copy) \ + _(prim, reshape_copy) \ + _(prim, squeeze_copy) \ + _(prim, unsqueeze_copy) \ _(prim, DifferentiableGraph) \ _(prim, TensorExprGroup) \ _(prim, TensorExprDynamicGroup) \ diff --git a/benchmarks/cpp/nvfuser/batch_norm.cpp b/benchmarks/cpp/nvfuser/batch_norm.cpp index ef6bdd667d6..57e889b19fb 100644 --- a/benchmarks/cpp/nvfuser/batch_norm.cpp +++ b/benchmarks/cpp/nvfuser/batch_norm.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -44,8 +45,8 @@ static void setupBatchNorm(Fusion* fusion, DataType dtype) { bias = castOp(DataType::Float, bias); } - auto momentum_ptr = new Double(kMomentum); - auto eps_ptr = new Double(kEps); + auto momentum_ptr = IrBuilder::create(kMomentum); + auto eps_ptr = IrBuilder::create(kEps); auto result = batch_norm( input, diff --git a/benchmarks/cpp/nvfuser/batch_norm_backward.cpp b/benchmarks/cpp/nvfuser/batch_norm_backward.cpp index e4a9fdcb034..77a09564de5 100644 --- a/benchmarks/cpp/nvfuser/batch_norm_backward.cpp +++ b/benchmarks/cpp/nvfuser/batch_norm_backward.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,7 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) { grad_output = castOp(DataType::Float, grad_output); } - auto eps_ptr = new Double(kEps); + auto eps_ptr = IrBuilder::create(kEps); auto result = batch_norm_backward( input, diff --git a/benchmarks/cpp/nvfuser/bert.cpp b/benchmarks/cpp/nvfuser/bert.cpp index f8a389331ee..a1dd58d5646 100644 --- a/benchmarks/cpp/nvfuser/bert.cpp +++ b/benchmarks/cpp/nvfuser/bert.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { fusion->addInput(tv1); // TODO: should be input - auto d16 = new Double(1.0); + auto d16 = IrBuilder::create(1.0); if (is_fp16) { tv0 = castOp(DataType::Float, tv0); @@ -47,7 +48,7 @@ static void setupDivMaxSoftmaxDropoutForward(Fusion* fusion, DataType dtype) { auto tv3 = add(tv2, tv0); auto tv10 = softmax(tv3, 3); - auto dropout_tvs = dropout(tv10, new Double(0.9)); + auto dropout_tvs = dropout(tv10, IrBuilder::create(0.9)); auto tv12 = dropout_tvs.mask; auto tv14 = dropout_tvs.output; @@ -83,9 +84,9 @@ static void setupDivMaxSoftmaxDropoutBackward(Fusion* fusion, DataType dtype) { } // TODO: should be inputs - auto d32 = new Double(1.0); + auto d32 = IrBuilder::create(1.0); // fusion->addInput(d32); - auto d33 = new Double(2.0); + auto d33 = IrBuilder::create(2.0); // fusion->addInput(d33); auto tv4 = mul(tv2, tv3); @@ -252,14 +253,15 @@ static void setupBiasDropoutAddLayernormFwd(Fusion* fusion, DataType dtype) { auto tv5 = broadcast(tv4, {true, true, false}); auto tv6 = add(tv3, tv5); - auto dropout_outs = dropout(tv6, new Double(0.9)); + auto dropout_outs = dropout(tv6, IrBuilder::create(0.9)); 
auto tv8 = dropout_outs.output; auto tv10 = dropout_outs.mask; auto tv11 = add(tv10, tv2); - auto layer_norm_outs = layer_norm(tv11, 1, tv0, tv1, new Double(1e-5)); + auto layer_norm_outs = + layer_norm(tv11, 1, tv0, tv1, IrBuilder::create(1e-5)); auto tv14 = layer_norm_outs.output; auto tv21 = layer_norm_outs.mean; auto tv26 = layer_norm_outs.invstd; @@ -481,7 +483,7 @@ static void setupBiasDropoutAddLayernormBwd2(Fusion* fusion, DataType dtype) { tv1 = castOp(DataType::Float, tv1); tv8 = castOp(DataType::Float, tv8); } - auto d36 = mul(new Double(1.0), tv1->axis(2)->extent()); + auto d36 = mul(IrBuilder::create(1.0), tv1->axis(2)->extent()); auto d47 = unaryOp(UnaryOpType::Reciprocal, d36); auto tv9 = broadcast(tv5, {true, true, false}); @@ -583,7 +585,7 @@ static void setupBiasDropoutAddLayernormBwd3(Fusion* fusion, DataType dtype) { } // Uncertain this is the right value, but going for it anyways - auto d34 = div(new Double(1.0), tv0->axis(2)->extent()); + auto d34 = div(IrBuilder::create(1.0), tv0->axis(2)->extent()); auto tv25 = mul(tv21, tv0); auto tv26 = mul(tv25, d34); diff --git a/benchmarks/cpp/nvfuser/gelu_backward.cpp b/benchmarks/cpp/nvfuser/gelu_backward.cpp index 9d53d9c2759..f1811795462 100644 --- a/benchmarks/cpp/nvfuser/gelu_backward.cpp +++ b/benchmarks/cpp/nvfuser/gelu_backward.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -41,23 +42,23 @@ static void setupFusion(Fusion* fusion) { auto t5 = castOp(DataType::Float, t4); auto t6 = broadcast(t3, {true, true, false}); auto t7 = add(t6, t5); - auto t8 = mul(t7, new Double(k_079)); - auto t9 = mul(t7, new Double(k_004)); + auto t8 = mul(t7, IrBuilder::create(k_079)); + auto t9 = mul(t7, IrBuilder::create(k_004)); auto t10 = mul(t9, t7); - auto t11 = add(t10, new Int(1)); + auto t11 = add(t10, IrBuilder::create(1)); auto t12 = mul(t8, t11); auto t13 = unaryOp(UnaryOpType::Tanh, t12); - auto t14 = mul(t7, new Double(0.5)); + auto t14 = mul(t7, IrBuilder::create(0.5)); auto t15 = mul(t13, t13); auto t16 = unaryOp(UnaryOpType::Neg, t15); - auto t17 = add(t16, new Int(1)); - auto t18 = mul(t7, new Double(k_010)); + auto t17 = add(t16, IrBuilder::create(1)); + auto t18 = mul(t7, IrBuilder::create(k_010)); auto t19 = mul(t18, t7); - auto t20 = add(t19, new Double(k_079)); + auto t20 = add(t19, IrBuilder::create(k_079)); auto t21 = mul(t17, t20); auto t22 = mul(t14, t21); - auto t23 = add(t13, new Int(1)); - auto t24 = mul(t23, new Double(0.5)); + auto t23 = add(t13, IrBuilder::create(1)); + auto t24 = mul(t23, IrBuilder::create(0.5)); auto t25 = add(t22, t24); auto t26 = mul(t25, t1); diff --git a/benchmarks/cpp/nvfuser/heuristic_cache.cpp b/benchmarks/cpp/nvfuser/heuristic_cache.cpp index 22b8ec4ce97..65f850a016c 100644 --- a/benchmarks/cpp/nvfuser/heuristic_cache.cpp +++ b/benchmarks/cpp/nvfuser/heuristic_cache.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -129,7 +130,7 @@ static auto getLayerForwardNormRuntime( Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); diff --git a/benchmarks/cpp/nvfuser/heuristic_lookup.cpp b/benchmarks/cpp/nvfuser/heuristic_lookup.cpp index 22b8ec4ce97..65f850a016c 100644 --- a/benchmarks/cpp/nvfuser/heuristic_lookup.cpp +++ b/benchmarks/cpp/nvfuser/heuristic_lookup.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -129,7 +130,7 @@ 
static auto getLayerForwardNormRuntime( Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); diff --git a/benchmarks/cpp/nvfuser/instance_norm.cpp b/benchmarks/cpp/nvfuser/instance_norm.cpp index 395ac6c8c9c..007291d75f5 100644 --- a/benchmarks/cpp/nvfuser/instance_norm.cpp +++ b/benchmarks/cpp/nvfuser/instance_norm.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -39,8 +40,8 @@ static void setupInstanceNorm(Fusion* fusion, DataType dtype) { const bool kTraining = true; const float kMomentum = 0.1; const float kEps = 1e-5; - auto momentum_ptr = new Double(kMomentum); - auto eps_ptr = new Double(kEps); + auto momentum_ptr = IrBuilder::create(kMomentum); + auto eps_ptr = IrBuilder::create(kEps); auto norm = instance_norm( input, diff --git a/benchmarks/cpp/nvfuser/layer_norm.cpp b/benchmarks/cpp/nvfuser/layer_norm.cpp index c4f79b2b668..7500ac8525b 100644 --- a/benchmarks/cpp/nvfuser/layer_norm.cpp +++ b/benchmarks/cpp/nvfuser/layer_norm.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,7 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) { const int kReductionAxis = 1; const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); // setup fusion auto input = makeContigTensor(2, dtype); diff --git a/benchmarks/cpp/nvfuser/layer_norm_backward.cpp b/benchmarks/cpp/nvfuser/layer_norm_backward.cpp index 43eafcc42fb..045465e7125 100644 --- a/benchmarks/cpp/nvfuser/layer_norm_backward.cpp +++ b/benchmarks/cpp/nvfuser/layer_norm_backward.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -22,7 +23,7 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) { TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half); const int kReductionAxis = 1; - Double* eps_ptr = new Double(1e-5); + Double* eps_ptr = IrBuilder::create(1e-5); // setup fusion auto grad_out = makeContigTensor(2, dtype); diff --git a/benchmarks/cpp/nvfuser/shape_inference.cpp b/benchmarks/cpp/nvfuser/shape_inference.cpp index 33a9404b073..15acc51bb37 100644 --- a/benchmarks/cpp/nvfuser/shape_inference.cpp +++ b/benchmarks/cpp/nvfuser/shape_inference.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -151,7 +152,7 @@ static auto getLayerForwardNormRuntime( Fusion& fusion = *fusion_ptr.get(); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); auto input = makeSymbolicTensor(shape.size()); fusion.addInput(input); diff --git a/benchmarks/cpp/nvfuser/softmax_dropout.cpp b/benchmarks/cpp/nvfuser/softmax_dropout.cpp index b4890eaf8d8..828940933f4 100644 --- a/benchmarks/cpp/nvfuser/softmax_dropout.cpp +++ b/benchmarks/cpp/nvfuser/softmax_dropout.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +36,7 @@ static void setupSoftmaxDropout( auto attention_scores = makeContigTensor(4, dtype); auto attention_mask = makeContigTensor(4, dtype); - Double* divisor = new Double(); + Double* divisor = IrBuilder::create(); fusion->addInput(attention_scores); fusion->addInput(attention_mask); @@ -49,8 +50,8 @@ static void setupSoftmaxDropout( attention_scores = div(attention_scores, divisor); attention_scores = add(attention_scores, attention_mask); auto 
attention_probs = softmax(attention_scores, kReductionAxis); - auto prob = new Double(kDropoutProbability); - auto scale = new Double(kScale); + auto prob = IrBuilder::create(kDropoutProbability); + auto scale = IrBuilder::create(kScale); auto dropout_results = dropout(attention_probs, prob, scale); auto output = dropout_results.output; diff --git a/benchmarks/cpp/nvfuser/utils.cpp b/benchmarks/cpp/nvfuser/utils.cpp index 053fc693908..daf2b21a053 100644 --- a/benchmarks/cpp/nvfuser/utils.cpp +++ b/benchmarks/cpp/nvfuser/utils.cpp @@ -16,8 +16,8 @@ std::string toString(ReductionParams rparams) { if (rparams.schedule_3D) { ss << "3D Schedule // " << "Outer Reduction: " - << (rparams.cross_block_outer_reduce ? "cross block / " : "") - << (rparams.cross_grid_outer_reduce ? "cross grid / " : "") + << (rparams.cross_block_outer_reduction ? "cross block / " : "") + << (rparams.cross_grid_outer_reduction ? "cross grid / " : "") << (rparams.split_grid_dim_outer_reduction ? "split grid dim / " : ""); if (rparams.batches_per_block_outer_reduction > 1 || rparams.persistent_kernel) { @@ -38,9 +38,9 @@ std::string toString(ReductionParams rparams) { } ss << " // Inner Reduction Domain: " - << (rparams.cross_block_inner_reduce ? "cross block reduction / " : "") + << (rparams.cross_block_inner_reduction ? "cross block reduction / " : "") << (rparams.pad_inner_reduction_to_warp ? "pad to warp / " : "") - << (rparams.cross_grid_inner_reduce ? "cross grid reduction / " : ""); + << (rparams.cross_grid_inner_reduction ? "cross grid reduction / " : ""); if (rparams.batches_per_block_inner_reduction > 1 || rparams.persistent_kernel) { @@ -48,7 +48,7 @@ std::string toString(ReductionParams rparams) { << " / "; } - ss << (rparams.cross_grid_inner_reduce && + ss << (rparams.cross_grid_inner_reduction && rparams.split_grid_dim_inner_reduction ? 
"split grid dimension / " : "") diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp index f229ac2679e..b7a4489abe9 100644 --- a/test/cpp/jit/test_gpu.cpp +++ b/test/cpp/jit/test_gpu.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -18,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -86,19 +87,95 @@ void checkIntValue( void checkIntValue( kir::ExpressionEvaluator& evaluator, - const kir::Val* val, - kir::Int::ScalarType expected_value) { + const Val* val, + Int::ScalarType expected_value) { const auto actual_value = evaluator.evaluate(val); TORCH_CHECK(actual_value.has_value()); TORCH_CHECK(actual_value.value() == expected_value); } -bool isPredicated(TensorView* tv, GpuLower& gpulw) { - auto parent_scope = gpulw.lowerValue(tv)->definition()->parentScope(); - if (parent_scope->isA()) { - return !parent_scope->predicate()->value()->isConst(); +TensorView* loweredTv(TensorView* tv, GpuLower& gpulw) { + auto used_tvs = ir_utils::allTvs(gpulw.kernel()->as()); + TensorView* matching_tv = nullptr; + for (auto lowered_tv : used_tvs) { + if (lowered_tv->name() == tv->name()) { + matching_tv = lowered_tv; + } + } + TORCH_INTERNAL_ASSERT(matching_tv != nullptr); + return matching_tv; +} + +class PredicatedChecker : public kir::IrVisitor { + public: + // Checks if the provided tv is written to within a non-trivial conditional + static bool isPredicated(TensorView* tv, GpuLower& gpulw) { + PredicatedChecker checker( + loweredTv(tv, gpulw), gpulw.kernel()->topLevelExprs()); + return checker.is_predicated_; + } + + private: + PredicatedChecker() = delete; + + PredicatedChecker(TensorView* tv, std::vector exprs) : tv_(tv) { + kir::IrVisitor::handle(exprs); + } + + using kir::IrVisitor::handle; + bool is_predicated_ = false; + bool predicated_ite_ = false; + TensorView* tv_ = nullptr; + + void handle(kir::IfThenElse* ite) final { + auto prev_ite = predicated_ite_; + predicated_ite_ = !ite->predicate()->value()->isConstScalar(); + kir::IrVisitor::handle(ite); + predicated_ite_ = prev_ite; + } + + void handle(Expr* expr) final { + if (expr->outputs().size() && expr->outputs()[0]->isA()) { + auto ti = expr->outputs()[0]->as(); + if (ti->view() == tv_) { + is_predicated_ = is_predicated_ | predicated_ite_; + } + } + kir::IrVisitor::handle(expr); + } +}; + +class UnswitchInElseChecker : public kir::IrVisitor { + public: + // Checks if there are any unswitched for loops within an else clause + static bool check(GpuLower& gpulw) { + UnswitchInElseChecker checker(gpulw.kernel()->topLevelExprs()); + return checker.found_in_else_; + } + + private: + UnswitchInElseChecker() = delete; + UnswitchInElseChecker(std::vector exprs) { + kir::IrVisitor::handle(exprs); + } + + using kir::IrVisitor::handle; + bool within_else_ = false; + bool found_in_else_ = false; + + void handle(kir::IfThenElse* ite) final { + auto prev_within_else = within_else_; + within_else_ = true; + kir::IrVisitor::handle(ite->elseBody().exprs()); + within_else_ = prev_within_else; + } + + void handle(kir::ForLoop* for_loop) final { + if (for_loop->iter_domain()->getParallelType() == ParallelType::Unswitch) { + found_in_else_ = found_in_else_ || within_else_; + } + kir::IrVisitor::handle(for_loop); } - return true; }; } // namespace @@ -110,7 +187,7 @@ bool isPredicated(TensorView* tv, GpuLower& gpulw) { // (These tests exercise IrGraphGenerator through a non-trivial IR, // to make sure that it runs w/o crashing. 
The actual output is not // validated) -TEST(NVFuserTest, IrGraphGenerator_CUDA) { +TEST_F(NVFuserTest, FusionIrGraphGenerator_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -123,10 +200,12 @@ TEST(NVFuserTest, IrGraphGenerator_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv2 = add(tv0, new Double(3.141)); + TensorView* tv2 = add(tv0, IrBuilder::create(3.141)); TensorView* tv3 = broadcast(tv0, {false, true, false, true}); - TensorView* tv4 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv3); - TensorView* tv5 = clamp(tv4, new Double(0.f), new Double(1.f)); + TensorView* tv4 = + reductionOp(BinaryOpType::Add, {2}, IrBuilder::create(0), tv3); + TensorView* tv5 = clamp( + tv4, IrBuilder::create(0.f), IrBuilder::create(1.f)); TensorView* tv6 = add(tv2, tv2); // Another checkpoint before adding outputs @@ -149,7 +228,7 @@ TEST(NVFuserTest, IrGraphGenerator_CUDA) { .empty()); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); tv->axis(-1)->parallelize(ParallelType::TIDx); @@ -162,11 +241,11 @@ TEST(NVFuserTest, IrGraphGenerator_CUDA) { .empty()); } -TEST(NVFuserTest, FusionDispatch_CUDA) { +TEST_F(NVFuserTest, FusionDispatch_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - Double* f = new Double{2.f}; + Double* f = IrBuilder::create(2.f); std::stringstream ss1, ss2, ss3; ss1 << f; ss2 << static_cast(f); @@ -177,14 +256,14 @@ TEST(NVFuserTest, FusionDispatch_CUDA) { } // Evaluate basic scalar operations with constant values -TEST(NVFuserTest, FusionExprEvalConstants_CUDA) { +TEST_F(NVFuserTest, FusionExprEvalConstants_CUDA) { Fusion fusion; FusionGuard fg(&fusion); ExpressionEvaluator evaluator(&fusion); - auto* a = new Int(7); - auto* b = new Int(3); + auto* a = IrBuilder::create(7); + auto* b = IrBuilder::create(3); // Avoid div operation because it casts int operands to float checkIntValue(evaluator, neg(a), -7); @@ -195,17 +274,17 @@ TEST(NVFuserTest, FusionExprEvalConstants_CUDA) { } // Evaluate basic scalar operations with bound values -TEST(NVFuserTest, FusionExprEvalBindings_CUDA) { +TEST_F(NVFuserTest, FusionExprEvalBindings_CUDA) { Fusion fusion; FusionGuard fg(&fusion); ExpressionEvaluator evaluator(&fusion); - auto* a = new Int(); - auto* b = new Int(); + auto* a = IrBuilder::create(); + auto* b = IrBuilder::create(); auto* c = add(a, b); auto* d = neg(ceilDiv(c, b)); - auto* e = new Int(0); + auto* e = IrBuilder::create(0); // trying to evaluate before binding should give empty results TORCH_CHECK(!evaluator.evaluate(a).has_value()); @@ -240,7 +319,7 @@ TEST(NVFuserTest, FusionExprEvalBindings_CUDA) { } // Evaluate expressions in a simple IR -TEST(NVFuserTest, FusionExprEvalBasic_CUDA) { +TEST_F(NVFuserTest, FusionExprEvalBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -251,7 +330,7 @@ TEST(NVFuserTest, FusionExprEvalBasic_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - TensorView* tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); fusion.addOutput(tv3); @@ -296,16 +375,16 @@ TEST(NVFuserTest, FusionExprEvalBasic_CUDA) { } // Evaluate expressions in a more complex IR -TEST(NVFuserTest, FusionExprEvalComplex_CUDA) { +TEST_F(NVFuserTest, FusionExprEvalComplex_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(-1.0)); - 
TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(-1.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(3.0)); + TensorView* tv3 = mul(tv0, IrBuilder::create(2.0)); TensorView* tv4 = add(tv2, tv1); TensorView* tv5 = add(tv4, tv3); TensorView* tv6 = add(tv0, tv3); @@ -348,7 +427,7 @@ TEST(NVFuserTest, FusionExprEvalComplex_CUDA) { } // Evaluate expressions post lowering -TEST(NVFuserTest, FusionExprEvalPostLower_CUDA) { +TEST_F(NVFuserTest, FusionExprEvalPostLower_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -359,7 +438,7 @@ TEST(NVFuserTest, FusionExprEvalPostLower_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - TensorView* tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); fusion.addOutput(tv3); @@ -375,8 +454,8 @@ TEST(NVFuserTest, FusionExprEvalPostLower_CUDA) { tv2->axis(-1)->parallelize(ParallelType::TIDx); tv3->axis(-1)->parallelize(ParallelType::TIDx); - auto* bid_x = add(tv3->axis(0)->extent(), new Int(0)); - auto* tid_x = add(tv3->axis(-1)->extent(), new Int(0)); + auto* bid_x = add(tv3->axis(0)->extent(), IrBuilder::create(0)); + auto* tid_x = add(tv3->axis(-1)->extent(), IrBuilder::create(0)); // Lower GpuLower gpulw(&fusion); @@ -406,37 +485,39 @@ TEST(NVFuserTest, FusionExprEvalPostLower_CUDA) { } // Kernel IR: Evaluate basic scalar operations with constant values -TEST(NVFuserTest, FusionKernelExprEvalConstants_CUDA) { - kir::Kernel kernel; - kir::IrBuilder ir_builder(&kernel); +TEST_F(NVFuserTest, FusionKernelExprEvalConstants_CUDA) { + Fusion fusion; + kir::Kernel kernel(&fusion); + FusionGuard fg((&kernel)->as()); - auto a = ir_builder.create(7); - auto b = ir_builder.create(3); - auto c = ir_builder.subExpr(a, b); - auto d = ir_builder.divExpr(a, b); - auto e = ir_builder.mulExpr(c, d); + auto a = IrBuilder::create(7); + auto b = IrBuilder::create(3); + auto c = IrBuilder::subExpr(a, b); + auto d = IrBuilder::divExpr(a, b); + auto e = IrBuilder::mulExpr(c, d); kir::ExpressionEvaluator evaluator; - checkIntValue(evaluator, ir_builder.negExpr(a), -7); - checkIntValue(evaluator, ir_builder.addExpr(a, b), 10); - checkIntValue(evaluator, ir_builder.negExpr(e), -8); - checkIntValue(evaluator, ir_builder.modExpr(a, b), 1); - checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 3); + checkIntValue(evaluator, IrBuilder::negExpr(a), -7); + checkIntValue(evaluator, IrBuilder::addExpr(a, b), 10); + checkIntValue(evaluator, IrBuilder::negExpr(e), -8); + checkIntValue(evaluator, IrBuilder::modExpr(a, b), 1); + checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 3); } // Kernel IR: Evaluate basic scalar operations with bound values -TEST(NVFuserTest, FusionKernelExprEvalBindings_CUDA) { - kir::Kernel kernel; - kir::IrBuilder ir_builder(&kernel); +TEST_F(NVFuserTest, FusionKernelExprEvalBindings_CUDA) { + Fusion fusion; + kir::Kernel kernel(&fusion); + FusionGuard fg((&kernel)->as()); kir::ExpressionEvaluator evaluator; - auto a = ir_builder.create(c10::nullopt); - auto b = ir_builder.create(c10::nullopt); - auto c = ir_builder.addExpr(a, b); - auto d = ir_builder.negExpr(ir_builder.ceilDivExpr(c, b)); - auto e = ir_builder.create(0); + auto a = IrBuilder::create(c10::nullopt); + auto b = IrBuilder::create(c10::nullopt); + auto c = IrBuilder::addExpr(a, b); + auto d = IrBuilder::negExpr(IrBuilder::ceilDivExpr(c, b)); + auto e = IrBuilder::create(0); // trying to evaluate before binding should give empty results 
TORCH_CHECK(!evaluator.evaluate(a).has_value()); @@ -452,9 +533,9 @@ TEST(NVFuserTest, FusionKernelExprEvalBindings_CUDA) { ASSERT_ANY_THROW(evaluator.bind(e, 100)); checkIntValue(evaluator, c, 10); - checkIntValue(evaluator, ir_builder.subExpr(a, b), 4); - checkIntValue(evaluator, ir_builder.modExpr(a, b), 1); - checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 3); + checkIntValue(evaluator, IrBuilder::subExpr(a, b), 4); + checkIntValue(evaluator, IrBuilder::modExpr(a, b), 1); + checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 3); checkIntValue(evaluator, d, -4); // Reset the evaluation context @@ -464,13 +545,13 @@ TEST(NVFuserTest, FusionKernelExprEvalBindings_CUDA) { evaluator.bind(b, 5); checkIntValue(evaluator, c, 7); - checkIntValue(evaluator, ir_builder.subExpr(a, b), -3); - checkIntValue(evaluator, ir_builder.modExpr(a, b), 2); - checkIntValue(evaluator, ir_builder.ceilDivExpr(a, b), 1); + checkIntValue(evaluator, IrBuilder::subExpr(a, b), -3); + checkIntValue(evaluator, IrBuilder::modExpr(a, b), 2); + checkIntValue(evaluator, IrBuilder::ceilDivExpr(a, b), 1); checkIntValue(evaluator, d, -2); } -TEST(NVFuserTest, FusionClear_CUDA) { +TEST_F(NVFuserTest, FusionClear_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -483,7 +564,7 @@ TEST(NVFuserTest, FusionClear_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - TensorView* tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); fusion.addOutput(tv3); @@ -507,14 +588,14 @@ TEST(NVFuserTest, FusionClear_CUDA) { TORCH_CHECK(fusion.inputs().empty()); TORCH_CHECK(fusion.outputs().empty()); - TORCH_CHECK(!fusion.hasReduction()); + TORCH_CHECK(ir_utils::getReductionOps(&fusion).empty()); // 3. Rebuild the IR { TensorView* tv0 = makeSymbolicTensor(3); TensorView* tv1 = makeSymbolicTensor(3); - TensorView* tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); fusion.addInput(tv0); @@ -539,7 +620,7 @@ TEST(NVFuserTest, FusionClear_CUDA) { at::Tensor input2 = at::randn_like(input1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input1, input2}); auto outputs = fe.runFusion({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; @@ -548,7 +629,7 @@ TEST(NVFuserTest, FusionClear_CUDA) { TORCH_CHECK(output_ref.equal(outputs[0])); } -TEST(NVFuserTest, FusionCopy_CUDA) { +TEST_F(NVFuserTest, FusionCopy_CUDA) { Fusion original_fusion; // Create the test IR @@ -557,7 +638,7 @@ TEST(NVFuserTest, FusionCopy_CUDA) { auto tv0 = makeSymbolicTensor(3); auto tv1 = makeSymbolicTensor(3); - auto tv2 = add(tv1, new Double(2.0)); + auto tv2 = add(tv1, IrBuilder::create(2.0)); auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2); original_fusion.addInput(tv0); @@ -622,7 +703,7 @@ TEST(NVFuserTest, FusionCopy_CUDA) { ASSERT_EQ(original_kernel, clone_kernel); } -TEST(NVFuserTest, FusionMove_CUDA) { +TEST_F(NVFuserTest, FusionMove_CUDA) { Fusion fusion; // Create the test IR @@ -631,7 +712,7 @@ TEST(NVFuserTest, FusionMove_CUDA) { auto tv0 = makeSymbolicTensor(3); auto tv1 = makeSymbolicTensor(3); - auto tv2 = add(tv1, new Double(2.0)); + auto tv2 = add(tv1, IrBuilder::create(2.0)); auto tv3 = sub(add(tv0, mul(tv2, tv2)), tv2); fusion.addInput(tv0); @@ -692,28 +773,28 @@ TEST(NVFuserTest, FusionMove_CUDA) { ASSERT_EQ(lowered_ir.str(), moved_lowered_ir.str()); } -TEST(NVFuserTest, FusionSimpleArith_CUDA) { +TEST_F(NVFuserTest, FusionSimpleArith_CUDA) { std::stringstream ss1, ss2; Fusion fusion; 
FusionGuard fg(&fusion); - Double* d1 = new Double(1.f); - Double* d2 = new Double{2.f}; - Double* d3 = new Double(); + Double* d1 = IrBuilder::create(1.f); + Double* d2 = IrBuilder::create(2.f); + Double* d3 = IrBuilder::create(); // Disrupt the fusion to make sure guard works well { Fusion fusion2; FusionGuard fg(&fusion2); - Double* d1 = new Double(1.f); - Double* d2 = new Double(2.f); + Double* d1 = IrBuilder::create(1.f); + Double* d2 = IrBuilder::create(2.f); add(d1, d2); ss2 << fusion2; } - new BinaryOp(BinaryOpType::Add, d3, d1, d2); + IrBuilder::create(BinaryOpType::Add, d3, d1, d2); ss1 << fusion; TORCH_CHECK( @@ -721,22 +802,22 @@ TEST(NVFuserTest, FusionSimpleArith_CUDA) { "Error where explicit add nodes don't match implicit add nodes."); } -TEST(NVFuserTest, FusionSimpleTypePromote_CUDA) { +TEST_F(NVFuserTest, FusionSimpleTypePromote_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - Double* d4 = new Double{4.f}; - Int* i1 = new Int{3}; + Double* d4 = IrBuilder::create(4.f); + Int* i1 = IrBuilder::create(3); auto d5 = add(d4, i1); TORCH_CHECK(d5->getDataType() == DataType::Double); } -TEST(NVFuserTest, FusionRegister_CUDA) { +TEST_F(NVFuserTest, FusionRegister_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - Double* v1 = new Double{1.f}; - Double* v2 = new Double{2.f}; + Double* v1 = IrBuilder::create(1.f); + Double* v2 = IrBuilder::create(2.f); Val* v3 = binaryOp(BinaryOpType::Add, v1, v2); Val* v4 = binaryOp(BinaryOpType::Add, v1, v2); TORCH_CHECK(v1->name() + 1 == v2->name()); @@ -748,14 +829,18 @@ TEST(NVFuserTest, FusionRegister_CUDA) { // dummy expr with 2 outputs only for toposort test. struct DummyExpr : public Expr { ~DummyExpr() = default; - DummyExpr(Val* _outlhs, Val* _outrhs, Val* _lhs, Val* _rhs) - : Expr(ExprType::UnaryOp) // Not terribly safe... + DummyExpr( + IrBuilderPasskey passkey, + Val* _outlhs, + Val* _outrhs, + Val* _lhs, + Val* _rhs) + : Expr(passkey, ExprType::UnaryOp) // Not terribly safe... 
{ addOutput(_outlhs); addOutput(_outrhs); addInput(_lhs); addInput(_rhs); - this->name_ = FusionGuard::getCurFusion()->registerExpr(this); } DummyExpr(const DummyExpr& other) = delete; DummyExpr& operator=(const DummyExpr& other) = delete; @@ -763,7 +848,7 @@ struct DummyExpr : public Expr { DummyExpr& operator=(DummyExpr&& other) = delete; }; -TEST(NVFuserTest, FusionTopoSort_CUDA) { +TEST_F(NVFuserTest, FusionTopoSort_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -771,23 +856,23 @@ TEST(NVFuserTest, FusionTopoSort_CUDA) { // e1: v4 = add(v3, v2) // e2: v5 = add(v2, v4) // e3: v6 = add(v5, v5) - Double* v0 = new Double{1.f}; - Double* v1 = new Double{2.f}; - Double* v2 = new Double(); - Double* v3 = new Double(); - Double* v4 = new Double(); - Double* v5 = new Double(); - Double* v6 = new Double(); + Double* v0 = IrBuilder::create(1.f); + Double* v1 = IrBuilder::create(2.f); + Double* v2 = IrBuilder::create(); + Double* v3 = IrBuilder::create(); + Double* v4 = IrBuilder::create(); + Double* v5 = IrBuilder::create(); + Double* v6 = IrBuilder::create(); std::vector inputs = {v0, v1}; for (auto val : inputs) { fusion.addInput(val); } - Expr* e0 = new DummyExpr(v3, v2, v1, v0); - Expr* e1 = new BinaryOp(BinaryOpType::Add, v4, v3, v2); - Expr* e2 = new BinaryOp(BinaryOpType::Add, v5, v2, v4); - Expr* e3 = new BinaryOp(BinaryOpType::Add, v6, v5, v5); + Expr* e0 = IrBuilder::create(v3, v2, v1, v0); + Expr* e1 = IrBuilder::create(BinaryOpType::Add, v4, v3, v2); + Expr* e2 = IrBuilder::create(BinaryOpType::Add, v5, v2, v4); + Expr* e3 = IrBuilder::create(BinaryOpType::Add, v6, v5, v5); fusion.addOutput(v2); fusion.addOutput(v3); @@ -824,7 +909,7 @@ TEST(NVFuserTest, FusionTopoSort_CUDA) { TORCH_CHECK(v6->definition()->name() == 3); } -TEST(NVFuserTest, FusionTensor_CUDA) { +TEST_F(NVFuserTest, FusionTensor_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); Fusion fusion; @@ -833,7 +918,7 @@ TEST(NVFuserTest, FusionTensor_CUDA) { { auto tensor = at::randn({2, 3, 4, 5}, options); auto tensor_type = TensorType::create(tensor); - auto fuser_tensor = new TensorView(tensor_type); + auto fuser_tensor = IrBuilder::create(tensor_type); TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); TORCH_CHECK(fuser_tensor->domain() != nullptr); @@ -856,7 +941,7 @@ TEST(NVFuserTest, FusionTensor_CUDA) { auto sliced_tensor = tensor.slice(1, 0, -1, 2); auto tensor_type = TensorType::create(sliced_tensor); - auto fuser_tensor = new TensorView(tensor_type); + auto fuser_tensor = IrBuilder::create(tensor_type); TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); TORCH_CHECK(fuser_tensor->domain() != nullptr); @@ -873,7 +958,7 @@ TEST(NVFuserTest, FusionTensor_CUDA) { auto tensor = at::randn({2, 3, 4, 5}, options); auto permuted_tensor = tensor.permute({0, 3, 1, 2}); auto tensor_type = TensorType::create(permuted_tensor); - auto fuser_tensor = new TensorView(tensor_type); + auto fuser_tensor = IrBuilder::create(tensor_type); TORCH_CHECK((int64_t)fuser_tensor->nDims() == tensor.dim()); TORCH_CHECK(fuser_tensor->getDataType().value() == DataType::Float); TORCH_CHECK(fuser_tensor->domain() != nullptr); @@ -888,15 +973,15 @@ TEST(NVFuserTest, FusionTensor_CUDA) { } } -TEST(NVFuserTest, FusionFilterVals_CUDA) { +TEST_F(NVFuserTest, FusionFilterVals_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); auto tv1 = 
makeSymbolicTensor(1); - auto scalar0 = new Double(0); - auto scalar1 = new Int(0); - auto scalar2 = new Int(1); + auto scalar0 = IrBuilder::create(0); + auto scalar1 = IrBuilder::create(0); + auto scalar2 = IrBuilder::create(1); const std::vector vals = {tv0, scalar0, tv1, scalar1, scalar2}; @@ -926,7 +1011,7 @@ TEST(NVFuserTest, FusionFilterVals_CUDA) { "Not expecting any results"); } -TEST(NVFuserTest, FusionTVSplit_CUDA) { +TEST_F(NVFuserTest, FusionTVSplit_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -943,7 +1028,7 @@ TEST(NVFuserTest, FusionTVSplit_CUDA) { static_cast(outer)->lhs()->sameAs( tv->getRootDomain()[2]->extent()) && static_cast(static_cast(outer)->rhs()) - ->sameAs(new Int(2))); + ->sameAs(IrBuilder::create(2))); IterDomain* inner = static_cast(tv->axis(3)); TORCH_CHECK( @@ -952,7 +1037,7 @@ TEST(NVFuserTest, FusionTVSplit_CUDA) { static_cast(inner->extent())->value().value() == 2); } -TEST(NVFuserTest, FusionTVMerge_CUDA) { +TEST_F(NVFuserTest, FusionTVMerge_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -970,7 +1055,7 @@ TEST(NVFuserTest, FusionTVMerge_CUDA) { tv->getRootDomain()[2]->extent()); } -TEST(NVFuserTest, FusionTVReorder_CUDA) { +TEST_F(NVFuserTest, FusionTVReorder_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1020,38 +1105,43 @@ TEST(NVFuserTest, FusionTVReorder_CUDA) { TORCH_CHECK(ref[1]->sameAs(tv->axis(1))); } -TEST(NVFuserTest, FusionEquality_CUDA) { +TEST_F(NVFuserTest, FusionEquality_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - Double* fval1 = new Double(); + Double* fval1 = IrBuilder::create(); Double* fval1_copy = fval1; - Double* fval2 = new Double(); - Double* fone = new Double(1.0); + Double* fval2 = IrBuilder::create(); + Double* fone = IrBuilder::create(1.0); TORCH_CHECK(fval1->sameAs(fval1_copy)); TORCH_CHECK(!fval1->sameAs(fval2)); TORCH_CHECK(!fone->sameAs(fval1)); - TORCH_CHECK(fone->sameAs(new Double(1.0))); + TORCH_CHECK(fone->sameAs(IrBuilder::create(1.0))); - Int* ival1 = new Int(); + Int* ival1 = IrBuilder::create(); Int* ival1_copy = ival1; - Int* ival2 = new Int(); - Int* ione = new Int(1); + Int* ival2 = IrBuilder::create(); + Int* ione = IrBuilder::create(1); TORCH_CHECK(ival1->sameAs(ival1_copy)); TORCH_CHECK(!ival1->sameAs(ival2)); TORCH_CHECK(!ione->sameAs(ival1)); - TORCH_CHECK(ione->sameAs(new Int(1))); + TORCH_CHECK(ione->sameAs(IrBuilder::create(1))); - BinaryOp* add1 = new BinaryOp(BinaryOpType::Add, new Double(), fval1, ival1); - BinaryOp* add1_copy = - new BinaryOp(BinaryOpType::Add, new Double(), fval1, ival1); - BinaryOp* sub1 = new BinaryOp(BinaryOpType::Sub, new Double(), fval1, ival1); + BinaryOp* add1 = IrBuilder::create( + BinaryOpType::Add, IrBuilder::create(), fval1, ival1); + BinaryOp* add1_copy = IrBuilder::create( + BinaryOpType::Add, IrBuilder::create(), fval1, ival1); + BinaryOp* sub1 = IrBuilder::create( + BinaryOpType::Sub, IrBuilder::create(), fval1, ival1); - UnaryOp* neg1 = new UnaryOp(UnaryOpType::Neg, new Double(), fval1); - UnaryOp* neg2 = new UnaryOp(UnaryOpType::Neg, new Double(), fval2); - UnaryOp* neg1_copy = new UnaryOp(UnaryOpType::Neg, new Double(), fval1); + UnaryOp* neg1 = IrBuilder::create( + UnaryOpType::Neg, IrBuilder::create(), fval1); + UnaryOp* neg2 = IrBuilder::create( + UnaryOpType::Neg, IrBuilder::create(), fval2); + UnaryOp* neg1_copy = IrBuilder::create( + UnaryOpType::Neg, IrBuilder::create(), fval1); TORCH_CHECK(add1->sameAs(add1_copy)); TORCH_CHECK(!add1->sameAs(sub1)); @@ -1061,22 +1151,22 @@ TEST(NVFuserTest, FusionEquality_CUDA) { 
TORCH_CHECK(!neg1->sameAs(neg2)); } -TEST(NVFuserTest, FusionDependency_CUDA) { +TEST_F(NVFuserTest, FusionDependency_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - Double* d0 = new Double(0.f); - Double* d1 = new Double(1.f); + Double* d0 = IrBuilder::create(0.f); + Double* d1 = IrBuilder::create(1.f); auto d2 = add(d0, d1); auto d3 = add(d2, d2); - Double* d4 = new Double(4.f); - Double* d5 = new Double(5.f); + Double* d4 = IrBuilder::create(4.f); + Double* d5 = IrBuilder::create(5.f); auto d6 = add(d4, d5); - Double* d7 = new Double(7.f); - Double* d8 = new Double(8.f); + Double* d7 = IrBuilder::create(7.f); + Double* d8 = IrBuilder::create(8.f); auto d9 = add(d7, d8); auto d10 = add(d6, d9); @@ -1131,7 +1221,7 @@ TEST(NVFuserTest, FusionDependency_CUDA) { TORCH_CHECK(dep_chain.empty()); } -TEST(NVFuserTest, FusionParser_CUDA) { +TEST_F(NVFuserTest, FusionParser_CUDA) { // This test may not pass if using a custom block sync as there may // be additional calls. Skip the test as it's not specifically // relevant with block synchronizatin. @@ -1174,31 +1264,31 @@ TEST(NVFuserTest, FusionParser_CUDA) { const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Tensor T3) { if ((((((((((nvfuser_index_t)blockIdx.x) * 1) + 0) * 1) + 0) * 128) + ((nvfuser_index_t)threadIdx.x)) < T0.size[0])) { - constexpr nvfuser_index_t ki183 = 0; + constexpr nvfuser_index_t i33 = 0; float T5[1]; - constexpr nvfuser_index_t ki217 = 0; - T5[ki217] = 0; - constexpr nvfuser_index_t ki208 = 0; - T5[ki208] - = T1[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki183) * 1) + ki208) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; + constexpr nvfuser_index_t i45 = 0; + T5[i45] = 0; + constexpr nvfuser_index_t i41 = 0; + T5[i41] + = T1[(((((((((nvfuser_index_t)blockIdx.x) * 1) + i33) * 1) + i41) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; float T4[1]; - constexpr nvfuser_index_t ki223 = 0; - T4[ki223] = 0; - constexpr nvfuser_index_t ki203 = 0; - T4[ki203] - = T0[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki183) * 1) + ki203) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; + constexpr nvfuser_index_t i47 = 0; + T4[i47] = 0; + constexpr nvfuser_index_t i39 = 0; + T4[i39] + = T0[(((((((((nvfuser_index_t)blockIdx.x) * 1) + i33) * 1) + i39) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; float T6[1]; - constexpr nvfuser_index_t ki192 = 0; + constexpr nvfuser_index_t i37 = 0; float T2[1]; T2[0] - = T4[ki192] - * T5[ki192]; - T6[ki192] + = T4[i37] + * T5[i37]; + T6[i37] = T2[0] - * T4[ki192]; - constexpr nvfuser_index_t ki185 = 0; - T3[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki183) * 1) + ki185) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)] - = T6[ki185]; + * T4[i37]; + constexpr nvfuser_index_t i35 = 0; + T3[(((((((((nvfuser_index_t)blockIdx.x) * 1) + i33) * 1) + i35) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)] + = T6[i35]; } } )"; @@ -1227,62 +1317,25 @@ __global__ void CUDAGeneratedKernel(Tensor T0, Tensor T1, Te } FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1, input2}, lparams); auto outputs = fe.runFusion({input1, input2}, lparams); at::Tensor output_ref = input1 * input2 * input1; TORCH_CHECK(output_ref.equal(outputs[0])); } -TEST(NVFuserTest, FusionForLoop_CUDA) { -// TODO(kir): re-enable this test -// due to the current "GpuLower guard" approach, we can only create -// kernel IR during GpuLower::lower() -#if 0 - Fusion fusion; - FusionGuard fg(&fusion); - - const auto TV0 = new TensorView( - new TensorDomain({new 
IterDomain(new Int(0), new Int(16))}), - DataType::Float); - const auto TV1 = new TensorView( - new TensorDomain({new IterDomain(new Int(0), new Int(16))}), - DataType::Float); - - fusion.addInput(TV0); - fusion.addInput(TV1); - - auto ID0 = new kir::IterDomain(new IterDomain(new Int(0), new Int(8))); - - TensorView* TV2 = add(TV0, TV1); - BinaryOp* op = static_cast(TV2->definition(); - fusion.addOutput(TV2); - - auto fl = new kir::ForLoop(new kir::Int(c10::nullopt), ID0, {op}); - - std::stringstream result; - std::stringstream ref; - result << fl; - ref << "for(size_t i3{0}; i3 < iS{8}; ++i3 ) {\nT2[ iS{16} ] = T0[ iS{16} ] + T1[ iS{16} ]\n}"; - - if (result.str().compare(ref.str()) == 0) { - std::stringstream err_msg; - err_msg << "ForLoop printing has changed or something has gone wrong. " - << result.str() << "\n does not match reference: " << ref.str() - << std::endl; - TORCH_CHECK(false, err_msg.str()); - } -#endif -} - -TEST(NVFuserTest, FusionOuterSplit_CUDA) { +TEST_F(NVFuserTest, FusionOuterSplit_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(3); - new BinaryOp(BinaryOpType::Add, tv0, new Double(0.0), new Double(1.0)); - TensorView* tv1 = add(tv0, new Double(2.0)); - TensorView* tv2 = add(tv1, new Double(3.0)); + IrBuilder::create( + BinaryOpType::Add, + tv0, + IrBuilder::create(0.0), + IrBuilder::create(1.0)); + TensorView* tv1 = add(tv0, IrBuilder::create(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(3.0)); fusion.addOutput(tv2); //[I0, I1, I2] @@ -1312,15 +1365,19 @@ TEST(NVFuserTest, FusionOuterSplit_CUDA) { TORCH_CHECK(output_ref.equal(output)); } -TEST(NVFuserTest, FusionCodeGen_CUDA) { +TEST_F(NVFuserTest, FusionCodeGen_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(3); - new BinaryOp(BinaryOpType::Add, tv0, new Double(0.0), new Double(1.0)); - TensorView* tv1 = add(tv0, new Double(2.0)); - TensorView* tv2 = add(tv1, new Double(3.0)); + IrBuilder::create( + BinaryOpType::Add, + tv0, + IrBuilder::create(0.0), + IrBuilder::create(1.0)); + TensorView* tv1 = add(tv0, IrBuilder::create(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(3.0)); fusion.addOutput(tv2); //[I0, I1, I2] @@ -1349,13 +1406,13 @@ TEST(NVFuserTest, FusionCodeGen_CUDA) { TORCH_CHECK(output_ref.equal(output)); } -TEST(NVFuserTest, FusionCodeGen2_CUDA) { +TEST_F(NVFuserTest, FusionCodeGen2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(3); TensorView* tv1 = makeSymbolicTensor(3); - TensorView* tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); fusion.addInput(tv0); @@ -1382,7 +1439,7 @@ TEST(NVFuserTest, FusionCodeGen2_CUDA) { at::Tensor input2 = at::randn_like(input1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input1, input2}); auto outputs = fe.runFusion({input1, input2}); at::Tensor tv2_ref = input2 + 2.0; @@ -1391,7 +1448,7 @@ TEST(NVFuserTest, FusionCodeGen2_CUDA) { TORCH_CHECK(output_ref.equal(outputs[0])); } -TEST(NVFuserTest, FusionSimplePWise_CUDA) { +TEST_F(NVFuserTest, FusionSimplePWise_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // dimensionality of the problem @@ -1407,7 +1464,7 @@ TEST(NVFuserTest, FusionSimplePWise_CUDA) { // Do math with it, it returns a `Val*` but can be static_casted back to // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); // Register your outputs 
@@ -1439,7 +1496,7 @@ TEST(NVFuserTest, FusionSimplePWise_CUDA) { at::Tensor output = at::empty_like(input1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input1, input2}); fe.runFusion({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; @@ -1448,7 +1505,7 @@ TEST(NVFuserTest, FusionSimplePWise_CUDA) { TORCH_CHECK(output_ref.equal(output)); } -TEST(NVFuserTest, FusionExecKernel_CUDA) { +TEST_F(NVFuserTest, FusionExecKernel_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1462,7 +1519,7 @@ TEST(NVFuserTest, FusionExecKernel_CUDA) { // Do math with it, it returns a `Val*` but can be static_casted back to // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); // Register your outputs @@ -1490,7 +1547,7 @@ TEST(NVFuserTest, FusionExecKernel_CUDA) { at::Tensor input2 = at::ones_like(input1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input1, input2}); auto outputs = fe.runFusion({input1, input2}); at::Tensor check = at::full({1, 128}, 4, options); @@ -1502,7 +1559,7 @@ int ceilDiv_(int a, int b) { return (a + b - 1) / b; } -TEST(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { // Case 1 // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 @@ -1517,10 +1574,10 @@ TEST(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = add(tv1, new Double(3.0)); - TensorView* tv4 = mul(tv1, new Double(2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(0.5)); + TensorView* tv2 = mul(tv1, IrBuilder::create(-1.0)); + TensorView* tv3 = add(tv1, IrBuilder::create(3.0)); + TensorView* tv4 = mul(tv1, IrBuilder::create(2.0)); TensorView* tv5 = add(tv3, tv2); TensorView* tv6 = add(tv5, tv4); @@ -1538,7 +1595,8 @@ TEST(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { tv0->computeAt(tv7, 1); - GpuLower gpulw(&fusion); + ComputeAtMap loop_map(ComputeAtMap::MappingMode::LOOP); + loop_map.build(&fusion); // The this-position of the last tensor should be zero. TORCH_CHECK( @@ -1550,11 +1608,12 @@ TEST(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { // The position of every other tensor should be 1. 
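// The hunk above replaces the GpuLower-based loop-map query with a directly
// built ComputeAtMap. A minimal sketch of the new check, assuming the NVFuser
// headers and using-directives this test file already pulls in; the helper
// name is illustrative and not part of the patch:
//
//   Old:  GpuLower gpulw(&fusion);
//         TORCH_CHECK(gpulw.caLoopMap().areMapped(tv7->axis(0), tv->axis(0)));
//
void checkOuterAxesLoopMapped(Fusion* fusion, TensorView* consumer, TensorView* producer) {
  ComputeAtMap loop_map(ComputeAtMap::MappingMode::LOOP);
  loop_map.build(fusion);
  TORCH_CHECK(loop_map.areMapped(consumer->axis(0), producer->axis(0)));
}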
for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); - TORCH_CHECK(gpulw.caLoopMap().areMapped(tv7->axis(0), tv->axis(0))); + + TORCH_CHECK(loop_map.areMapped(tv7->axis(0), tv->axis(0))); } for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); tv->axis(1)->parallelize(ParallelType::Unroll); @@ -1579,14 +1638,14 @@ TEST(NVFuserTest, FusionAdvancedComputeAt1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { // Case 2 // tv1 = tv0 * -1 // tv2 = tv0 + 3 @@ -1600,9 +1659,9 @@ TEST(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(-1.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(3.0)); + TensorView* tv3 = mul(tv0, IrBuilder::create(2.0)); TensorView* tv4 = add(tv2, tv1); TensorView* tv5 = add(tv4, tv3); @@ -1621,7 +1680,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { tv0->computeAt(tv6, 1); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -1643,13 +1702,13 @@ TEST(NVFuserTest, FusionAdvancedComputeAt2_CUDA) { std::vector aten_outputs = {t5, t6}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { // Case 3 // T2 = T1 * 0.979361 // T3 = T2 * T0 @@ -1662,7 +1721,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { TensorView* tv1 = makeSymbolicTensor(4); fusion.addInput(tv1); - TensorView* tv2 = mul(tv1, new Double(.979361)); + TensorView* tv2 = mul(tv1, IrBuilder::create(.979361)); TensorView* tv3 = mul(tv2, tv0); fusion.addOutput(tv3); @@ -1679,7 +1738,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { tv3->axis(0)->parallelize(ParallelType::BIDx); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -1700,14 +1759,14 @@ TEST(NVFuserTest, FusionAdvancedComputeAt3_CUDA) { at::Tensor cg_output = at::empty_like(t0, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, {cg_output}); testValidate( &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { // Case 4 // T4 = T2 - T3 // T5 = T1 + T4 @@ -1747,7 +1806,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { tv6->axis(0)->parallelize(ParallelType::BIDx); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if 
(!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -1769,14 +1828,14 @@ TEST(NVFuserTest, FusionAdvancedComputeAt4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { // Case 5 // tv2 = tv0 + 2.0 // tv3 = tv1 * tv2 @@ -1788,7 +1847,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { fusion.addInput(tv0); TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(2.0)); TensorView* tv3 = mul(tv1, tv2); fusion.addOutput(tv3); @@ -1809,14 +1868,14 @@ TEST(NVFuserTest, FusionAdvancedComputeAt5_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1824,7 +1883,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { fusion.addInput(tv0); TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(2.0)); TensorView* tv3 = mul(tv1, tv2); fusion.addOutput(tv3); @@ -1848,26 +1907,26 @@ TEST(NVFuserTest, FusionAdvancedComputeAt6_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1.0)); + auto tv1 = add(tv0, IrBuilder::create(1.0)); auto tv2 = makeSymbolicTensor(1); fusion.addInput(tv2); - auto tv3 = add(tv2, new Double(3.0)); + auto tv3 = add(tv2, IrBuilder::create(3.0)); auto tv4 = add(tv1, tv3); fusion.addOutput(tv4); @@ -1899,9 +1958,6 @@ TEST(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { auto tv5_domain_current = tv5->domain()->domain(); TORCH_CHECK(tv5_domain == tv5_domain_current, "Invalid TV5 domain"); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int numel_x = 100; const int numel_y = 200; @@ -1919,25 +1975,27 @@ TEST(NVFuserTest, FusionAdvancedComputeAt7_CUDA) { std::vector aten_inputs = {t0, t2, t6}; std::vector aten_outputs = {t4, t7}; + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1.0)); + auto tv1 = add(tv0, IrBuilder::create(1.0)); auto tv2 = makeSymbolicTensor(1); fusion.addInput(tv2); - auto tv3 = add(tv2, new Double(3.0)); + 
auto tv3 = add(tv2, IrBuilder::create(3.0)); auto tv4 = add(tv1, tv3); fusion.addOutput(tv4); @@ -1964,9 +2022,6 @@ TEST(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { tv2->computeAt(tv4, -1); tv0->computeAt(tv7, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int numel_x = 100; const int numel_y = 200; @@ -1984,13 +2039,15 @@ TEST(NVFuserTest, FusionAdvancedComputeAt8_CUDA) { std::vector aten_inputs = {t0, t2, t6}; std::vector aten_outputs = {t4, t7}; + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeWith1_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeWith1_CUDA) { // Case 1 // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 @@ -2005,10 +2062,10 @@ TEST(NVFuserTest, FusionAdvancedComputeWith1_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = add(tv1, new Double(3.0)); - TensorView* tv4 = mul(tv1, new Double(2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(0.5)); + TensorView* tv2 = mul(tv1, IrBuilder::create(-1.0)); + TensorView* tv3 = add(tv1, IrBuilder::create(3.0)); + TensorView* tv4 = mul(tv1, IrBuilder::create(2.0)); TensorView* tv5 = add(tv3, tv2); TensorView* tv6 = add(tv5, tv4); @@ -2036,14 +2093,17 @@ TEST(NVFuserTest, FusionAdvancedComputeWith1_CUDA) { tv7->nDims() == 3 && tv6->getComputeAtPosition() == 0 && tv6->getMaxProducerPosition() == 1); + ComputeAtMap loop_map(ComputeAtMap::MappingMode::LOOP); + loop_map.build(&fusion); + // The position of every other tensor should be 1. for (auto tv : {tv1, tv2, tv3, tv4, tv5}) { TORCH_CHECK(tv->nDims() == 3 && tv->getComputeAtPosition() == 1); - TORCH_CHECK(gpulw.caLoopMap().areMapped(tv7->axis(0), tv->axis(0))); + TORCH_CHECK(loop_map.areMapped(tv7->axis(0), tv->axis(0))); } for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); tv->axis(1)->parallelize(ParallelType::Unroll); @@ -2068,14 +2128,14 @@ TEST(NVFuserTest, FusionAdvancedComputeWith1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeWith2_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeWith2_CUDA) { // Case 2 // tv1 = tv0 * -1 // tv2 = tv0 + 3 @@ -2089,9 +2149,9 @@ TEST(NVFuserTest, FusionAdvancedComputeWith2_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(-1.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(3.0)); + TensorView* tv3 = mul(tv0, IrBuilder::create(2.0)); TensorView* tv4 = add(tv2, tv1); TensorView* tv5 = add(tv4, tv3); @@ -2110,7 +2170,7 @@ TEST(NVFuserTest, FusionAdvancedComputeWith2_CUDA) { tv0->computeWith(tv6, 1); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -2132,13 +2192,13 
@@ TEST(NVFuserTest, FusionAdvancedComputeWith2_CUDA) { std::vector aten_outputs = {t5, t6}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeWith3_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeWith3_CUDA) { // Case 3 // T2 = T1 * 0.979361 // T3 = T2 * T0 @@ -2151,7 +2211,7 @@ TEST(NVFuserTest, FusionAdvancedComputeWith3_CUDA) { TensorView* tv1 = makeSymbolicTensor(4); fusion.addInput(tv1); - TensorView* tv2 = mul(tv1, new Double(.979361)); + TensorView* tv2 = mul(tv1, IrBuilder::create(.979361)); TensorView* tv3 = mul(tv2, tv0); fusion.addOutput(tv3); @@ -2173,7 +2233,7 @@ TEST(NVFuserTest, FusionAdvancedComputeWith3_CUDA) { tv3->axis(0)->parallelize(ParallelType::BIDx); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -2194,14 +2254,14 @@ TEST(NVFuserTest, FusionAdvancedComputeWith3_CUDA) { at::Tensor cg_output = at::empty_like(t0, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, {cg_output}); testValidate( &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeWith4_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeWith4_CUDA) { // Case 4 // T4 = T2 - T3 // T5 = T1 + T4 @@ -2240,7 +2300,7 @@ TEST(NVFuserTest, FusionAdvancedComputeWith4_CUDA) { tv6->axis(0)->parallelize(ParallelType::BIDx); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -2262,14 +2322,14 @@ TEST(NVFuserTest, FusionAdvancedComputeWith4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeWith5_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeWith5_CUDA) { // Case 5 // tv2 = tv0 + 2.0 // tv3 = tv1 * tv2 @@ -2281,7 +2341,7 @@ TEST(NVFuserTest, FusionAdvancedComputeWith5_CUDA) { fusion.addInput(tv0); TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(2.0)); TensorView* tv3 = mul(tv1, tv2); fusion.addOutput(tv3); @@ -2302,14 +2362,14 @@ TEST(NVFuserTest, FusionAdvancedComputeWith5_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeWith6_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeWith6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2317,7 +2377,7 @@ TEST(NVFuserTest, FusionAdvancedComputeWith6_CUDA) { fusion.addInput(tv0); TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(2.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(2.0)); TensorView* tv3 = mul(tv1, tv2); fusion.addOutput(tv3); @@ -2341,14 +2401,14 @@ TEST(NVFuserTest, 
FusionAdvancedComputeWith6_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 // tv3 = tv2 * -2 @@ -2358,9 +2418,9 @@ TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv1, new Double(-2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(0.5)); + TensorView* tv2 = mul(tv1, IrBuilder::create(-1.0)); + TensorView* tv3 = mul(tv1, IrBuilder::create(-2.0)); fusion.addOutput(tv2); fusion.addOutput(tv3); @@ -2387,10 +2447,12 @@ TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { TORCH_CHECK( tv3->getComputeAtPosition() == 0 && tv3->getMaxProducerPosition() == 1); + ComputeAtMap loop_map(ComputeAtMap::MappingMode::LOOP); + loop_map.build(&fusion); + // Note that tv2 is also computed at tv3. for (auto tv : {tv1, tv2}) { - TORCH_CHECK( - gpulw.caLoopMap().areMapped(tv->axis(0), computeAtTarget->axis(0))); + TORCH_CHECK(loop_map.areMapped(tv->axis(0), computeAtTarget->axis(0))); } TORCH_CHECK(tv3->getComputeAtPosition() == 0); @@ -2414,7 +2476,7 @@ TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( @@ -2422,7 +2484,7 @@ TEST(NVFuserTest, FusionComputeAtMultiConsumers_CUDA) { } // Similar to ComputeAtMultiConsumers, but with a common consumer. 
-TEST(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 // tv3 = tv2 * -2 @@ -2434,11 +2496,11 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv1, new Double(-2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(0.5)); + TensorView* tv2 = mul(tv1, IrBuilder::create(-1.0)); + TensorView* tv3 = mul(tv1, IrBuilder::create(-2.0)); TensorView* tv4 = add(tv2, tv3); - TensorView* tv5 = mul(tv4, new Double(5.0)); + TensorView* tv5 = mul(tv4, IrBuilder::create(5.0)); fusion.addOutput(tv3); fusion.addOutput(tv4); fusion.addOutput(tv5); @@ -2492,14 +2554,14 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer1_CUDA) { at::empty_like(aten_input, options)}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 // tv3 = tv2 * -1 @@ -2511,10 +2573,10 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv2, new Double(-1.0)); - TensorView* tv4 = add(tv1, new Double(4.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(0.5)); + TensorView* tv2 = mul(tv1, IrBuilder::create(-1.0)); + TensorView* tv3 = mul(tv2, IrBuilder::create(-1.0)); + TensorView* tv4 = add(tv1, IrBuilder::create(4.0)); TensorView* tv5 = add(tv3, tv4); fusion.addOutput(tv5); @@ -2541,7 +2603,7 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { // All tensors should have the same dimenionality as the target for (Val* val : fusion.vals()) { - if (fusion.hasInput(val) || + if (val->isFusionInput() || val->getValType().value() != ValType::TensorView) { continue; } @@ -2555,7 +2617,7 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { } for (auto tv : ir_utils::filterByType(fusion.vals())) { - if (!fusion.hasInput(tv)) { + if (!tv->isFusionInput()) { tv->axis(1)->parallelize(ParallelType::Unroll); tv->axis(-1)->parallelize(ParallelType::TIDx); } @@ -2574,7 +2636,7 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, {cg_output}); testValidate( @@ -2583,7 +2645,7 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer2_CUDA) { // Similar to the above common consumer test but adds an additional // tensor that has no common consumer with the other tensors. 
-TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 // tv3 = tv2 * -1 @@ -2596,12 +2658,12 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv2, new Double(-1.0)); - TensorView* tv4 = add(tv1, new Double(4.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(0.5)); + TensorView* tv2 = mul(tv1, IrBuilder::create(-1.0)); + TensorView* tv3 = mul(tv2, IrBuilder::create(-1.0)); + TensorView* tv4 = add(tv1, IrBuilder::create(4.0)); TensorView* tv5 = add(tv3, tv4); - TensorView* tv6 = add(tv1, new Double(6.0)); + TensorView* tv6 = add(tv1, IrBuilder::create(6.0)); fusion.addOutput(tv5); fusion.addOutput(tv6); @@ -2627,7 +2689,7 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { // All tensors should have the same dimenionality as the target for (auto tv : ir_utils::filterByType(fusion.vals())) { - if (fusion.hasInput(tv)) { + if (tv->isFusionInput()) { continue; } TORCH_CHECK(tv->nDims() == computeAtTarget->nDims()); @@ -2640,7 +2702,7 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { } for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = val->as(); tv->axis(1)->parallelize(ParallelType::Unroll); @@ -2664,7 +2726,7 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( @@ -2673,7 +2735,7 @@ TEST(NVFuserTest, FusionComputeAtCommonConsumer3_CUDA) { // Similar to ComputeAtCommonConsumer1 but with an addtiona ltensor // that does not have data dependency with the consumer. -TEST(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 // tv3 = tv1 * -2 @@ -2686,13 +2748,13 @@ TEST(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = mul(tv1, new Double(-2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(0.5)); + TensorView* tv2 = mul(tv1, IrBuilder::create(-1.0)); + TensorView* tv3 = mul(tv1, IrBuilder::create(-2.0)); TensorView* tv4 = add(tv2, tv3); - TensorView* tv5 = mul(tv4, new Double(5.0)); + TensorView* tv5 = mul(tv4, IrBuilder::create(5.0)); // Notice that tv6 is not a consumer of tv4. 
- TensorView* tv6 = mul(tv1, new Double(6.0)); + TensorView* tv6 = mul(tv1, IrBuilder::create(6.0)); fusion.addOutput(tv3); fusion.addOutput(tv4); fusion.addOutput(tv5); @@ -2737,7 +2799,7 @@ TEST(NVFuserTest, FusionComputeAtNoCommonConsumer_CUDA) { at::empty_like(aten_input, options)}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( @@ -2822,7 +2884,7 @@ void checkIdMapped( } // namespace -TEST(NVFuserTest, FusionRootMappingBasic_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2876,7 +2938,7 @@ TEST(NVFuserTest, FusionRootMappingBasic_CUDA) { checkIdMapped(tv4, tv4->getRootDomain(), tv5, tv5->getRootDomain()); } -TEST(NVFuserTest, FusionRootMappingRfactor_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingRfactor_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2960,7 +3022,7 @@ TEST(NVFuserTest, FusionRootMappingRfactor_CUDA) { {true, true, false}); } -TEST(NVFuserTest, FusionRootMappingReductionDependency1_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingReductionDependency1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2987,7 +3049,7 @@ TEST(NVFuserTest, FusionRootMappingReductionDependency1_CUDA) { {true, false}); } -TEST(NVFuserTest, FusionRootMappingReductionDependency2_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingReductionDependency2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3021,7 +3083,7 @@ TEST(NVFuserTest, FusionRootMappingReductionDependency2_CUDA) { checkIdMapped(tv2, tv2->getRootDomain(), tv3, tv3->getRootDomain()); } -TEST(NVFuserTest, FusionRootMappingReductionDependency3_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingReductionDependency3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3050,7 +3112,7 @@ TEST(NVFuserTest, FusionRootMappingReductionDependency3_CUDA) { {true, false}); } -TEST(NVFuserTest, FusionRootMappingReductionDependency4_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingReductionDependency4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3095,13 +3157,13 @@ TEST(NVFuserTest, FusionRootMappingReductionDependency4_CUDA) { } // Reproducer of issue #749 -TEST(NVFuserTest, FusionRootMappingReductionDependency5_CUDA_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingReductionDependency5_CUDA_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); auto tv3 = broadcast(tv2, {false, true}); auto tv4 = add(tv0, tv3); @@ -3153,13 +3215,13 @@ TEST(NVFuserTest, FusionRootMappingReductionDependency5_CUDA_CUDA) { } // Similar to RootMappingReductionDependency5 but with rFactor -TEST(NVFuserTest, FusionRootMappingReductionDependency6_CUDA_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingReductionDependency6_CUDA_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); auto tv3 = broadcast(tv2, {false, true}); auto tv4 = add(tv0, tv3); @@ -3227,7 +3289,7 @@ TEST(NVFuserTest, FusionRootMappingReductionDependency6_CUDA_CUDA) { {true, true}); } -TEST(NVFuserTest, FusionRootMappingMultipleBroadcast_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingMultipleBroadcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3265,7 +3327,9 @@ TEST(NVFuserTest, FusionRootMappingMultipleBroadcast_CUDA) { {false, false}); } 
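// The recurring changes this patch applies to every test in the file, shown
// once in isolation: tests move from the plain TEST(NVFuserTest, ...) macro to
// the TEST_F(NVFuserTest, ...) fixture, IR nodes are created through
// IrBuilder::create instead of raw `new`, and input checks move from
// fusion.hasInput(val) to val->isFusionInput(). This sketch assumes the
// helpers and using-directives of this test file are in scope; the <Double>
// template argument is an inferred reconstruction (the angle-bracket arguments
// appear stripped from this patch text), and the test name is illustrative:
TEST_F(NVFuserTest, FusionIrBuilderSketch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // Old style: TensorView* tv1 = add(tv0, new Double(1.0));
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0));
  fusion.addOutput(tv1);

  TORCH_CHECK(tv0->isFusionInput());
}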
-TEST(NVFuserTest, FusionRootMappingMultipleBroadcastWithNoCommonConsumer_CUDA) { +TEST_F( + NVFuserTest, + FusionRootMappingMultipleBroadcastWithNoCommonConsumer_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3299,7 +3363,7 @@ TEST(NVFuserTest, FusionRootMappingMultipleBroadcastWithNoCommonConsumer_CUDA) { {false, true}); } -TEST(NVFuserTest, FusionRootMappingBroadcastNonUniqueSize_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingBroadcastNonUniqueSize_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3386,7 +3450,7 @@ TEST(NVFuserTest, FusionRootMappingBroadcastNonUniqueSize_CUDA) { {true, false}); } -TEST(NVFuserTest, FusionRootMappingBroadcast_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingBroadcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3426,7 +3490,7 @@ TEST(NVFuserTest, FusionRootMappingBroadcast_CUDA) { } // Reproducer of issue #723 -TEST(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) { +TEST_F(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3461,7 +3525,7 @@ TEST(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto t3 = t0; @@ -3470,13 +3534,13 @@ TEST(NVFuserTest, FusionRootMappingTrivialReduction_CUDA) { testValidate(&fusion, outputs, aten_inputs, {t3, t4}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionComputeAtFailDueToRootMapping_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtFailDueToRootMapping_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = broadcast(tv1, {true, false}); auto tv3 = broadcast(tv1, {false, true}); auto tv4 = add(tv2, tv3); @@ -3486,7 +3550,7 @@ TEST(NVFuserTest, FusionComputeAtFailDueToRootMapping_CUDA) { ASSERT_ANY_THROW(tv1->computeAt(tv4, 1)); } -TEST(NVFuserTest, FusionScalarInputs_CUDA) { +TEST_F(NVFuserTest, FusionScalarInputs_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3495,13 +3559,13 @@ TEST(NVFuserTest, FusionScalarInputs_CUDA) { TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); - Double* d0 = new Double(); + Double* d0 = IrBuilder::create(); fusion.addInput(d0); - Double* d1 = new Double(); + Double* d1 = IrBuilder::create(); fusion.addInput(d1); - Double* d2 = new Double(); + Double* d2 = IrBuilder::create(); fusion.addInput(d2); - Double* d3 = new Double(); + Double* d3 = IrBuilder::create(); fusion.addInput(d3); Val* d4 = mul(d0, d1); Val* d5 = sub(d2, d3); @@ -3524,7 +3588,7 @@ TEST(NVFuserTest, FusionScalarInputs_CUDA) { tv4->axis(0)->parallelize(ParallelType::BIDx); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -3568,14 +3632,14 @@ TEST(NVFuserTest, FusionScalarInputs_CUDA) { at::Scalar(fl3)}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, {cg_output}); testValidate( &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionLoopUnroll_CUDA) { +TEST_F(NVFuserTest, FusionLoopUnroll_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3589,7 +3653,7 @@ TEST(NVFuserTest, FusionLoopUnroll_CUDA) { // Do math with it, it returns a `Val*` but can be static_casted back to // TensorView - TensorView* 
tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); // Register your outputs @@ -3621,7 +3685,7 @@ TEST(NVFuserTest, FusionLoopUnroll_CUDA) { at::Tensor input1 = at::randn({129, 13, 3}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input0, input1}); auto outputs = fe.runFusion({input0, input1}); TORCH_CHECK(outputs[0].equal(input0.add(input1.add(2.0)))); @@ -3636,11 +3700,11 @@ Val* gen_jit_operand(std::pair desc) { return makeSymbolicTensor(2, desc.second); } else if (desc.first == ValType::Scalar) { if (desc.second == DataType::Float) { - return new Double(); + return IrBuilder::create(); } else if (desc.second == DataType::Double) { - return new Double(); + return IrBuilder::create(); } else if (desc.second == DataType::Int) { - return new Int(); + return IrBuilder::create(); } else { TORCH_CHECK(false, "Not currently supported type: ", desc.first); } @@ -3763,7 +3827,7 @@ void test_op( at::manual_seed(0); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs_ivalues); fe.runFusion(aten_inputs_ivalues, output_vect); cudaDeviceSynchronize(); @@ -3809,7 +3873,7 @@ void test_op( std::make_index_sequence{}); } -TEST(NVFuserTest, FusionUnaryOps_CUDA) { +TEST_F(NVFuserTest, FusionUnaryOps_CUDA) { using OpTuple = std::tuple; @@ -3833,7 +3897,6 @@ TEST(NVFuserTest, FusionUnaryOps_CUDA) { OpTuple{at::expm1, UnaryOpType::Expm1, "expm1"}, OpTuple{at::floor, UnaryOpType::Floor, "floor"}, OpTuple{at::frac, UnaryOpType::Frac, "frac"}, - // OpTuple{at::gelu, UnaryOpType::Gelu, "gelu"}, OpTuple{at::lgamma, UnaryOpType::Lgamma, "lgamma"}, OpTuple{at::log, UnaryOpType::Log, "log"}, OpTuple{at::log10, UnaryOpType::Log10, "log10"}, @@ -3904,7 +3967,7 @@ TEST(NVFuserTest, FusionUnaryOps_CUDA) { } } -TEST(NVFuserTest, FusionBinaryOps_CUDA) { +TEST_F(NVFuserTest, FusionBinaryOps_CUDA) { using AtenFuncSig = at::Tensor (*)(const at::Tensor&, const at::Tensor&); using OpTuple = std::tuple; @@ -4009,7 +4072,7 @@ TEST(NVFuserTest, FusionBinaryOps_CUDA) { } } -TEST(NVFuserTest, FusionTernaryOps_CUDA) { +TEST_F(NVFuserTest, FusionTernaryOps_CUDA) { std::vector dtypes = {DataType::Double, DataType::Float}; for (auto dtype : dtypes) { @@ -4024,9 +4087,15 @@ TEST(NVFuserTest, FusionTernaryOps_CUDA) { /*JIT Func */ [&](Val* in1) -> Val* { if (dtype == DataType::Float) { - return clamp(in1, new Double(0.f), new Double(1.f)); + return clamp( + in1, + IrBuilder::create(0.f), + IrBuilder::create(1.f)); } else { - return clamp(in1, new Double(0.f), new Double(1.f)); + return clamp( + in1, + IrBuilder::create(0.f), + IrBuilder::create(1.f)); } }, /*Output */ std::make_pair(ValType::TensorView, dtype), @@ -4043,9 +4112,15 @@ TEST(NVFuserTest, FusionTernaryOps_CUDA) { /*JIT Func */ [&](Val* in1) -> Val* { if (dtype == DataType::Float) { - return threshold(in1, new Double(0.f), new Double(1.f)); + return threshold( + in1, + IrBuilder::create(0.f), + IrBuilder::create(1.f)); } else { - return threshold(in1, new Double(0.f), new Double(1.f)); + return threshold( + in1, + IrBuilder::create(0.f), + IrBuilder::create(1.f)); } }, /*Output */ std::make_pair(ValType::TensorView, dtype), @@ -4070,7 +4145,7 @@ TEST(NVFuserTest, FusionTernaryOps_CUDA) { } } -TEST(NVFuserTest, FusionCompoundOps_CUDA) { +TEST_F(NVFuserTest, FusionCompoundOps_CUDA) { std::vector dtypes = {DataType::Double, DataType::Float}; for (auto dtype : dtypes) { @@ -4114,7 +4189,7 @@ TEST(NVFuserTest, FusionCompoundOps_CUDA) { } 
} -TEST(NVFuserTest, FusionCastOps_CUDA) { +TEST_F(NVFuserTest, FusionCastOps_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4139,7 +4214,7 @@ TEST(NVFuserTest, FusionCastOps_CUDA) { const at::ArrayRef input_ivalues(inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, input_ivalues); auto outputs = fe.runFusion(input_ivalues); ref_output = at::_cast_Half(at::_cast_Double(input1)); @@ -4156,7 +4231,7 @@ TEST(NVFuserTest, FusionCastOps_CUDA) { // Start off simple, block on the outer dim // block stride + thread all reduce + unrolling on inner dim -TEST(NVFuserTest, FusionReduction1_CUDA) { +TEST_F(NVFuserTest, FusionReduction1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4165,10 +4240,13 @@ TEST(NVFuserTest, FusionReduction1_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).size(), + "Could not detect reduction in fusion."); tv1->split(1, 128); // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] @@ -4207,7 +4285,7 @@ TEST(NVFuserTest, FusionReduction1_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4216,7 +4294,7 @@ TEST(NVFuserTest, FusionReduction1_CUDA) { &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionReduction2_CUDA) { +TEST_F(NVFuserTest, FusionReduction2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4225,7 +4303,8 @@ TEST(NVFuserTest, FusionReduction2_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); @@ -4278,14 +4357,14 @@ TEST(NVFuserTest, FusionReduction2_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionReduction3_CUDA) { +TEST_F(NVFuserTest, FusionReduction3_CUDA) { // What if Z participates in the reduction with X? 
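// Reduction detection also changes form in these hunks: instead of asking the
// Fusion whether it has a reduction, the tests collect the reduction
// expressions through ir_utils and check that the list is non-empty. A small
// sketch, assuming the NVFuser ir_utils header used by this file; the helper
// name is illustrative and not part of the patch:
bool hasReductionOp(Fusion* fusion) {
  // Old form: fusion->hasReduction()
  return ir_utils::getReductionOps(fusion).size() != 0;
}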
Fusion fusion; FusionGuard fg(&fusion); @@ -4295,7 +4374,8 @@ TEST(NVFuserTest, FusionReduction3_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); @@ -4328,7 +4408,7 @@ TEST(NVFuserTest, FusionReduction3_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, {cg_output}); auto aten_output = aten_input.to(at::kDouble).sum({1}); @@ -4337,7 +4417,7 @@ TEST(NVFuserTest, FusionReduction3_CUDA) { &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionReduction4_CUDA) { +TEST_F(NVFuserTest, FusionReduction4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4351,7 +4431,8 @@ TEST(NVFuserTest, FusionReduction4_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - TensorView* tv3 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv2); + TensorView* tv3 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv2); // tv3[I0, R1] = tv2[I0, I1] TensorView* tv4 = makeSymbolicTensor(1); @@ -4393,7 +4474,7 @@ TEST(NVFuserTest, FusionReduction4_CUDA) { at::Tensor t4 = at::randn({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0, t1, t4}); auto cg_outputs = fe.runFusion({t0, t1, t4}); auto t2 = t0.add(t1); @@ -4404,7 +4485,7 @@ TEST(NVFuserTest, FusionReduction4_CUDA) { &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionReduction5_CUDA) { +TEST_F(NVFuserTest, FusionReduction5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4413,7 +4494,8 @@ TEST(NVFuserTest, FusionReduction5_CUDA) { fusion.addInput(tv0); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); @@ -4431,7 +4513,7 @@ TEST(NVFuserTest, FusionReduction5_CUDA) { tv1->axis(0)->parallelize(ParallelType::BIDy); for (auto* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { val->as()->axis(-1)->parallelize(ParallelType::TIDx); } @@ -4446,7 +4528,7 @@ TEST(NVFuserTest, FusionReduction5_CUDA) { at::Tensor cg_output = at::empty({bidy, tidx}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4454,7 +4536,7 @@ TEST(NVFuserTest, FusionReduction5_CUDA) { &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionReduction6_CUDA) { +TEST_F(NVFuserTest, FusionReduction6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4466,10 +4548,13 @@ TEST(NVFuserTest, FusionReduction6_CUDA) { fusion.addInput(tv0); // tv1[I0, R1, R2] = tv0[I0, I1, I2] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1, 2}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1, 2}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).size(), + "Could not detect reduction in fusion."); tv1->split(2, bdimx); // tv1[I0, R1, R2o, R2i{128}] = tv0[I0, I1, I2] @@ -4508,14 +4593,14 @@ 
TEST(NVFuserTest, FusionReduction6_CUDA) { at::Tensor input = at::randn({numel_x, numel_y, numel_z}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1, 2}); testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionMultiGridReduction_CUDA) { +TEST_F(NVFuserTest, FusionMultiGridReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4540,7 +4625,7 @@ TEST(NVFuserTest, FusionMultiGridReduction_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); std::vector aten_outputs = { @@ -4548,7 +4633,7 @@ TEST(NVFuserTest, FusionMultiGridReduction_CUDA) { testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionMultiGridReduction2_CUDA) { +TEST_F(NVFuserTest, FusionMultiGridReduction2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4566,7 +4651,7 @@ TEST(NVFuserTest, FusionMultiGridReduction2_CUDA) { ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } -TEST(NVFuserTest, FusionReductionTFT_CUDA) { +TEST_F(NVFuserTest, FusionReductionTFT_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4575,7 +4660,8 @@ TEST(NVFuserTest, FusionReductionTFT_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); @@ -4613,7 +4699,7 @@ TEST(NVFuserTest, FusionReductionTFT_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -4621,7 +4707,7 @@ TEST(NVFuserTest, FusionReductionTFT_CUDA) { &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionReductionOuterSplit_CUDA) { +TEST_F(NVFuserTest, FusionReductionOuterSplit_CUDA) { // based off FusionReduction4 Fusion fusion; FusionGuard fg(&fusion); @@ -4636,7 +4722,8 @@ TEST(NVFuserTest, FusionReductionOuterSplit_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - TensorView* tv3 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv2); + TensorView* tv3 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv2); // tv3[I0, R1] = tv2[I0, I1] TensorView* tv4 = makeSymbolicTensor(1); @@ -4676,7 +4763,7 @@ TEST(NVFuserTest, FusionReductionOuterSplit_CUDA) { at::Tensor t4 = at::randn({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0, t1, t4}); auto cg_outputs = fe.runFusion({t0, t1, t4}); auto t2 = t0.add(t1); @@ -4687,7 +4774,7 @@ TEST(NVFuserTest, FusionReductionOuterSplit_CUDA) { &fusion, cg_outputs, {t0, t1, t4}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBranches_CUDA) { +TEST_F(NVFuserTest, FusionBranches_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4699,7 +4786,7 @@ TEST(NVFuserTest, FusionBranches_CUDA) { fusion.addInput(tv1); fusion.addInput(tv2); - auto tv3 = add(tv0, new Double(1.0)); + auto tv3 = add(tv0, IrBuilder::create(1.0)); auto tv4 = add(tv3, tv1); auto tv5 = add(tv3, tv2); auto tv6 = add(tv4, tv5); @@ -4735,7 +4822,7 @@ TEST(NVFuserTest, FusionBranches_CUDA) { std::vector aten_inputs = {t0, t1, t2}; 
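// The executor-side change repeated below and throughout these hunks:
// FusionExecutor::compileFusion now receives the runtime inputs (and, when a
// scheduler produced them, the LaunchParams) at compile time, and several
// tests construct the executor only after the inputs exist. A sketch of the
// updated sequence; the two- and three-argument compileFusion calls appear
// verbatim in the hunks, while the helper name and the IValue vector type are
// reconstructions for illustration:
auto compileAndRun(Fusion* fusion, const std::vector<c10::IValue>& aten_inputs) {
  FusionExecutor fe;
  fe.compileFusion(fusion, aten_inputs);  // was: fe.compileFusion(fusion);
  return fe.runFusion(aten_inputs);
}
// With scheduler launch parameters the same calls gain a trailing argument:
//   fe.compileFusion(&fusion, aten_inputs, lparams);
//   fe.runFusion(aten_inputs, lparams);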
- fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto t3 = t0.add(1.0); @@ -4747,14 +4834,14 @@ TEST(NVFuserTest, FusionBranches_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSimpleBCast1_CUDA) { +TEST_F(NVFuserTest, FusionSimpleBCast1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1.5)); + TensorView* tv1 = add(tv0, IrBuilder::create(1.5)); TensorView* tv2 = makeSymbolicTensor(2); fusion.addInput(tv2); @@ -4797,14 +4884,14 @@ TEST(NVFuserTest, FusionSimpleBCast1_CUDA) { std::vector aten_inputs = {t0, t2, t3}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSimpleBCast2_CUDA) { +TEST_F(NVFuserTest, FusionSimpleBCast2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4821,7 +4908,7 @@ TEST(NVFuserTest, FusionSimpleBCast2_CUDA) { TensorView* tv4 = makeSymbolicTensor(2); fusion.addInput(tv4); - TensorView* tv5 = sub(tv4, new Double(0.1)); + TensorView* tv5 = sub(tv4, IrBuilder::create(0.1)); TensorView* tv6 = broadcast(tv5, {true, false, false}); @@ -4856,28 +4943,30 @@ TEST(NVFuserTest, FusionSimpleBCast2_CUDA) { std::vector aten_inputs = {t0, t1, t4}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, {cg_output}); testValidate( &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSimpleBCast3_CUDA) { +TEST_F(NVFuserTest, FusionSimpleBCast3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views std::vector dom; - dom.push_back(new IterDomain(new Int(0), new Int())); - dom.push_back(new IterDomain( - new Int(0), - new Int(1), + dom.push_back(IrBuilder::create( + IrBuilder::create(0), IrBuilder::create())); + dom.push_back(IrBuilder::create( + IrBuilder::create(0), + IrBuilder::create(1), ParallelType::Serial, IterType::BroadcastWithStride)); // tv0[I1, B{1}] - TensorView* tv0 = new TensorView(new TensorDomain(dom), DataType::Float); + TensorView* tv0 = IrBuilder::create( + IrBuilder::create(dom), DataType::Float); fusion.addInput(tv0); // tv1[I0, I1, I2] @@ -4908,26 +4997,28 @@ TEST(NVFuserTest, FusionSimpleBCast3_CUDA) { at::Tensor cg_output = at::empty({x, y, z}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, {cg_output}); testValidate( &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSimpleBCast4_CUDA) { +TEST_F(NVFuserTest, FusionSimpleBCast4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views std::vector dom; - dom.push_back(new IterDomain( - new Int(0), - new Int(1), + dom.push_back(IrBuilder::create( + IrBuilder::create(0), + IrBuilder::create(1), ParallelType::Serial, IterType::BroadcastWithStride)); - dom.push_back(new IterDomain(new Int(0), new Int())); - TensorView* tv0 = new TensorView(new TensorDomain(dom), DataType::Float); + dom.push_back(IrBuilder::create( + IrBuilder::create(0), IrBuilder::create())); + TensorView* tv0 = IrBuilder::create( + IrBuilder::create(dom), DataType::Float); TensorView* tv1 = makeSymbolicTensor(3); 
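// Where tests build TensorViews by hand (the SimpleBCast cases above and
// below), the raw `new IterDomain(...)` / `new TensorDomain(...)` /
// `new TensorView(...)` chains likewise become nested IrBuilder::create
// calls. The template arguments of those create calls are not visible in this
// patch text, so the <IterDomain>, <Int>, <TensorDomain> and <TensorView>
// parameters below are reconstructed from the constructor arguments; the
// helper name is illustrative and not part of the patch:
TensorView* makeBroadcastInputSketch() {
  std::vector<IterDomain*> dom;
  dom.push_back(IrBuilder::create<IterDomain>(
      IrBuilder::create<Int>(0), IrBuilder::create<Int>()));
  dom.push_back(IrBuilder::create<IterDomain>(
      IrBuilder::create<Int>(0),
      IrBuilder::create<Int>(1),
      ParallelType::Serial,
      IterType::BroadcastWithStride));
  return IrBuilder::create<TensorView>(
      IrBuilder::create<TensorDomain>(dom), DataType::Float);
}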
fusion.addInput(tv0); @@ -4963,30 +5054,35 @@ TEST(NVFuserTest, FusionSimpleBCast4_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, {cg_output}); testValidate( &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSimpleBCast5_CUDA) { +TEST_F(NVFuserTest, FusionSimpleBCast5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); constexpr int m = 2, k = 3, n = 4; - auto zero = new Int(0); - auto M = new IterDomain(zero, new Int(m)); - auto K = new IterDomain(zero, new Int(k)); - auto N = new IterDomain(zero, new Int(n)); + auto zero = IrBuilder::create(0); + auto M = IrBuilder::create(zero, IrBuilder::create(m)); + auto K = IrBuilder::create(zero, IrBuilder::create(k)); + auto N = IrBuilder::create(zero, IrBuilder::create(n)); // Set up your input tensor views - TensorView* tv0 = - new TensorView(new TensorDomain({M, K}, {true, true}), DataType::Float); + TensorView* tv0 = IrBuilder::create( + IrBuilder::create( + std::vector({M, K}), std::vector({true, true})), + DataType::Float); // Note: IterDomain must not be reused, so K needs to be cloned. - TensorView* tv1 = new TensorView( - new TensorDomain({K->clone(), N}, {true, true}), DataType::Float); + TensorView* tv1 = IrBuilder::create( + IrBuilder::create( + std::vector({K->clone(), N}), + std::vector({true, true})), + DataType::Float); fusion.addInput(tv0); fusion.addInput(tv1); @@ -5018,21 +5114,21 @@ TEST(NVFuserTest, FusionSimpleBCast5_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, {cg_output}); testValidate( &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionComplexBCast1_CUDA) { +TEST_F(NVFuserTest, FusionComplexBCast1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); int x = 2, y = 3, z = 4; auto tv0 = makeConcreteTensor({y}); - auto tv1 = div(tv0, new Double(2.0)); + auto tv1 = div(tv0, IrBuilder::create(2.0)); auto tv2 = broadcast(tv1, {false, true}); auto tv3 = makeConcreteTensor({y, z}); auto tv4 = mul(tv2, tv3); @@ -5074,21 +5170,21 @@ TEST(NVFuserTest, FusionComplexBCast1_CUDA) { std::vector aten_inputs = {t0, t3, t6}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionComplexBCast2_CUDA) { +TEST_F(NVFuserTest, FusionComplexBCast2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); int x = 2, y = 3, z = 4; auto tv0 = makeConcreteTensor({y, z}); - auto tv1 = div(tv0, new Double(2.0)); + auto tv1 = div(tv0, IrBuilder::create(2.0)); auto tv2 = sum(tv1, {1}); auto tv3 = broadcast(tv2, {true, false}); auto tv4 = makeConcreteTensor({x, y}); @@ -5119,7 +5215,7 @@ TEST(NVFuserTest, FusionComplexBCast2_CUDA) { at::Tensor t4 = at::randn({x, y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0, t4}); auto cg_outputs = fe.runFusion({t0, t4}); auto t1 = t0.div(2.0); @@ -5131,7 +5227,7 @@ TEST(NVFuserTest, FusionComplexBCast2_CUDA) { &fusion, {cg_outputs}, {t0, t4}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedIndexing1_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5143,7 +5239,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing1_CUDA) 
{ fusion.addInput(tv0); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1.0)); + auto tv2 = add(tv0, IrBuilder::create(1.0)); auto tv3 = broadcast(tv2, {true, false, false, false}); auto tv4 = add(tv3, tv1); @@ -5178,14 +5274,14 @@ TEST(NVFuserTest, FusionAdvancedIndexing1_CUDA) { std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedIndexing2_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5197,7 +5293,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing2_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1.0)); + auto tv2 = add(tv0, IrBuilder::create(1.0)); auto tv3 = broadcast(tv2, {true, false, false, false}); auto tv4 = add(tv3, tv1); @@ -5232,14 +5328,14 @@ TEST(NVFuserTest, FusionAdvancedIndexing2_CUDA) { std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedIndexing3_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5250,7 +5346,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing3_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1.0)); + auto tv2 = add(tv0, IrBuilder::create(1.0)); auto tv3 = add(tv2, tv1); fusion.addOutput(tv3); @@ -5266,14 +5362,14 @@ TEST(NVFuserTest, FusionAdvancedIndexing3_CUDA) { auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto cg_outputs = fe.runFusion(aten_inputs, lparams); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedIndexing4_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5283,7 +5379,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing4_CUDA) { TensorView* tv1 = makeConcreteTensor({4, 4, 8}); fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(1)); + TensorView* tv2 = add(tv0, IrBuilder::create(1)); TensorView* tv3 = broadcast(tv2, {true, false, false}); TensorView* tv4 = add(tv3, tv1); fusion.addOutput(tv4); @@ -5298,14 +5394,14 @@ TEST(NVFuserTest, FusionAdvancedIndexing4_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedIndexing5_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5315,7 +5411,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing5_CUDA) { TensorView* tv1 = makeSymbolicTensor(3); fusion.addInput(tv1); - TensorView* tv2 = add(tv0, new Double(1)); + TensorView* tv2 = add(tv0, IrBuilder::create(1)); TensorView* tv3 = broadcast(tv2, {true, false, true}); TensorView* tv4 = add(tv3, tv1); fusion.addOutput(tv4); @@ -5336,14 +5432,14 @@ TEST(NVFuserTest, FusionAdvancedIndexing5_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto 
cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedIndexing6_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5371,7 +5467,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing6_CUDA) { scheduleReduction(&fusion, reduction_params.value()); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input0, input1}, reduction_params.value().lparams); auto cg_outputs = fe.runFusion({input0, input1}, reduction_params.value().lparams); @@ -5388,7 +5484,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing6_CUDA) { reduction_params.value().lparams); } -TEST(NVFuserTest, FusionAdvancedIndexing7_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing7_CUDA) { // Might be able to use this one without 6 as the heuristics in 6 may change // and this test is to cover the same issue. Fusion fusion; @@ -5417,15 +5513,14 @@ TEST(NVFuserTest, FusionAdvancedIndexing7_CUDA) { tv4->axis(0)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int numel_x = 100; const int numel_y = 200; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto at_t0 = at::randn({numel_x}, options); auto at_t1 = at::randn({numel_x, numel_y}, options); + FusionExecutor fe; + fe.compileFusion(&fusion, {at_t0, at_t1}); auto cg_outputs = fe.runFusion({at_t0, at_t1}); auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) @@ -5436,7 +5531,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing7_CUDA) { &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedIndexing8_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing8_CUDA) { // Same as 7 but with outer splits instead of inner Fusion fusion; FusionGuard fg(&fusion); @@ -5464,15 +5559,14 @@ TEST(NVFuserTest, FusionAdvancedIndexing8_CUDA) { tv4->axis(0)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int numel_x = 100; const int numel_y = 200; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto at_t0 = at::randn({numel_x}, options); auto at_t1 = at::randn({numel_x, numel_y}, options); + FusionExecutor fe; + fe.compileFusion(&fusion, {at_t0, at_t1}); auto cg_outputs = fe.runFusion({at_t0, at_t1}); auto aten_output = (at_t0.unsqueeze(-1).expand({numel_x, numel_y}) + at_t1) @@ -5483,7 +5577,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing8_CUDA) { &fusion, cg_outputs, {at_t0, at_t1}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedIndexing9_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing9_CUDA) { // Same as 7 but with outer splits instead of inner Fusion fusion; FusionGuard fg(&fusion); @@ -5493,7 +5587,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing9_CUDA) { auto tv1 = broadcast(tv0, {false, true}); - auto tv2 = mul(tv1, new Double(2)); + auto tv2 = mul(tv1, IrBuilder::create(2)); fusion.addOutput(tv2); auto tv3 = makeSymbolicTensor(3); @@ -5513,7 +5607,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing9_CUDA) { auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto cg_outputs = fe.runFusion(aten_inputs, lparams); auto at_t1 = at_t0.unsqueeze(-1); @@ -5525,7 +5619,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing9_CUDA) { &fusion, cg_outputs, aten_inputs, {at_t2, at_t4}, __LINE__, __FILE__); } -TEST(NVFuserTest, 
FusionAdvancedIndexing10_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing10_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5539,7 +5633,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing10_CUDA) { // Do math with it, it returns a `Val*` but can be static_casted back to // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); // Register your outputs @@ -5575,7 +5669,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing10_CUDA) { at::Tensor output = at::empty_like(input1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input1, input2}); fe.runFusion({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; @@ -5584,7 +5678,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing10_CUDA) { TORCH_CHECK(output_ref.equal(output)); } -TEST(NVFuserTest, FusionAdvancedIndexing11_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedIndexing11_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5596,7 +5690,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing11_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - auto tv2 = add(tv1, new Double(1.0)); + auto tv2 = add(tv1, IrBuilder::create(1.0)); auto tv3 = broadcast(tv2, {true, false, true, true}); auto tv4 = add(tv3, tv0); @@ -5631,7 +5725,7 @@ TEST(NVFuserTest, FusionAdvancedIndexing11_CUDA) { std::vector aten_inputs = {t0, t1}; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( @@ -5639,16 +5733,16 @@ TEST(NVFuserTest, FusionAdvancedIndexing11_CUDA) { } // Intended to stress the lowering of our code generator -TEST(NVFuserTest, FusionAdvancedLowering1_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedLowering1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeConcreteTensor({9, 5}); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv1, new Double(3)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(2)); + TensorView* tv3 = add(tv1, IrBuilder::create(3)); TensorView* tv4 = sum(tv3, {1}); fusion.addOutput(tv2); @@ -5671,15 +5765,14 @@ TEST(NVFuserTest, FusionAdvancedLowering1_CUDA) { std::vector aten_outputs = {t2, t4}; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedLowering2_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedLowering2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5691,7 +5784,7 @@ TEST(NVFuserTest, FusionAdvancedLowering2_CUDA) { TensorView* tv2 = makeSymbolicTensor(3); fusion.addInput(tv2); - TensorView* tv3 = add(tv0, new Double(1)); + TensorView* tv3 = add(tv0, IrBuilder::create(1)); TensorView* tv4 = broadcast(tv3, {false, true}); TensorView* tv5 = add(tv4, tv1); TensorView* tv6 = add(tv5, tv2); @@ -5727,8 +5820,7 @@ TEST(NVFuserTest, FusionAdvancedLowering2_CUDA) { std::vector aten_outputs = {t6}; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( @@ -5736,7 +5828,7 @@ TEST(NVFuserTest, FusionAdvancedLowering2_CUDA) { } // TODO: Complete test -TEST(NVFuserTest, FusionAdvancedLowering3_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedLowering3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5746,13 
+5838,13 @@ TEST(NVFuserTest, FusionAdvancedLowering3_CUDA) { fusion.addInput(tv1); // [b0, i1] - auto tv2 = add(tv0, new Double(2.0)); + auto tv2 = add(tv0, IrBuilder::create(2.0)); // [i0, i1] - auto tv3 = add(tv1, new Double(3.0)); + auto tv3 = add(tv1, IrBuilder::create(3.0)); // [b0, i1] - auto tv4 = add(tv2, new Double(4.0)); + auto tv4 = add(tv2, IrBuilder::create(4.0)); // [io, i1] auto tv5 = add(tv2, tv3); @@ -5776,8 +5868,7 @@ TEST(NVFuserTest, FusionAdvancedLowering3_CUDA) { std::vector aten_outputs = {t4, t5}; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( @@ -5787,7 +5878,7 @@ TEST(NVFuserTest, FusionAdvancedLowering3_CUDA) { // This excercises indexing with broadcast root axes. Non-broadcast // axes need to be preferred when propagating index exprs to root // axes. See, e.g., Index::getConsumerIndex_impl. -TEST(NVFuserTest, FusionAdvancedLowering4_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedLowering4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5804,9 +5895,6 @@ TEST(NVFuserTest, FusionAdvancedLowering4_CUDA) { tv4->split(0, 8); tv0->computeAt(tv4, 1); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); const int bx = 10; const int by = 20; @@ -5815,6 +5903,8 @@ TEST(NVFuserTest, FusionAdvancedLowering4_CUDA) { at::Tensor t3 = at::randn({bx, by, bz}, options); std::vector aten_inputs = {t0, t3}; + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = @@ -5824,7 +5914,7 @@ TEST(NVFuserTest, FusionAdvancedLowering4_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedLowering5_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedLowering5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5854,15 +5944,14 @@ TEST(NVFuserTest, FusionAdvancedLowering5_CUDA) { std::vector aten_outputs = {t3}; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedLowering6_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedLowering6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5902,8 +5991,7 @@ TEST(NVFuserTest, FusionAdvancedLowering6_CUDA) { std::vector aten_outputs = {t5, t7}; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( @@ -5911,7 +5999,7 @@ TEST(NVFuserTest, FusionAdvancedLowering6_CUDA) { } // Test a simple Gemm but also play around with fusion executor features -TEST(NVFuserTest, FusionSimpleGemm_CUDA) { +TEST_F(NVFuserTest, FusionSimpleGemm_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -5978,7 +6066,7 @@ TEST(NVFuserTest, FusionSimpleGemm_CUDA) { at::Tensor t1 = at::randn({K, N}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); // Lets specify a few bounds in launch params to make sure it works fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); @@ -5996,7 +6084,7 @@ TEST(NVFuserTest, FusionSimpleGemm_CUDA) { } // Softmax with a 1D tensor. Parallelized only with a single thread block. 
-TEST(NVFuserTest, FusionSoftmax1D_CUDA) { +TEST_F(NVFuserTest, FusionSoftmax1D_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6042,7 +6130,7 @@ TEST(NVFuserTest, FusionSoftmax1D_CUDA) { at::Tensor t3_output = at::empty_like(cg_output, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); fe.runFusion({t0}, {cg_output}); auto aten_output = at::_softmax(t0.to(at::kDouble), -1, false); @@ -6051,7 +6139,7 @@ TEST(NVFuserTest, FusionSoftmax1D_CUDA) { } // Softmax with a 1D tensor with input normalization. -TEST(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { +TEST_F(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6063,8 +6151,8 @@ TEST(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { fusion.addInput(input_tv0); // Normalize with the max value before computing exp. - TensorView* max_val_tv1 = - reductionOp(BinaryOpType::Max, {-1}, new Double(0), input_tv0); + TensorView* max_val_tv1 = reductionOp( + BinaryOpType::Max, {-1}, IrBuilder::create(0), input_tv0); TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {true}); TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2); TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3); @@ -6111,7 +6199,7 @@ TEST(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { at::Tensor t3_output = at::empty({dimx}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -6121,7 +6209,7 @@ TEST(NVFuserTest, FusionSoftmax1DNormalized_CUDA) { // Softmax with a 3D tensor, where the inner-most 3rd dimension is // normalized. Pallelized with multiple thread blocks. -TEST(NVFuserTest, FusionSoftmax3D_CUDA) { +TEST_F(NVFuserTest, FusionSoftmax3D_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6171,7 +6259,7 @@ TEST(NVFuserTest, FusionSoftmax3D_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -6181,7 +6269,7 @@ TEST(NVFuserTest, FusionSoftmax3D_CUDA) { } // Softmax with a 3D tensor with input normalization. -TEST(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { +TEST_F(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6195,8 +6283,8 @@ TEST(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { fusion.addInput(input_tv0); // Normalize with the max value before computing exp. 
- TensorView* max_val_tv1 = - reductionOp(BinaryOpType::Max, {-1}, new Double(0), input_tv0); + TensorView* max_val_tv1 = reductionOp( + BinaryOpType::Max, {-1}, IrBuilder::create(0), input_tv0); TensorView* bcast_max_tv2 = broadcast(max_val_tv1, {false, false, true}); TensorView* sub_tv3 = sub(input_tv0, bcast_max_tv2); TensorView* exp_tv4 = unaryOp(UnaryOpType::Exp, sub_tv3); @@ -6246,7 +6334,7 @@ TEST(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { at::Tensor t3_output = at::empty({dimx, dimy, dimz}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); auto aten_output = at::_softmax(input.to(at::kDouble), -1, false); @@ -6254,7 +6342,7 @@ TEST(NVFuserTest, FusionSoftmax3DNormalized_CUDA) { testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSoftmaxComputeAt_CUDA) { +TEST_F(NVFuserTest, FusionSoftmaxComputeAt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6265,7 +6353,7 @@ TEST(NVFuserTest, FusionSoftmaxComputeAt_CUDA) { auto tv1 = sum(tv0, {1}); auto tv2 = broadcast(tv1, {false, true}); - auto tv3 = add(tv0, new Double(1.0)); + auto tv3 = add(tv0, IrBuilder::create(1.0)); auto tv4 = mul(tv2, tv3); @@ -6280,10 +6368,7 @@ TEST(NVFuserTest, FusionSoftmaxComputeAt_CUDA) { } // Similar to FusionReduction but uses grid reduction -TEST(NVFuserTest, FusionGridReduction1_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionGridReduction1_CUDA) { const int gdimx = 32; const int bdimx = 128; @@ -6295,10 +6380,13 @@ TEST(NVFuserTest, FusionGridReduction1_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).size(), + "Could not detect reduction in fusion."); tv1->split(1, bdimx); // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] @@ -6331,7 +6419,7 @@ TEST(NVFuserTest, FusionGridReduction1_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -6341,10 +6429,7 @@ TEST(NVFuserTest, FusionGridReduction1_CUDA) { } // Same test as the above but uses BIDy and TIDx for reduction -TEST(NVFuserTest, FusionGridReduction2_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionGridReduction2_CUDA) { const int gdimy = 32; const int bdimx = 128; @@ -6356,10 +6441,13 @@ TEST(NVFuserTest, FusionGridReduction2_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).size(), + "Could not detect reduction in fusion."); tv1->split(1, bdimx); // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] @@ -6391,7 +6479,7 @@ TEST(NVFuserTest, FusionGridReduction2_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, 
{input}); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -6400,7 +6488,7 @@ TEST(NVFuserTest, FusionGridReduction2_CUDA) { } // Same test but uses BIDy and BIDz for reduction. No TID used. -TEST(NVFuserTest, FusionGridReduction3dim1_CUDA) { +TEST_F(NVFuserTest, FusionGridReduction3dim1_CUDA) { // Grid reductions when there aren't any threads are serial reductions // keep these numbers low so our error isn't too high compared to normal cuda // reductions @@ -6415,10 +6503,13 @@ TEST(NVFuserTest, FusionGridReduction3dim1_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).size(), + "Could not detect reduction in fusion."); tv1->split(1, gdimy); // tv1[I0, R1o, R1i{128}] = tv0[I0, I1] @@ -6450,7 +6541,7 @@ TEST(NVFuserTest, FusionGridReduction3dim1_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -6459,7 +6550,7 @@ TEST(NVFuserTest, FusionGridReduction3dim1_CUDA) { } // Same as testGPU_FusionGridReduction3dim1 but reduces dimension 0 -TEST(NVFuserTest, FusionGridReduction3dim0_CUDA) { +TEST_F(NVFuserTest, FusionGridReduction3dim0_CUDA) { // Grid reductions when there aren't any threads are serial reductions // keep these numbers low so our error isn't too high compared to normal cuda // reductions @@ -6474,10 +6565,13 @@ TEST(NVFuserTest, FusionGridReduction3dim0_CUDA) { fusion.addInput(tv0); // tv1[R0, I1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {0}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {0}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).size(), + "Could not detect reduction in fusion."); tv1->split(0, gdimy); // tv1[R0o, R0i{128}, I1] = tv0[I0, I1] @@ -6507,7 +6601,7 @@ TEST(NVFuserTest, FusionGridReduction3dim0_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -6516,7 +6610,7 @@ TEST(NVFuserTest, FusionGridReduction3dim0_CUDA) { } // This is similar to the FusionReduction, but swaps BIDx and TIDx -TEST(NVFuserTest, FusionGridReduction4_CUDA) { +TEST_F(NVFuserTest, FusionGridReduction4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6528,10 +6622,13 @@ TEST(NVFuserTest, FusionGridReduction4_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).size(), + "Could not detect reduction in fusion."); tv1->split(1, gdimx); // tv1[I0, R1o, R1i{1024}] = tv0[I0, I1] @@ -6570,7 +6667,7 @@ TEST(NVFuserTest, FusionGridReduction4_CUDA) { 
at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -6580,7 +6677,7 @@ TEST(NVFuserTest, FusionGridReduction4_CUDA) { // Grid reduction with 2D thread blocks but only TIDx and BIDx are // mapped to a reduction dim -TEST(NVFuserTest, FusionGridReduction5_CUDA) { +TEST_F(NVFuserTest, FusionGridReduction5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6593,10 +6690,13 @@ TEST(NVFuserTest, FusionGridReduction5_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).size(), + "Could not detect reduction in fusion."); tv1->split(1, bdimx); // tv1[I0, R1o, R1i{64}] = tv0[I0, I1] @@ -6624,7 +6724,7 @@ TEST(NVFuserTest, FusionGridReduction5_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({1}); @@ -6632,7 +6732,7 @@ TEST(NVFuserTest, FusionGridReduction5_CUDA) { } // Similar to FusionGridReduction1 but with 3D tensors -TEST(NVFuserTest, FusionGridReduction6_CUDA) { +TEST_F(NVFuserTest, FusionGridReduction6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6641,10 +6741,13 @@ TEST(NVFuserTest, FusionGridReduction6_CUDA) { fusion.addInput(tv0); // tv1[I0, R1, R2] = tv0[I0, I1, I2] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1, 2}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1, 2}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(fusion.hasReduction(), "Could not detect reduction in fusion."); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).size(), + "Could not detect reduction in fusion."); // Splitting for TID tv1->split(2, 128); @@ -6686,7 +6789,7 @@ TEST(NVFuserTest, FusionGridReduction6_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({1, 2}); @@ -6696,7 +6799,7 @@ TEST(NVFuserTest, FusionGridReduction6_CUDA) { } // See issue #1049 -TEST(NVFuserTest, FusionGridReduction7_CUDA) { +TEST_F(NVFuserTest, FusionGridReduction7_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6718,7 +6821,7 @@ TEST(NVFuserTest, FusionGridReduction7_CUDA) { at::Tensor cg_output = at::empty({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto out = fe.runFusion({input}); auto aten_output = input.sum({0}); @@ -6726,7 +6829,7 @@ TEST(NVFuserTest, FusionGridReduction7_CUDA) { testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGridReduction8_CUDA) { +TEST_F(NVFuserTest, FusionGridReduction8_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6746,7 +6849,7 @@ TEST(NVFuserTest, FusionGridReduction8_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto out = fe.runFusion({input}); auto aten_output = 
input.sum({0}); @@ -6754,10 +6857,7 @@ TEST(NVFuserTest, FusionGridReduction8_CUDA) { testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGridReduction9_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionGridReduction9_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6788,7 +6888,7 @@ TEST(NVFuserTest, FusionGridReduction9_CUDA) { at::ArrayRef aten_inputs = {t0, t2}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_output = fe.runFusion(aten_inputs); auto aten_output = t0.sum({1}).add(t2); @@ -6796,7 +6896,7 @@ TEST(NVFuserTest, FusionGridReduction9_CUDA) { testValidate(&fusion, cg_output, {t0, t2}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGridReduction10_CUDA) { +TEST_F(NVFuserTest, FusionGridReduction10_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6831,7 +6931,7 @@ TEST(NVFuserTest, FusionGridReduction10_CUDA) { at::Tensor t0 = at::randn({numel_w, numel_x, numel_y, numel_z}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto cg_output = fe.runFusion({t0}); auto aten_output = t0.sum({1, 2, 3}); @@ -6839,7 +6939,7 @@ TEST(NVFuserTest, FusionGridReduction10_CUDA) { testValidate(&fusion, cg_output, {t0}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionNonRedAxisBind_CUDA) { +TEST_F(NVFuserTest, FusionNonRedAxisBind_CUDA) { int bid_x = 3; int tid_x = 2; int red_dim = 0; @@ -6851,8 +6951,8 @@ TEST(NVFuserTest, FusionNonRedAxisBind_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); + TensorView* tv1 = reductionOp( + BinaryOpType::Add, {red_dim}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); tv1->split(-1, tid_x); @@ -6863,7 +6963,7 @@ TEST(NVFuserTest, FusionNonRedAxisBind_CUDA) { at::Tensor input = at::randn({16, bid_x * tid_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); auto aten_output = input.to(at::kDouble).sum({red_dim}); @@ -6871,7 +6971,7 @@ TEST(NVFuserTest, FusionNonRedAxisBind_CUDA) { testValidate(&fusion, cg_outputs, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSplitBCast_CUDA) { +TEST_F(NVFuserTest, FusionSplitBCast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6881,8 +6981,8 @@ TEST(NVFuserTest, FusionSplitBCast_CUDA) { fusion.addInput(input_tv0); fusion.addInput(input_tv1); - TensorView* sum_tv2 = - reductionOp(BinaryOpType::Add, {2}, new Double(0), input_tv0); + TensorView* sum_tv2 = reductionOp( + BinaryOpType::Add, {2}, IrBuilder::create(0), input_tv0); TensorView* bcast_tv3 = broadcast(sum_tv2, {false, false, true}); TensorView* output_tv4 = div(input_tv1, bcast_tv3); @@ -6915,11 +7015,11 @@ TEST(NVFuserTest, FusionSplitBCast_CUDA) { at::Tensor cg_output = at::empty({32, 32, 128}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0, t1}); fe.runFusion({t0, t1}, {cg_output}); } -TEST(NVFuserTest, FusionBCastInnerDim_CUDA) { +TEST_F(NVFuserTest, FusionBCastInnerDim_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -6933,7 +7033,7 @@ TEST(NVFuserTest, FusionBCastInnerDim_CUDA) { TORCH_CHECK(!tv2->axis(0)->isReduction() && tv2->axis(1)->isBroadcast()); } -TEST(NVFuserTest, FusionBCastReduce_CUDA) { +TEST_F(NVFuserTest, FusionBCastReduce_CUDA) { Fusion fusion; FusionGuard 
fg(&fusion); @@ -6949,14 +7049,16 @@ TEST(NVFuserTest, FusionBCastReduce_CUDA) { // Multiple consumer reduction with computeAt // https://github.com/csarofeen/pytorch/issues/110 -TEST(NVFuserTest, FusionReductionMultiConsumer_CUDA) { +TEST_F(NVFuserTest, FusionReductionMultiConsumer_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = unaryOp(UnaryOpType::Exp, tv0); - auto tv2 = reductionOp(BinaryOpType::Max, {-1}, new Double(0), tv1); - auto tv3 = reductionOp(BinaryOpType::Min, {-1}, new Double(0), tv1); + auto tv2 = + reductionOp(BinaryOpType::Max, {-1}, IrBuilder::create(0), tv1); + auto tv3 = + reductionOp(BinaryOpType::Min, {-1}, IrBuilder::create(0), tv1); auto tv4 = add(tv2, tv3); fusion.addOutput(tv4); tv1->computeAt(tv2, -1, ComputeAtMode::BestEffort); @@ -6964,7 +7066,7 @@ TEST(NVFuserTest, FusionReductionMultiConsumer_CUDA) { TORCH_CHECK(tv1->getComputeAtPosition() == 2); } -TEST(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { for (const auto i : c10::irange(2)) { Fusion fusion; FusionGuard fg(&fusion); @@ -6973,8 +7075,8 @@ TEST(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); TensorView* tv3 = add(tv1, tv2); // Set outputs tv2 or tv1 and then tv3 if (i == 0) { @@ -6996,7 +7098,7 @@ TEST(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { aten_input + 1, (aten_input + 1) * 2}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( @@ -7004,7 +7106,7 @@ TEST(NVFuserTest, FusionComputeAtExprOrder1_CUDA) { } } -TEST(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7012,8 +7114,8 @@ TEST(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); TensorView* tv3 = add(tv1, tv2); fusion.addOutput(tv3); @@ -7029,14 +7131,14 @@ TEST(NVFuserTest, FusionComputeAtExprOrder2_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, {cg_output}); testValidate( &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7045,10 +7147,10 @@ TEST(NVFuserTest, FusionComputeAtExprOrder3_CUDA) { TensorView* tv0 = makeConcreteTensor({dimx, dimy}); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv2, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(2)); + TensorView* tv3 = add(tv2, IrBuilder::create(3)); + TensorView* tv4 = add(tv3, IrBuilder::create(4)); TensorView* tv5 = mul(tv2, tv4); fusion.addOutput(tv5); @@ -7065,14 +7167,14 @@ TEST(NVFuserTest, 
FusionComputeAtExprOrder3_CUDA) { auto aten_output = t2.mul(t4); torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionZeroDimComputeAt_CUDA) { +TEST_F(NVFuserTest, FusionZeroDimComputeAt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7080,7 +7182,7 @@ TEST(NVFuserTest, FusionZeroDimComputeAt_CUDA) { fusion.addInput(tv0); auto tv1 = sum(tv0, {0}); - auto tv2 = add(tv1, new Double(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); TORCH_CHECK(tv2->nDims() == 0); tv1->computeAt(tv2, 0); @@ -7090,14 +7192,14 @@ TEST(NVFuserTest, FusionZeroDimComputeAt_CUDA) { auto aten_output = aten_input.to(at::kDouble).sum() + 1; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionZeroDimBroadcast_CUDA) { +TEST_F(NVFuserTest, FusionZeroDimBroadcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7130,14 +7232,14 @@ TEST(NVFuserTest, FusionZeroDimBroadcast_CUDA) { at::Tensor cg_output = at::empty({}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, {cg_output}); testValidate( &fusion, {cg_output}, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionZeroDimReduction_CUDA) { +TEST_F(NVFuserTest, FusionZeroDimReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7166,14 +7268,14 @@ TEST(NVFuserTest, FusionZeroDimReduction_CUDA) { at::Tensor cg_output = at::empty({}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, {cg_output}); testValidate( &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBCastAfterReduce_CUDA) { +TEST_F(NVFuserTest, FusionBCastAfterReduce_CUDA) { Fusion fusion; FusionGuard fg(&fusion); const int tidx = 128; @@ -7218,14 +7320,14 @@ TEST(NVFuserTest, FusionBCastAfterReduce_CUDA) { std::vector aten_inputs = {t0, t4}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0, t4}); auto cg_outputs = fe.runFusion({t0, t4}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionOutputBroadcast_CUDA) { +TEST_F(NVFuserTest, FusionOutputBroadcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7243,15 +7345,14 @@ TEST(NVFuserTest, FusionOutputBroadcast_CUDA) { auto aten_output = aten_input.unsqueeze(2).unsqueeze(1).unsqueeze(0); FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { +TEST_F(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7270,15 +7371,14 @@ TEST(NVFuserTest, FusionReductionKeepDimBasic_CUDA) { aten_input.to(at::kDouble).sum({0, 2, -1}, /*keepdim=*/true); FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, 
__FILE__); } -TEST(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) { +TEST_F(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) { constexpr int bid_x = 80; constexpr int tid_x = 4096; constexpr int red_dim = 1; @@ -7291,7 +7391,11 @@ TEST(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) { fusion.addInput(tv0); TensorView* tv1 = reductionOp( - BinaryOpType::Add, {red_dim}, new Double(0), tv0, /*keep_dim=*/true); + BinaryOpType::Add, + {red_dim}, + IrBuilder::create(0), + tv0, + /*keep_dim=*/true); fusion.addOutput(tv1); @@ -7307,11 +7411,10 @@ TEST(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) { TORCH_CHECK(reduction_params, "Reduction schedule was not generated!"); scheduleReduction(&fusion, reduction_params.value()); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto lparams = reduction_params.value().lparams; + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}, lparams); auto cg_outputs = fe.runFusion({aten_input}, lparams); testValidate( @@ -7325,7 +7428,7 @@ TEST(NVFuserTest, FusionReductionKeepDimScheduler_CUDA) { lparams); } -TEST(NVFuserTest, FusionSumTo_CUDA) { +TEST_F(NVFuserTest, FusionSumTo_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7340,7 +7443,7 @@ TEST(NVFuserTest, FusionSumTo_CUDA) { sum_to_shape.begin(), sum_to_shape.end(), std::back_inserter(sum_to_symb), - [](int s) -> Int* { return new Int(s); }); + [](int s) -> Int* { return IrBuilder::create(s); }); TensorView* tv0 = makeConcreteTensor(tensor_shape); fusion.addInput(tv0); @@ -7355,8 +7458,7 @@ TEST(NVFuserTest, FusionSumTo_CUDA) { auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref); FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); TORCH_CHECK( @@ -7367,7 +7469,7 @@ TEST(NVFuserTest, FusionSumTo_CUDA) { &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSumToNoop_CUDA) { +TEST_F(NVFuserTest, FusionSumToNoop_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7382,7 +7484,7 @@ TEST(NVFuserTest, FusionSumToNoop_CUDA) { sum_to_shape.begin(), sum_to_shape.end(), std::back_inserter(sum_to_symb), - [](int s) -> Int* { return new Int(s); }); + [](int s) -> Int* { return IrBuilder::create(s); }); TensorView* tv0 = makeConcreteTensor(tensor_shape); fusion.addInput(tv0); @@ -7390,7 +7492,7 @@ TEST(NVFuserTest, FusionSumToNoop_CUDA) { TensorView* tv1 = sum_to(tv0, sum_to_symb); // Dummy operator to avoid tv0 both input and output - TensorView* tv2 = add(tv1, new Double(0)); + TensorView* tv2 = add(tv1, IrBuilder::create(0)); fusion.addOutput(tv2); const auto options = @@ -7399,8 +7501,7 @@ TEST(NVFuserTest, FusionSumToNoop_CUDA) { at::Tensor aten_input = at::randn(tensor_shape_ref, options); FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); auto aten_output = at::sum_to(aten_input.to(at::kDouble), sum_to_shape_ref); @@ -7412,7 +7513,7 @@ TEST(NVFuserTest, FusionSumToNoop_CUDA) { &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionReductionScheduler_CUDA) { +TEST_F(NVFuserTest, FusionReductionScheduler_CUDA) { constexpr int bid_x = 80; constexpr int tid_x = 4096; constexpr int red_dim = 1; @@ -7424,8 +7525,8 @@ TEST(NVFuserTest, FusionReductionScheduler_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); 
+ TensorView* tv1 = reductionOp( + BinaryOpType::Add, {red_dim}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); const auto options = @@ -7442,7 +7543,7 @@ TEST(NVFuserTest, FusionReductionScheduler_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); // no broadcasting needed, omitting the last optional argument; auto cg_outputs = fe.runFusion({aten_input}, lparams); @@ -7458,7 +7559,7 @@ TEST(NVFuserTest, FusionReductionScheduler_CUDA) { } // Simple reduction parallelized on a symbolic size. -TEST(NVFuserTest, FusionSymbolicReduction_CUDA) { +TEST_F(NVFuserTest, FusionSymbolicReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7467,7 +7568,8 @@ TEST(NVFuserTest, FusionSymbolicReduction_CUDA) { fusion.addInput(tv0); // tv1[I0, R1] = tv0[I0, I1] - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); // Interface should just be a direct split with a Parallel type. We can @@ -7501,7 +7603,7 @@ TEST(NVFuserTest, FusionSymbolicReduction_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); auto cg_outputs = fe.runFusion({aten_input}, lparams); testValidate( @@ -7515,7 +7617,7 @@ TEST(NVFuserTest, FusionSymbolicReduction_CUDA) { lparams); } -TEST(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { +TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { const std::vector red_dims = {0, 2}; // Copy is because CodeGen requires int and Pytorch requires int64_t // for a vector of reduction dimensions @@ -7530,8 +7632,8 @@ TEST(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); fusion.addInput(tv0); - TensorView* tv1 = - reductionOp(BinaryOpType::Add, red_dims, new Double(0), tv0); + TensorView* tv1 = reductionOp( + BinaryOpType::Add, red_dims, IrBuilder::create(0), tv0); fusion.addOutput(tv1); const auto options = @@ -7547,7 +7649,7 @@ TEST(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); fe.runFusion({aten_input}, {cg_output}, lparams); testValidate( @@ -7561,7 +7663,7 @@ TEST(NVFuserTest, FusionReductionSchedulerMultiDimNonFastest_CUDA) { lparams); } -TEST(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { +TEST_F(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { const std::vector red_dims = {1, 3}; // Copy is because CodeGen requires int and Pytorch requires int64_t // for a vector of reduction dimensions @@ -7575,8 +7677,8 @@ TEST(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); fusion.addInput(tv0); - TensorView* tv1 = - reductionOp(BinaryOpType::Add, red_dims, new Double(0), tv0); + TensorView* tv1 = reductionOp( + BinaryOpType::Add, red_dims, IrBuilder::create(0), tv0); fusion.addOutput(tv1); const auto options = @@ -7590,7 +7692,7 @@ TEST(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); auto cg_outputs = 
fe.runFusion({aten_input}, lparams); testValidate( @@ -7604,7 +7706,7 @@ TEST(NVFuserTest, FusionReductionSchedulerMultiDimFastest_CUDA) { lparams); } -TEST(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) { +TEST_F(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) { std::vector dtypes = { DataType::Double, DataType::Float, DataType::Half}; #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 @@ -7661,8 +7763,7 @@ TEST(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, {aten_input}, lparams); auto cg_outputs = fe.runFusion({aten_input}, lparams); testValidate( @@ -7678,7 +7779,7 @@ TEST(NVFuserTest, FusionReductionSchedulerNoODimShmoo_CUDA) { } } -TEST(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) { +TEST_F(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) { std::vector dtypes = { DataType::Double, DataType::Float, DataType::Half}; #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 @@ -7740,8 +7841,7 @@ TEST(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, {aten_input}, lparams); auto cg_outputs = fe.runFusion({aten_input}, lparams); auto aten_output = aten_input.to(at::kDouble).sum({axis}); testValidate( @@ -7759,14 +7859,14 @@ TEST(NVFuserTest, FusionReductionSchedulerDimShmoo_CUDA) { } } -TEST(NVFuserTest, FusionCacheBefore_CUDA) { +TEST_F(NVFuserTest, FusionCacheBefore_CUDA) { // TVM Cache Write Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = add(tv0, new Double(1.0)); - TensorView* tv2 = mul(tv1, new Double(3.0)); + TensorView* tv1 = add(tv0, IrBuilder::create(1.0)); + TensorView* tv2 = mul(tv1, IrBuilder::create(3.0)); fusion.addInput(tv0); fusion.addOutput(tv2); @@ -7790,21 +7890,21 @@ TEST(NVFuserTest, FusionCacheBefore_CUDA) { at::Tensor aten_output = (aten_input + 1.0) * 3.0; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionCacheAfter_CUDA) { +TEST_F(NVFuserTest, FusionCacheAfter_CUDA) { // TVM Cache Read Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = add(tv0, new Double(1.0)); - TensorView* tv2 = mul(tv1, new Double(3.0)); + TensorView* tv1 = add(tv0, IrBuilder::create(1.0)); + TensorView* tv2 = mul(tv1, IrBuilder::create(3.0)); fusion.addInput(tv0); fusion.addOutput(tv2); @@ -7828,20 +7928,20 @@ TEST(NVFuserTest, FusionCacheAfter_CUDA) { at::Tensor aten_output = (aten_input + 1.0) * 3.0; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionCacheFork_CUDA) { +TEST_F(NVFuserTest, FusionCacheFork_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = add(tv0, new Double(1.0)); - TensorView* tv2 = mul(tv1, new Double(3.0)); + TensorView* tv1 = add(tv0, IrBuilder::create(1.0)); + TensorView* tv2 = mul(tv1, IrBuilder::create(3.0)); fusion.addInput(tv0); fusion.addOutput(tv1); fusion.addOutput(tv2); @@ -7873,7 +7973,7 @@ TEST(NVFuserTest, FusionCacheFork_CUDA) { 
at::Tensor aten_output2 = aten_output1 * 3.0; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( @@ -7885,7 +7985,7 @@ TEST(NVFuserTest, FusionCacheFork_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionCacheIndirect_CUDA) { +TEST_F(NVFuserTest, FusionCacheIndirect_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7927,14 +8027,14 @@ TEST(NVFuserTest, FusionCacheIndirect_CUDA) { at::Tensor aten_output = (t1 + (t2 - t3)) - t0; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionCacheBcast_CUDA) { +TEST_F(NVFuserTest, FusionCacheBcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -7986,22 +8086,22 @@ TEST(NVFuserTest, FusionCacheBcast_CUDA) { t0.to(at::kDouble).unsqueeze(1).matmul(t1.to(at::kDouble).unsqueeze(0)); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionCacheMultiConsumer_CUDA) { +TEST_F(NVFuserTest, FusionCacheMultiConsumer_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(1); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv0, new Double(1)); - TensorView* tv4 = add(tv3, new Double(2)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(2)); + TensorView* tv3 = add(tv0, IrBuilder::create(1)); + TensorView* tv4 = add(tv3, IrBuilder::create(2)); fusion.addInput(tv0); fusion.addOutput(tv2); @@ -8025,7 +8125,7 @@ TEST(NVFuserTest, FusionCacheMultiConsumer_CUDA) { auto aten_output = (aten_input + 1) + 2; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( @@ -8037,7 +8137,7 @@ TEST(NVFuserTest, FusionCacheMultiConsumer_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionSmem_CUDA) { +TEST_F(NVFuserTest, FusionSmem_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -8084,7 +8184,7 @@ TEST(NVFuserTest, FusionSmem_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0, t1}); auto cg_outputs = fe.runFusion({t0, t1}); testValidate( @@ -8093,7 +8193,7 @@ TEST(NVFuserTest, FusionSmem_CUDA) { TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); } -TEST(NVFuserTest, FusionSmemReduce_CUDA) { +TEST_F(NVFuserTest, FusionSmemReduce_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -8133,7 +8233,7 @@ TEST(NVFuserTest, FusionSmemReduce_CUDA) { at::Tensor aten_output = sum(aten_input.to(at::kDouble), {1}); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( @@ -8141,7 +8241,7 @@ TEST(NVFuserTest, FusionSmemReduce_CUDA) { TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); } -TEST(NVFuserTest, FusionSmemBlockGemm_CUDA) { +TEST_F(NVFuserTest, FusionSmemBlockGemm_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -8202,7 +8302,7 @@ TEST(NVFuserTest, FusionSmemBlockGemm_CUDA) { at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble)); FusionExecutor fe; - 
fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0, t1}); auto cg_outputs = fe.runFusion({t0, t1}); testValidate( @@ -8211,7 +8311,7 @@ TEST(NVFuserTest, FusionSmemBlockGemm_CUDA) { TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); } -TEST(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { +TEST_F(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -8291,7 +8391,7 @@ TEST(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( @@ -8300,7 +8400,7 @@ TEST(NVFuserTest, FusionSmemBlockGemmCache_CUDA) { TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); } -TEST(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { +TEST_F(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -8309,7 +8409,7 @@ TEST(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { TensorView* max_val = reductionOp( BinaryOpType::Max, {-1}, - new Double(std::numeric_limits::lowest()), + IrBuilder::create(std::numeric_limits::lowest()), x); // (M) TensorView* bcast_max = broadcast(max_val, {false, true}); // (M, B) TensorView* x_max_sub = sub(x, bcast_max); // (M, N) @@ -8336,7 +8436,7 @@ TEST(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { bcast_sum, softmax}); - auto tidx = new Int(); + auto tidx = IrBuilder::create(); fusion.addInput(tidx); for (auto tensor : all_tensors) { @@ -8363,7 +8463,7 @@ TEST(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { auto aten_output = at::_softmax(aten_input.to(at::kDouble), -1, false); torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input, 128}); auto cg_outputs = fe.runFusion({aten_input, 128}); testValidate( @@ -8375,7 +8475,7 @@ TEST(NVFuserTest, FusionSmemDynamicPersistentSoftmax2D_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) { +TEST_F(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -8401,7 +8501,7 @@ TEST(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) { auto lparams = reduction_params.value().lparams; torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); auto cg_outputs = fe.runFusion({aten_input}, lparams); testValidate( @@ -8415,7 +8515,7 @@ TEST(NVFuserTest, FusionMagicSchedulerSoftmax_CUDA) { lparams); } -TEST(NVFuserTest, TestMaskSoftmax_CUDA) { +TEST_F(NVFuserTest, FusionTestMaskSoftmax_CUDA) { // This test is testing the usage of all padding tokens // with softmax like Bert might might use in a full padding // sequence. 
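
The hunks above repeat the same three mechanical migrations: gtest fixtures (TEST_F(NVFuserTest, ...) in place of TEST), scalar IR nodes built through IrBuilder::create rather than bare new, and FusionExecutor::compileFusion receiving the runtime inputs (plus the LaunchParams whenever a scheduler produced them) before runFusion. A minimal sketch of one migrated test follows, assuming the NVFuser test harness used throughout this file; the test name is hypothetical and the <Double> template argument on IrBuilder::create is an assumption, not copied from this diff.

// Hypothetical example; mirrors the migration applied throughout this file.
TEST_F(NVFuserTest, FusionAddOne_CUDA) { // was: TEST(NVFuserTest, ...)
  Fusion fusion;
  FusionGuard fg(&fusion);

  TensorView* tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  // was: add(tv0, new Double(1.0))
  TensorView* tv1 = add(tv0, IrBuilder::create<Double>(1.0));
  fusion.addOutput(tv1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({8, 8}, options);

  FusionExecutor fe;
  // was: fe.compileFusion(&fusion); inputs (and LaunchParams, when a
  // scheduler produced them) are now supplied at compile time.
  fe.compileFusion(&fusion, {t0});
  auto cg_outputs = fe.runFusion({t0});

  testValidate(&fusion, cg_outputs, {t0}, {t0 + 1.0}, __LINE__, __FILE__);
}

Passing the concrete inputs to compileFusion presumably lets the executor see shapes and launch bounds while generating the kernel rather than only at run time, which is why several hunks also move the FusionExecutor construction below the at::randn input setup.
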
@@ -8456,7 +8556,7 @@ TEST(NVFuserTest, TestMaskSoftmax_CUDA) { auto lparams = reduction_params.value().lparams; torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input, aten_mask}, lparams); auto cg_outputs = fe.runFusion({aten_input, aten_mask}, lparams); testValidate( @@ -8470,7 +8570,7 @@ TEST(NVFuserTest, TestMaskSoftmax_CUDA) { lparams); } -TEST(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) { +TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -8558,13 +8658,13 @@ TEST(NVFuserTest, FusionMagicSchedulerLayerNormBackward_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { +TEST_F(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); const float kEps = 1e-5; - Double* eps_ptr = new Double(kEps); + Double* eps_ptr = IrBuilder::create(kEps); std::vector input_shape{20, 100, 35, 67}; std::vector norm_shape{67}; @@ -8594,7 +8694,7 @@ TEST(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { auto lparams = reduction_params.value().lparams; torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); auto cg_outputs = fe.runFusion({aten_input}, lparams); testValidate( @@ -8610,8 +8710,9 @@ TEST(NVFuserTest, FusionMagicSchedulerLayerNormalization_CUDA) { lparams); } -TEST(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { +TEST_F(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) { + if (!deviceMajorMinorCheck(7)) { + GTEST_SKIP() << "skipping tests on pre-Volta GPUs"; return; } auto fusion = std::make_unique(); @@ -8633,8 +8734,8 @@ TEST(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) { fusion->addInput(running_mean); fusion->addInput(running_var); - Double* momentum = new Double(kMomentum); - Double* eps = new Double(kEps); + Double* momentum = IrBuilder::create(kMomentum); + Double* eps = IrBuilder::create(kEps); auto result = batch_norm( input, weight, bias, running_mean, running_var, kTraining, momentum, eps); @@ -8681,7 +8782,7 @@ TEST(NVFuserTest, FusionMagicSchedulerBatchNormalization_CUDA) { ""); } -TEST(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) { +TEST_F(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -8697,12 +8798,12 @@ TEST(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) { TensorView* max_sx = reductionOp( BinaryOpType::Max, {-1}, - new Double(std::numeric_limits::lowest()), + IrBuilder::create(std::numeric_limits::lowest()), sx); // (M) TensorView* max_dx = reductionOp( BinaryOpType::Max, {-1}, - new Double(std::numeric_limits::lowest()), + IrBuilder::create(std::numeric_limits::lowest()), dx); // (M) // Reduction => merge local and shared memory TensorViews @@ -8804,7 +8905,7 @@ TEST(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) { aten_output.narrow(1, static_size, dimy - static_size); torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_static_in, aten_dynamic_in}); fe.runFusion( {aten_static_in, aten_dynamic_in}, {cg_static_out, cg_dynamic_out}); @@ -8817,7 +8918,7 @@ TEST(NVFuserTest, FusionPersistentSoftmaxLocalSmem_CUDA) { __FILE__); } -TEST(NVFuserTest, 
FusionPersistentNormLocalShared_CUDA) { +TEST_F(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -8830,10 +8931,10 @@ TEST(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { fusion.addInput(sx); fusion.addInput(dx); - Double* gamma = new Double(); - Double* beta = new Double(); - Double* eps = new Double(); - Int* N = new Int(); + Double* gamma = IrBuilder::create(); + Double* beta = IrBuilder::create(); + Double* eps = IrBuilder::create(); + Int* N = IrBuilder::create(); fusion.addInput(gamma); fusion.addInput(beta); fusion.addInput(eps); @@ -8982,7 +9083,7 @@ TEST(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { aten_static_in, aten_dynamic_in, kGamma, kBeta, kEps, dimy}; torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, {cg_static_out, cg_dynamic_out}); auto at_mu = at::mean(aten_input.to(at::kDouble), -1).unsqueeze(1); @@ -9003,16 +9104,16 @@ TEST(NVFuserTest, FusionPersistentNormLocalShared_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { +TEST_F(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views auto x = makeSymbolicTensor(2); - Double* gamma = new Double(); - Double* beta = new Double(); - Double* eps = new Double(); - Int* N = new Int(); + Double* gamma = IrBuilder::create(); + Double* beta = IrBuilder::create(); + Double* eps = IrBuilder::create(); + Int* N = IrBuilder::create(); fusion.addInput(x); fusion.addInput(gamma); fusion.addInput(beta); @@ -9062,7 +9163,7 @@ TEST(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { norm_gamma, norm_gamma_beta}); - auto tidx = new Int(); + auto tidx = IrBuilder::create(); fusion.addInput(tidx); for (auto tensor : all_tensors) { @@ -9105,20 +9206,21 @@ TEST(NVFuserTest, FusionSmemDynamicPersistentNorm_CUDA) { aten_input, kGamma, kBeta, kEps, dimy, TIDX}; torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { +TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addInput(tv0); fusion.addOutput(tv1); // tv1[I0, R1] = tv0[I0, I1] @@ -9150,7 +9252,7 @@ TEST(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { LaunchParams lparams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); auto cg_outputs = fe.runFusion({aten_input}, lparams); testValidate( @@ -9165,12 +9267,12 @@ TEST(NVFuserTest, FusionSmemDynamicReductionSymbolic_CUDA) { TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 0); } -TEST(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { +TEST_F(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Algorithm - Int* sym_bsx = new Int(); + Int* sym_bsx = IrBuilder::create(); TensorView* tv0 = makeSymbolicTensor(3); // M, K, N fusion.addInput(tv0); fusion.addInput(sym_bsx); @@ -9213,7 +9315,7 @@ 
TEST(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input, runtime_threadIdx_dim}, lparams); auto cg_outputs = fe.runFusion({aten_input, runtime_threadIdx_dim}, lparams); testValidate( @@ -9229,11 +9331,11 @@ TEST(NVFuserTest, FusionSmemDynamicReductionSymbolicArg_CUDA) { TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); } -TEST(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { +TEST_F(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - Int* sym_bsx = new Int(); + Int* sym_bsx = IrBuilder::create(); TensorView* tv0 = makeSymbolicTensor(2); // (M, K) TensorView* tv1 = makeSymbolicTensor(2); // (K, N) TensorView* tv2 = broadcast(tv0, {false, false, true}); // (M, K, B) @@ -9278,7 +9380,7 @@ TEST(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { LaunchParams lparams(-1, -1, -1, BSX, -1, -1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto cg_outputs = fe.runFusion(aten_inputs, lparams); testValidate( @@ -9294,14 +9396,16 @@ TEST(NVFuserTest, FusionSmemDynamicPwiseMulSymbolicArgWAR_CUDA) { TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); } -TEST(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { +TEST_F(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Symbolic integers we will use for runtime tiling - Int* symbolic_m_tile_dim = new Int(); // bound to threadIdx.z - Int* symbolic_split_k_tile_dim = new Int(); // bound to blockIdx.x - Int* symbolic_block_k_tile_dim = new Int(); // bound to threadIdx.x + Int* symbolic_m_tile_dim = IrBuilder::create(); // bound to threadIdx.z + Int* symbolic_split_k_tile_dim = + IrBuilder::create(); // bound to blockIdx.x + Int* symbolic_block_k_tile_dim = + IrBuilder::create(); // bound to threadIdx.x // Compile-time integer for tiling int n_smem_tile = 8; // bound to threadIdx.y @@ -9397,10 +9501,6 @@ TEST(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { at::Tensor t0 = at::randn({M, K}, options); at::Tensor t1 = at::randn({K, N}, options); - FusionExecutor fe; - // Generate CUDA and compile with nvRTC - fe.compileFusion(&fusion); - // Runtime tiling int m_tile = 4; // bound to threadIdx.z int split_k = 7; // bound to blockIdx.x @@ -9410,6 +9510,9 @@ TEST(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { at::Tensor aten_output = mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); + FusionExecutor fe; + // Generate CUDA and compile with nvRTC + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( @@ -9418,13 +9521,14 @@ TEST(NVFuserTest, FusionSmemDynamicTiledGemm_CUDA) { TORCH_CHECK(fe.kernel()->summary().war_hazard_syncs_count == 1); } -TEST(NVFuserTest, FusionGlobalIntermediate_CUDA) { +TEST_F(NVFuserTest, FusionGlobalIntermediate_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeSymbolicTensor(2); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); fusion.addInput(tv0); fusion.addOutput(tv1); // tv1[I0, R1] = tv0[I0, I1] @@ -9455,7 +9559,7 @@ TEST(NVFuserTest, FusionGlobalIntermediate_CUDA) { auto lparams = LaunchParams(-1, -1, -1, runtime_threadIdx_dim, -1, -1); FusionExecutor fe; - 
fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}, lparams); auto cg_outputs = fe.runFusion({input}, lparams); auto aten_output = input.to(at::kDouble).sum({1}); @@ -9470,7 +9574,7 @@ TEST(NVFuserTest, FusionGlobalIntermediate_CUDA) { lparams); } -TEST(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { +TEST_F(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -9504,18 +9608,18 @@ TEST(NVFuserTest, FusionGlobalIntermediateDefaultSchedule_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0, t1, t2, t3}); auto cg_outputs = fe.runFusion({t0, t1, t2, t3}); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionConstCheck_CUDA) { +TEST_F(NVFuserTest, FusionConstCheck_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - auto one = new Int(1); + auto one = IrBuilder::create(1); TORCH_CHECK(one->isConstScalar()); auto one_x2 = mul(one, one); @@ -9528,7 +9632,7 @@ TEST(NVFuserTest, FusionConstCheck_CUDA) { TORCH_CHECK(one_x4->isConstScalar()); } -TEST(NVFuserTest, FusionUnrollWithAlloc_CUDA) { +TEST_F(NVFuserTest, FusionUnrollWithAlloc_CUDA) { const std::vector tensor_dims_in = {128, 128}; Fusion fusion; FusionGuard fg(&fusion); @@ -9537,8 +9641,9 @@ TEST(NVFuserTest, FusionUnrollWithAlloc_CUDA) { TensorView* tv0 = makeSymbolicTensor(tensor_dims_in.size()); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(0)); - TensorView* tv2 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv1); + TensorView* tv1 = add(tv0, IrBuilder::create(0)); + TensorView* tv2 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv1); fusion.addOutput(tv2); const auto options = @@ -9562,7 +9667,7 @@ TEST(NVFuserTest, FusionUnrollWithAlloc_CUDA) { tv1->computeAt(tv2_rf, -1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); auto aten_output = (input + 0).to(at::kDouble).sum(1); @@ -9571,12 +9676,12 @@ TEST(NVFuserTest, FusionUnrollWithAlloc_CUDA) { } // Test isZeroInt -TEST(NVFuserTest, FusionIsZeroInt_CUDA) { +TEST_F(NVFuserTest, FusionIsZeroInt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - Int* x = new Int(0); - Int* y = new Int(1); + Int* x = IrBuilder::create(0); + Int* y = IrBuilder::create(1); Val* z = mul(x, y); TORCH_CHECK(x->isZeroInt()); TORCH_CHECK(!y->isZeroInt()); @@ -9584,12 +9689,12 @@ TEST(NVFuserTest, FusionIsZeroInt_CUDA) { } // Test isOneInt -TEST(NVFuserTest, FusionIsOneInt_CUDA) { +TEST_F(NVFuserTest, FusionIsOneInt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); - Int* x = new Int(1); - Int* y = new Int(1); + Int* x = IrBuilder::create(1); + Int* y = IrBuilder::create(1); Val* z = mul(x, y); TORCH_CHECK(x->isOneInt()); TORCH_CHECK(y->isOneInt()); @@ -9599,7 +9704,7 @@ TEST(NVFuserTest, FusionIsOneInt_CUDA) { // This is to verify no cycle of computeAt is created. A more complex // variation of this pattern appears in one of the Python tests // (test_random_topo). 
-TEST(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -9607,12 +9712,12 @@ TEST(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { fusion.addInput(tv0); // Common intermediate tensor - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); // tv1 -> tv2 - auto tv2 = add(tv1, new Double(2)); + auto tv2 = add(tv1, IrBuilder::create(2)); // tv1 -> tv3 -> tv4 - auto tv3 = add(tv1, new Double(3)); - auto tv4 = add(tv3, new Double(4)); + auto tv3 = add(tv1, IrBuilder::create(3)); + auto tv4 = add(tv3, IrBuilder::create(4)); // NOTE: This should no longer occur as of PR #201. // The order of adding outputs matters. If tv3 is added before tv4, @@ -9639,7 +9744,7 @@ TEST(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { auto t4 = t3 + 4; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); std::vector aten_outputs = {t2, t4, t3}; @@ -9647,7 +9752,7 @@ TEST(NVFuserTest, FusionComputeAtNonterminatingOutput_CUDA) { &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTraversalOrder1_CUDA) { +TEST_F(NVFuserTest, FusionTraversalOrder1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -9655,10 +9760,10 @@ TEST(NVFuserTest, FusionTraversalOrder1_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv0, new Double(2)); - TensorView* tv3 = add(tv1, new Double(3)); - TensorView* tv4 = add(tv1, new Double(4)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv0, IrBuilder::create(2)); + TensorView* tv3 = add(tv1, IrBuilder::create(3)); + TensorView* tv4 = add(tv1, IrBuilder::create(4)); fusion.addOutput(tv2); fusion.addOutput(tv3); @@ -9666,9 +9771,6 @@ TEST(NVFuserTest, FusionTraversalOrder1_CUDA) { tv1->computeAt(tv3, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({10, 10}, options); @@ -9684,12 +9786,14 @@ TEST(NVFuserTest, FusionTraversalOrder1_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTraversalOrder2_CUDA) { +TEST_F(NVFuserTest, FusionTraversalOrder2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -9697,11 +9801,11 @@ TEST(NVFuserTest, FusionTraversalOrder2_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(2)); - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); + TensorView* tv3 = add(tv0, IrBuilder::create(3)); + TensorView* tv4 = add(tv3, IrBuilder::create(4)); TensorView* tv5 = add(tv1, tv3); @@ -9712,9 +9816,6 @@ TEST(NVFuserTest, FusionTraversalOrder2_CUDA) { tv1->computeAt(tv5, -1); tv3->computeAt(tv5, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({10, 10}, options); @@ 
-9731,13 +9832,15 @@ TEST(NVFuserTest, FusionTraversalOrder2_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTraversalOrder3_CUDA) { +TEST_F(NVFuserTest, FusionTraversalOrder3_CUDA) { for (const auto i : c10::irange(2)) { Fusion fusion; FusionGuard fg(&fusion); @@ -9745,11 +9848,11 @@ TEST(NVFuserTest, FusionTraversalOrder3_CUDA) { TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(2)); - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); + TensorView* tv3 = add(tv0, IrBuilder::create(3)); + TensorView* tv4 = add(tv3, IrBuilder::create(4)); TensorView* tv5 = add(tv1, tv3); @@ -9774,9 +9877,6 @@ TEST(NVFuserTest, FusionTraversalOrder3_CUDA) { compute_at_outer->computeAt(tv5, -2); compute_at_inner->computeAt(tv5, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, options); auto t1 = aten_input + 1; @@ -9792,6 +9892,8 @@ TEST(NVFuserTest, FusionTraversalOrder3_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( @@ -9799,25 +9901,25 @@ TEST(NVFuserTest, FusionTraversalOrder3_CUDA) { } } -TEST(NVFuserTest, FusionTraversalOrder4_CUDA) { +TEST_F(NVFuserTest, FusionTraversalOrder4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // First tree TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv1, new Double(3)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(2)); + TensorView* tv3 = add(tv1, IrBuilder::create(3)); fusion.addOutput(tv2); fusion.addOutput(tv3); // Second tree TensorView* tv4 = makeSymbolicTensor(1); fusion.addInput(tv4); - TensorView* tv5 = add(tv4, new Double(5)); - TensorView* tv6 = add(tv5, new Double(6)); - TensorView* tv7 = add(tv5, new Double(7)); + TensorView* tv5 = add(tv4, IrBuilder::create(5)); + TensorView* tv6 = add(tv5, IrBuilder::create(6)); + TensorView* tv7 = add(tv5, IrBuilder::create(7)); fusion.addOutput(tv6); fusion.addOutput(tv7); @@ -9844,23 +9946,23 @@ TEST(NVFuserTest, FusionTraversalOrder4_CUDA) { at::empty_like(t0, options)}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); fe.runFusion(aten_inputs, cg_outputs); testValidate( &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTraversalOrder5_CUDA) { +TEST_F(NVFuserTest, FusionTraversalOrder5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(2)); + TensorView* tv3 = 
add(tv0, IrBuilder::create(3)); + TensorView* tv4 = add(tv3, IrBuilder::create(4)); TensorView* tv5 = add(tv2, tv4); fusion.addOutput(tv1); @@ -9870,9 +9972,6 @@ TEST(NVFuserTest, FusionTraversalOrder5_CUDA) { tv2->computeAt(tv5, -1); tv4->computeAt(tv5, -1); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, options); std::vector cg_outputs = { @@ -9880,6 +9979,8 @@ TEST(NVFuserTest, FusionTraversalOrder5_CUDA) { at::empty_like(aten_input, options), at::empty_like(aten_input, options)}; + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); auto t1 = aten_input + 1; @@ -9894,16 +9995,16 @@ TEST(NVFuserTest, FusionTraversalOrder5_CUDA) { &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTraversalOrder6_CUDA) { +TEST_F(NVFuserTest, FusionTraversalOrder6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv0, new Double(2)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv0, IrBuilder::create(2)); TensorView* tv3 = add(tv1, tv2); - TensorView* tv4 = add(tv3, new Double(4)); + TensorView* tv4 = add(tv3, IrBuilder::create(4)); fusion.addOutput(tv4); @@ -9916,9 +10017,6 @@ TEST(NVFuserTest, FusionTraversalOrder6_CUDA) { tv1->computeAt(tv3, -1); tv2->computeAt(tv3, -2); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, options); @@ -9929,22 +10027,24 @@ TEST(NVFuserTest, FusionTraversalOrder6_CUDA) { at::Tensor cg_output = at::empty_like(aten_input, options); + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, {cg_output}); testValidate( &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTraversalOrder7_CUDA) { +TEST_F(NVFuserTest, FusionTraversalOrder7_CUDA) { Fusion fusion; FusionGuard fg(&fusion); TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(2)); - TensorView* tv3 = add(tv0, new Double(3)); - TensorView* tv4 = add(tv3, new Double(4)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(2)); + TensorView* tv3 = add(tv0, IrBuilder::create(3)); + TensorView* tv4 = add(tv3, IrBuilder::create(4)); TensorView* tv5 = add(tv2, tv4); fusion.addOutput(tv5); @@ -9963,9 +10063,6 @@ TEST(NVFuserTest, FusionTraversalOrder7_CUDA) { tv2->computeAt(tv5, -4); tv4->computeAt(tv5, -3); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({100}, options); @@ -9976,6 +10073,9 @@ TEST(NVFuserTest, FusionTraversalOrder7_CUDA) { auto aten_output = t2 + t4; at::Tensor cg_output = at::empty_like(aten_input, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, {cg_output}); testValidate( @@ -9983,7 +10083,7 @@ TEST(NVFuserTest, FusionTraversalOrder7_CUDA) { } // Test predication of grid reduction -TEST(NVFuserTest, FusionThreadPredicate_CUDA) { +TEST_F(NVFuserTest, FusionThreadPredicate_CUDA) { const int gdimx = 4; const int bdimx = 128; @@ -9993,9 
+10093,10 @@ TEST(NVFuserTest, FusionThreadPredicate_CUDA) { TensorView* tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv0); TensorView* tv2 = unaryOp(UnaryOpType::Neg, tv1); - TensorView* tv3 = add(tv0, new Double(2)); + TensorView* tv3 = add(tv0, IrBuilder::create(2)); fusion.addOutput(tv3); fusion.addOutput(tv2); @@ -10036,14 +10137,14 @@ TEST(NVFuserTest, FusionThreadPredicate_CUDA) { at::empty_like(aten_input, options), at::empty({numel_x}, options)}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, cg_outputs); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionLSTMCell_CUDA) { +TEST_F(NVFuserTest, FusionLSTMCell_CUDA) { const int hidden_features = 512; const int batch_size = 64; @@ -10116,14 +10217,14 @@ TEST(NVFuserTest, FusionLSTMCell_CUDA) { auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto cg_outputs = fe.runFusion(aten_inputs, lparams); testValidate( &fusion, cg_outputs, aten_inputs, {at_cy, at_hy}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionComputeAtMultiBCast_CUDA) { +TEST_F(NVFuserTest, FusionComputeAtMultiBCast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -10131,7 +10232,7 @@ TEST(NVFuserTest, FusionComputeAtMultiBCast_CUDA) { TensorView* tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - TensorView* tv1 = mul(tv0, new Double(0.5)); + TensorView* tv1 = mul(tv0, IrBuilder::create(0.5)); TensorView* tv2 = broadcast(tv1, {true, false}); TensorView* tv3 = broadcast(tv1, {false, true}); TensorView* tv4 = add(tv2, tv3); @@ -10142,7 +10243,7 @@ TEST(NVFuserTest, FusionComputeAtMultiBCast_CUDA) { ASSERT_ANY_THROW(tv1->computeAt(tv3, -1)); } -TEST(NVFuserTest, FusionReductionHalf_CUDA) { +TEST_F(NVFuserTest, FusionReductionHalf_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -10151,7 +10252,7 @@ TEST(NVFuserTest, FusionReductionHalf_CUDA) { fusion.addInput(tv0); auto tv1 = castOp(DataType::Float, tv0); - auto tv2 = add(tv1, new Double(1.0)); + auto tv2 = add(tv1, IrBuilder::create(1.0)); auto tv3 = sum(tv2, {2}); auto tv4 = castOp(DataType::Half, tv3); @@ -10172,7 +10273,7 @@ TEST(NVFuserTest, FusionReductionHalf_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); // no broadcasting needed, omitting the last optional argument; auto cg_outputs = fe.runFusion({aten_input}, lparams); @@ -10189,7 +10290,7 @@ TEST(NVFuserTest, FusionReductionHalf_CUDA) { lparams); } -TEST(NVFuserTest, FusionReduceSingle_CUDA) { +TEST_F(NVFuserTest, FusionReduceSingle_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -10205,7 +10306,7 @@ TEST(NVFuserTest, FusionReduceSingle_CUDA) { // Grab only tensor views, though there shouldn't be any other type FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); // no broadcasting needed, omitting the last optional argument; auto cg_outputs = fe.runFusion({aten_input}); @@ -10214,7 +10315,7 @@ TEST(NVFuserTest, FusionReduceSingle_CUDA) { &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) { +TEST_F(NVFuserTest, 
FusionReduceImplicitBroadcast_CUDA) { constexpr int bid_x = 80; constexpr int tid_x = 4096; constexpr int red_dim = 1; @@ -10226,8 +10327,8 @@ TEST(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) { TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1}); fusion.addInput(tv0); - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim, 2}, new Double(0), tv0); + TensorView* tv1 = reductionOp( + BinaryOpType::Add, {red_dim, 2}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); const auto options = @@ -10241,7 +10342,7 @@ TEST(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); // no broadcasting needed, omitting the last optional argument; auto cg_outputs = fe.runFusion({aten_input}, lparams); auto aten_output = aten_input.to(at::kDouble).sum({red_dim, 2}); @@ -10257,7 +10358,7 @@ TEST(NVFuserTest, FusionReduceImplicitBroadcast_CUDA) { lparams); } -TEST(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) { +TEST_F(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) { constexpr int bid_x = 80; constexpr int tid_x = 4096; constexpr int red_dim = 1; @@ -10269,10 +10370,11 @@ TEST(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) { TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1}); fusion.addInput(tv0); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {2}, IrBuilder::create(0), tv0); - TensorView* tv2 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv1); + TensorView* tv2 = reductionOp( + BinaryOpType::Add, {red_dim}, IrBuilder::create(0), tv1); fusion.addOutput(tv2); const auto options = @@ -10287,7 +10389,7 @@ TEST(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); // no broadcasting needed, omitting the last optional argument; auto cg_outputs = fe.runFusion({aten_input}, lparams); auto aten_output = aten_input.to(at::kDouble).sum({1, 2}); @@ -10303,7 +10405,7 @@ TEST(NVFuserTest, FusionReduceImplicitBroadcast2_CUDA) { lparams); } -TEST(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) { +TEST_F(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) { constexpr int bid_x = 80; constexpr int tid_x = 4096; constexpr int red_dim = 1; @@ -10315,10 +10417,11 @@ TEST(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) { TensorView* tv0 = makeConcreteTensor({bid_x, tid_x, 1}); fusion.addInput(tv0); - TensorView* tv1 = - reductionOp(BinaryOpType::Add, {red_dim}, new Double(0), tv0); + TensorView* tv1 = reductionOp( + BinaryOpType::Add, {red_dim}, IrBuilder::create(0), tv0); - TensorView* tv2 = reductionOp(BinaryOpType::Add, {1}, new Double(0), tv1); + TensorView* tv2 = + reductionOp(BinaryOpType::Add, {1}, IrBuilder::create(0), tv1); fusion.addOutput(tv2); const auto options = @@ -10332,7 +10435,7 @@ TEST(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}, lparams); // no broadcasting needed, omitting the last optional argument; auto cg_outputs = fe.runFusion({aten_input}, lparams); auto aten_output = aten_input.to(at::kDouble).sum({2, 1}); @@ -10348,24 +10451,27 @@ TEST(NVFuserTest, FusionReduceImplicitBroadcast3_CUDA) { lparams); } -TEST(NVFuserTest, FusionTrivialReduction_CUDA) 
{ +TEST_F(NVFuserTest, FusionTrivialReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Set up your input tensor views TensorView* tv0 = makeConcreteTensor({10, 20, 1}); fusion.addInput(tv0); - TensorView* tv1 = reductionOp(BinaryOpType::Add, {2}, new Double(0), tv0); + TensorView* tv1 = + reductionOp(BinaryOpType::Add, {2}, IrBuilder::create(0), tv0); fusion.addOutput(tv1); - TORCH_CHECK(!fusion.hasReduction(), "Trivial reduction picked up by fusion"); + TORCH_CHECK( + ir_utils::getReductionOps(&fusion).empty(), + "Trivial reduction picked up by fusion"); const auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor aten_input = at::randn({10, 20, 1}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); auto aten_output = aten_input.to(at::kDouble).sum({2}); @@ -10373,7 +10479,7 @@ TEST(NVFuserTest, FusionTrivialReduction_CUDA) { &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTrivialReduction2_CUDA) { +TEST_F(NVFuserTest, FusionTrivialReduction2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -10400,14 +10506,14 @@ TEST(NVFuserTest, FusionTrivialReduction2_CUDA) { auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto cg_outputs = fe.runFusion(aten_inputs, lparams); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTrivialReduction3_CUDA) { +TEST_F(NVFuserTest, FusionTrivialReduction3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -10433,7 +10539,7 @@ TEST(NVFuserTest, FusionTrivialReduction3_CUDA) { auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto cg_outputs = fe.runFusion(aten_inputs, lparams); testValidate( @@ -10442,7 +10548,7 @@ TEST(NVFuserTest, FusionTrivialReduction3_CUDA) { // Make sure trivial reductions are correctly detected even with // scheduling applied. -TEST(NVFuserTest, FusionDetectTrivialReduction1_CUDA) { +TEST_F(NVFuserTest, FusionDetectTrivialReduction1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -10459,8 +10565,8 @@ TEST(NVFuserTest, FusionDetectTrivialReduction1_CUDA) { auto tv4 = tv2->rFactor({-1}); auto tv5 = broadcast(tv0, {true, false}); - auto tv6 = add(tv5, new Double(1)); - auto tv7 = sub(tv6, new Double(1)); + auto tv6 = add(tv5, IrBuilder::create(1)); + auto tv7 = sub(tv6, IrBuilder::create(1)); auto tv8 = sum(tv7, {0}); fusion.addOutput(tv8); @@ -10483,10 +10589,10 @@ TEST(NVFuserTest, FusionDetectTrivialReduction1_CUDA) { GpuLower gpulw(&fusion); - // No kir::ReductionOp should be generated as all the reduction + // No ReductionOp should be generated as all the reduction // exprs should be replaced with a unary set op. 
- for (const auto& kir_node : gpulw.kernel()->irNodes()) { - TORCH_CHECK(!kir_node->isA()); + for (const auto expr : gpulw.kernel()->as()->exprs()) { + TORCH_CHECK(!expr->isA()); } auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -10494,7 +10600,7 @@ TEST(NVFuserTest, FusionDetectTrivialReduction1_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( @@ -10502,14 +10608,14 @@ TEST(NVFuserTest, FusionDetectTrivialReduction1_CUDA) { } // Test detection of partially trivial reduction -TEST(NVFuserTest, FusionDetectTrivialReduction2_CUDA) { +TEST_F(NVFuserTest, FusionDetectTrivialReduction2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = sum(tv0, {1}); - auto tv2 = add(tv1, new Double(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); tv1->split(1, 1); @@ -10525,17 +10631,17 @@ TEST(NVFuserTest, FusionDetectTrivialReduction2_CUDA) { GpuLower gpulw(&fusion); // tv3's reduction axis is a trivial reduction. The only - // kir::ReductionOp should be for tv1. - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (kir_node->isA()) { + // ReductionOp should be for tv1. + for (const auto expr : gpulw.kernel()->as()->exprs()) { + if (expr->isA()) { auto reduction_out = - kir_node->as()->outputs()[0]->as(); - TORCH_CHECK(reduction_out->fuserTv() == tv1); + expr->as()->outputs()[0]->as(); + TORCH_CHECK(reduction_out->name() == 1); } } } -TEST(NVFuserTest, FusionInputsIdLookup_CUDA) { +TEST_F(NVFuserTest, FusionInputsIdLookup_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({16, 8, 8}, options); at::Tensor t1 = at::randn({8, 8}, options); @@ -10573,7 +10679,7 @@ TEST(NVFuserTest, FusionInputsIdLookup_CUDA) { TORCH_CHECK(id_1_relook.eviction == false); } -TEST(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) { +TEST_F(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) { std::vector sizes_vec({16, 8, 8}); std::vector strides_vec({64, 8, 1}); auto tensor_type = TensorType::create( @@ -10610,7 +10716,7 @@ TEST(NVFuserTest, FusionGroupGuardSimpleTensor_CUDA) { TORCH_CHECK(complyWith(t6, TensorType::create(t6))); } -TEST(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) { +TEST_F(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) { std::vector sizes_vec({16, 1, 8}); std::vector strides_vec({8, 8, 1}); auto tensor_type = TensorType::create( @@ -10634,7 +10740,7 @@ TEST(NVFuserTest, FusionGroupGuardBroadcastTensor_CUDA) { TORCH_CHECK(complyWith(t3, tensor_type)); } -TEST(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) { +TEST_F(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) { std::vector sizes_vec({16, 8, 8}); std::vector strides_vec({64, 1, 8}); auto tensor_type = TensorType::create( @@ -10650,7 +10756,7 @@ TEST(NVFuserTest, FusionGroupGuardPermutedTensor_CUDA) { TORCH_CHECK(complyWith(t1, tensor_type)); } -TEST(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) { +TEST_F(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) { std::vector sizes_vec({16, 8, 8}); std::vector strides_vec({128, 16, 1}); auto tensor_type = TensorType::create( @@ -10666,7 +10772,7 @@ TEST(NVFuserTest, FusionGroupGuardRelaxedCheck_CUDA) { TORCH_CHECK(complyWith(t1, tensor_type)); } -TEST(NVFuserTest, FusionDisjointSet_CUDA) { +TEST_F(NVFuserTest, FusionDisjointSet_CUDA) { DisjointSet set; const std::set group_x({0, 1, 2}); @@ 
-10779,7 +10885,7 @@ TEST(NVFuserTest, FusionDisjointSet_CUDA) { } } -TEST(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) { +TEST_F(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -10802,7 +10908,7 @@ TEST(NVFuserTest, FusionNonUniqueBroadcastSize_CUDA) { ASSERT_ANY_THROW(tv3->computeAt(tv4, -1)); } -TEST(NVFuserTest, FusionBiasGeluFwd_CUDA) { +TEST_F(NVFuserTest, FusionBiasGeluFwd_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -10819,14 +10925,14 @@ TEST(NVFuserTest, FusionBiasGeluFwd_CUDA) { auto t3 = castOp(DataType::Float, t2); auto t4 = broadcast(t1, {true, true, false}); auto t5 = add(t4, t3); - auto t6 = mul(t5, new Double(0.5)); - auto t7 = mul(t5, new Double(k_079)); - auto t8 = mul(t5, new Double(k_004)); + auto t6 = mul(t5, IrBuilder::create(0.5)); + auto t7 = mul(t5, IrBuilder::create(k_079)); + auto t8 = mul(t5, IrBuilder::create(k_004)); auto t9 = mul(t8, t5); - auto t10 = add(t9, new Int(1)); + auto t10 = add(t9, IrBuilder::create(1)); auto t11 = mul(t7, t10); auto t12 = unaryOp(UnaryOpType::Tanh, t11); - auto t13 = add(t12, new Double(1)); + auto t13 = add(t12, IrBuilder::create(1)); auto t14 = mul(t6, t13); auto t15 = castOp(DataType::Half, t14); fusion.addOutput(t15); @@ -10849,15 +10955,14 @@ TEST(NVFuserTest, FusionBiasGeluFwd_CUDA) { auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, aten_inputs, lparams); auto cg_outputs = fe.runFusion(aten_inputs, lparams); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBiasGeluBwd_CUDA) { +TEST_F(NVFuserTest, FusionBiasGeluBwd_CUDA) { if (at::cuda::getDeviceProperties(0)->major < 6) { return; } @@ -10882,23 +10987,23 @@ TEST(NVFuserTest, FusionBiasGeluBwd_CUDA) { auto t5 = castOp(DataType::Float, t4); auto t6 = broadcast(t3, {true, true, false}); auto t7 = add(t6, t5); - auto t8 = mul(t7, new Double(k_079)); - auto t9 = mul(t7, new Double(k_004)); + auto t8 = mul(t7, IrBuilder::create(k_079)); + auto t9 = mul(t7, IrBuilder::create(k_004)); auto t10 = mul(t9, t7); - auto t11 = add(t10, new Int(1)); + auto t11 = add(t10, IrBuilder::create(1)); auto t12 = mul(t8, t11); auto t13 = unaryOp(UnaryOpType::Tanh, t12); - auto t14 = mul(t7, new Double(0.5)); + auto t14 = mul(t7, IrBuilder::create(0.5)); auto t15 = mul(t13, t13); auto t16 = unaryOp(UnaryOpType::Neg, t15); - auto t17 = add(t16, new Int(1)); - auto t18 = mul(t7, new Double(k_010)); + auto t17 = add(t16, IrBuilder::create(1)); + auto t18 = mul(t7, IrBuilder::create(k_010)); auto t19 = mul(t18, t7); - auto t20 = add(t19, new Double(k_079)); + auto t20 = add(t19, IrBuilder::create(k_079)); auto t21 = mul(t17, t20); auto t22 = mul(t14, t21); - auto t23 = add(t13, new Int(1)); - auto t24 = mul(t23, new Double(0.5)); + auto t23 = add(t13, IrBuilder::create(1)); + auto t24 = mul(t23, IrBuilder::create(0.5)); auto t25 = add(t22, t24); auto t26 = mul(t25, t1); // Save float output for validation @@ -10929,8 +11034,7 @@ TEST(NVFuserTest, FusionBiasGeluBwd_CUDA) { auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, aten_inputs, lparams); auto cg_outputs = fe.runFusion(aten_inputs, lparams); testValidate( @@ -10938,7 +11042,7 @@ TEST(NVFuserTest, FusionBiasGeluBwd_CUDA) { } // Reproducer of issue #459 -TEST(NVFuserTest, FusionIssue459_CUDA) { +TEST_F(NVFuserTest, FusionIssue459_CUDA) { Fusion fusion; FusionGuard 
fg(&fusion); @@ -10947,14 +11051,14 @@ TEST(NVFuserTest, FusionIssue459_CUDA) { auto tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); auto tv3 = broadcast(tv2, {true, false}); auto tv4 = add(tv1, tv3); // Create two outputs from the final arithmetic result - auto tv5 = add(tv4, new Double(1)); + auto tv5 = add(tv4, IrBuilder::create(1)); fusion.addOutput(tv5); - auto tv6 = add(tv4, new Double(1)); + auto tv6 = add(tv4, IrBuilder::create(1)); fusion.addOutput(tv6); // Scheduling @@ -10981,8 +11085,7 @@ TEST(NVFuserTest, FusionIssue459_CUDA) { std::vector aten_inputs = {t0, t1}; torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( @@ -10994,15 +11097,15 @@ TEST(NVFuserTest, FusionIssue459_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionSmemIndexingSimple_CUDA) { +TEST_F(NVFuserTest, FusionSmemIndexingSimple_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); tv3->axis(0)->parallelize(ParallelType::BIDx); @@ -11013,28 +11116,27 @@ TEST(NVFuserTest, FusionSmemIndexingSimple_CUDA) { tv1->setMemoryType(MemoryType::Shared); tv2->setMemoryType(MemoryType::Global); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto aten_input = at::randn({12, 34}, options); at::Tensor aten_output = aten_input + 1.0 + 1.0 + 1.0; + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSmemIndexing_CUDA) { +TEST_F(NVFuserTest, FusionSmemIndexing_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Symbolic integers we will use for runtime tiling - Int* symbolic_m_tile_dim = new Int(); - Int* symbolic_split_k_tile_dim = new Int(); - Int* symbolic_block_k_tile_dim = new Int(); + Int* symbolic_m_tile_dim = IrBuilder::create(); + Int* symbolic_split_k_tile_dim = IrBuilder::create(); + Int* symbolic_block_k_tile_dim = IrBuilder::create(); // Compile-time integer for tiling int n_smem_tile = 32; @@ -11131,9 +11233,8 @@ TEST(NVFuserTest, FusionSmemIndexing_CUDA) { // A, B, m_tile_dim, split_k, intra_cta_tile std::vector aten_inputs = {t0, t1, 3, 4, 5}; - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( @@ -11141,13 +11242,13 @@ TEST(NVFuserTest, FusionSmemIndexing_CUDA) { } // Reproducer of issue 408 -TEST(NVFuserTest, FusionCacheBeforeReduction_CUDA) { +TEST_F(NVFuserTest, FusionCacheBeforeReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); fusion.addOutput(tv2); @@ -11160,9 +11261,6 @@ TEST(NVFuserTest, FusionCacheBeforeReduction_CUDA) { tv3->axis(-1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int 
numel_x = 100; const int numel_y = 200; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -11172,21 +11270,23 @@ TEST(NVFuserTest, FusionCacheBeforeReduction_CUDA) { auto aten_output = (aten_input + 1).to(at::kDouble).sum({1}); + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); fe.runFusion({aten_input}, {cg_output}); testValidate( &fusion, {cg_output}, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { +TEST_F(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(3); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); - auto tv3 = add(tv2, new Double(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv2); fusion.addOutput(tv3); @@ -11201,9 +11301,6 @@ TEST(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { tv3->axis(-1)->parallelize(ParallelType::TIDx); tv4->axis(-1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int numel_x = 10; const int numel_y = 20; const int numel_z = 30; @@ -11214,20 +11311,22 @@ TEST(NVFuserTest, FusionCacheBeforeReduction2_CUDA) { auto t3 = t2 + 1; std::vector aten_outputs = {t2, t3}; + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue367_CUDA) { +TEST_F(NVFuserTest, FusionIssue367_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Symbolic integers we will use for runtime tiling - Int* symbolic_m_tile_dim = new Int(); - Int* symbolic_split_k_tile_dim = new Int(); - Int* symbolic_block_k_tile_dim = new Int(); + Int* symbolic_m_tile_dim = IrBuilder::create(); + Int* symbolic_split_k_tile_dim = IrBuilder::create(); + Int* symbolic_block_k_tile_dim = IrBuilder::create(); // Compile-time integer for tiling int n_smem_tile = 32; @@ -11320,14 +11419,14 @@ TEST(NVFuserTest, FusionIssue367_CUDA) { mul(t0.unsqueeze(2), t1.unsqueeze(0)).to(at::kDouble).sum(1); torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue468_CUDA) { +TEST_F(NVFuserTest, FusionIssue468_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -11346,15 +11445,15 @@ TEST(NVFuserTest, FusionIssue468_CUDA) { at::Tensor aten_input = at::randn({10, 100}, options); at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}).sum({0}); - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + FusionExecutor fe; + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue363_CUDA) { +TEST_F(NVFuserTest, FusionIssue363_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -11402,21 +11501,21 @@ TEST(NVFuserTest, FusionIssue363_CUDA) { std::vector aten_inputs = {t0, t1}; torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue484_CUDA) { 
+TEST_F(NVFuserTest, FusionIssue484_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); auto tv1 = sum(tv0, {1}); - auto tv2 = add(tv1, new Double(0)); + auto tv2 = add(tv1, IrBuilder::create(0)); fusion.addOutput(tv2); tv1->setMemoryType(MemoryType::Global); @@ -11430,20 +11529,20 @@ TEST(NVFuserTest, FusionIssue484_CUDA) { at::Tensor aten_output = aten_input.to(at::kDouble).sum({1}); torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue329_CUDA) { +TEST_F(NVFuserTest, FusionIssue329_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); fusion.addOutput(tv2); auto tv3 = sum(tv1, {1}); @@ -11460,22 +11559,21 @@ TEST(NVFuserTest, FusionIssue329_CUDA) { std::vector aten_outputs = {t2, t3}; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue382_CUDA) { +TEST_F(NVFuserTest, FusionIssue382_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = broadcast(tv1, {false, false, true}); auto tv3 = makeSymbolicTensor(3); fusion.addInput(tv3); @@ -11492,9 +11590,6 @@ TEST(NVFuserTest, FusionIssue382_CUDA) { tv1->setMemoryType(MemoryType::Global); tv2->setMemoryType(MemoryType::Global); - torch::jit::fuser::cuda::FusionExecutor fe; - fe.compileFusion(&fusion); - const int numel_x = 12; const int numel_y = 34; const int numel_z = 56; @@ -11507,20 +11602,22 @@ TEST(NVFuserTest, FusionIssue382_CUDA) { std::vector aten_inputs = {t0, t3}; auto aten_output = (t0 + 1).unsqueeze(-1) + t3; + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); testValidate( &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue507_CUDA) { +TEST_F(NVFuserTest, FusionIssue507_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); tv1->setMemoryType(MemoryType::Shared); @@ -11538,22 +11635,21 @@ TEST(NVFuserTest, FusionIssue507_CUDA) { auto aten_output = (t1 + 1); FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); testValidate( &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue532_CUDA) { +TEST_F(NVFuserTest, FusionIssue532_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Algorithm TensorView* tv0 = makeSymbolicTensor(1); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(1)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(1)); fusion.addInput(tv0); fusion.addOutput(tv2); @@ -11579,7 +11675,7 @@ 
TEST(NVFuserTest, FusionIssue532_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0 + 1 + 1; @@ -11588,14 +11684,14 @@ TEST(NVFuserTest, FusionIssue532_CUDA) { &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionLoopUnswitch_CUDA) { +TEST_F(NVFuserTest, FusionLoopUnswitch_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // Algorithm TensorView* tv0 = makeSymbolicTensor(1); - TensorView* tv1 = add(tv0, new Double(1)); - TensorView* tv2 = add(tv1, new Double(1)); + TensorView* tv1 = add(tv0, IrBuilder::create(1)); + TensorView* tv2 = add(tv1, IrBuilder::create(1)); fusion.addInput(tv0); fusion.addOutput(tv2); @@ -11612,7 +11708,7 @@ TEST(NVFuserTest, FusionLoopUnswitch_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0 + 1 + 1; @@ -11621,7 +11717,7 @@ TEST(NVFuserTest, FusionLoopUnswitch_CUDA) { &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue549_CUDA) { +TEST_F(NVFuserTest, FusionIssue549_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -11631,7 +11727,7 @@ TEST(NVFuserTest, FusionIssue549_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); TensorView* tv3 = broadcast(tv2, {false, false, true}); // tv3[I0, I1, B] = tv0[I0, I1] @@ -11689,10 +11785,12 @@ TEST(NVFuserTest, FusionIssue549_CUDA) { at::Tensor t0 = at::randn({M, K}, options); at::Tensor t1 = at::randn({K, N}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + LaunchParams lparams(1, -1, -1, 32, 4, 4); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}, lparams); + fe.runFusion({t0, t1}, lparams); // Make sure bad launch params throws // TODO: Re-enable once we have parallelization validation in. 
@@ -11707,7 +11805,7 @@ TEST(NVFuserTest, FusionIssue549_CUDA) { &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, simplecompileRtc_CUDA) { +TEST_F(NVFuserTest, FusionSimpleCompileRtc_CUDA) { FusionExecutor fe; std::string kernel = R"( __global__ void kernel1(Tensor T0, Tensor T1) { @@ -11739,7 +11837,7 @@ __global__ void kernel1(Tensor T0, Tensor T1) { TORCH_CHECK(out_ref.allclose(out0)); } -TEST(NVFuserTest, FusionSerialWelford_CUDA) { +TEST_F(NVFuserTest, FusionSerialWelford_CUDA) { FusionExecutor fe; int x = 128, y = 64, z = 64; @@ -11796,7 +11894,7 @@ __global__ void kernel1( TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } -TEST(NVFuserTest, FusionBlockWelford_CUDA) { +TEST_F(NVFuserTest, FusionBlockWelford_CUDA) { FusionExecutor fe; int x = 7, y = 8, z = 9; @@ -11884,7 +11982,7 @@ __global__ void kernel1( cat_tensor.mean({1}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } -TEST(NVFuserTest, FusionBlockWelfordNoInit_CUDA) { +TEST_F(NVFuserTest, FusionBlockWelfordNoInit_CUDA) { FusionExecutor fe; int x = 7, y = 8, z = 9; @@ -11950,7 +12048,7 @@ __global__ void kernel1( TORCH_CHECK(in0.mean({1, 2}).allclose(out_avg, /*rtol*/ 1e-5, /*atol*/ 1e-6)); } -TEST(NVFuserTest, FusionGridWelfordNoInit_CUDA) { +TEST_F(NVFuserTest, FusionGridWelfordNoInit_CUDA) { FusionExecutor fe; int x = 128, y = 64, z = 128; @@ -12040,7 +12138,7 @@ __global__ void kernel1( TORCH_CHECK(in0.var(dims, false).allclose(out_var)); } -TEST(NVFuserTest, FusionWelfordOp_CUDA) { +TEST_F(NVFuserTest, FusionWelfordOp_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12048,7 +12146,7 @@ TEST(NVFuserTest, FusionWelfordOp_CUDA) { auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); + auto tv1 = mul(tv0, IrBuilder::create(1)); auto tvs = Welford(tv1, {1}); auto tv_avg = tvs.avg; auto tv_M2 = tvs.var_sum; @@ -12069,7 +12167,7 @@ TEST(NVFuserTest, FusionWelfordOp_CUDA) { at::Tensor t0 = at::randn({M, N}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto outputs = fe.runFusion({t0}); // by default Welford outputs sum of square diff so need to divide to get var @@ -12084,7 +12182,7 @@ TEST(NVFuserTest, FusionWelfordOp_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionBlockWelfordOp_CUDA) { +TEST_F(NVFuserTest, FusionBlockWelfordOp_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12092,7 +12190,7 @@ TEST(NVFuserTest, FusionBlockWelfordOp_CUDA) { auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); + auto tv1 = mul(tv0, IrBuilder::create(1)); auto tvs = Welford(tv1, {1}); auto tv_avg = tvs.avg; auto tv_M2 = tvs.var_sum; @@ -12115,7 +12213,7 @@ TEST(NVFuserTest, FusionBlockWelfordOp_CUDA) { at::Tensor t_N = at::empty({M}, options_int); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto outputs = fe.runFusion({t0}); // by default Welford outputs sum of square diff so need to divide to get var @@ -12130,7 +12228,7 @@ TEST(NVFuserTest, FusionBlockWelfordOp_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionGridWelfordOp_CUDA) { +TEST_F(NVFuserTest, FusionGridWelfordOp_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12138,7 +12236,7 @@ TEST(NVFuserTest, FusionGridWelfordOp_CUDA) { auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); + auto tv1 = mul(tv0, IrBuilder::create(1)); auto tvs = Welford(tv1, {1}); auto tv_avg = tvs.avg; auto tv_M2 = tvs.var_sum; @@ 
-12161,7 +12259,7 @@ TEST(NVFuserTest, FusionGridWelfordOp_CUDA) { at::Tensor t_N = at::empty({M}, options_int); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto outputs = fe.runFusion({t0}); // by default Welford outputs sum of square diff so need to divide to get var @@ -12176,7 +12274,7 @@ TEST(NVFuserTest, FusionGridWelfordOp_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionRfactorWelfordOp_CUDA) { +TEST_F(NVFuserTest, FusionRfactorWelfordOp_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12184,7 +12282,7 @@ TEST(NVFuserTest, FusionRfactorWelfordOp_CUDA) { auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); + auto tv1 = mul(tv0, IrBuilder::create(1)); auto tvs = Welford(tv1, {1}); auto tv_avg = tvs.avg; auto tv_M2 = tvs.var_sum; @@ -12206,7 +12304,7 @@ TEST(NVFuserTest, FusionRfactorWelfordOp_CUDA) { at::Tensor t_N = at::empty({M}, options_int); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto outputs = fe.runFusion({t0}); // by default Welford outputs sum of square diff so need to divide to get var @@ -12221,7 +12319,7 @@ TEST(NVFuserTest, FusionRfactorWelfordOp_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionWelfordSchedule_CUDA) { +TEST_F(NVFuserTest, FusionWelfordSchedule_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12229,7 +12327,7 @@ TEST(NVFuserTest, FusionWelfordSchedule_CUDA) { auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = mul(tv0, new Double(1)); + auto tv1 = mul(tv0, IrBuilder::create(1)); auto tvs = Welford(tv1, {1}); auto tv_avg = tvs.avg; auto tv_M2 = tvs.var_sum; @@ -12246,9 +12344,10 @@ TEST(NVFuserTest, FusionWelfordSchedule_CUDA) { auto reduction_params = getReductionHeuristics(&fusion, {t0}); scheduleReduction(&fusion, reduction_params.value()); + auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({t0}, reduction_params.value().lparams); + fe.compileFusion(&fusion, {t0}, lparams); + auto outputs = fe.runFusion({t0}, lparams); // by default Welford outputs sum of square diff so need to divide to get var outputs[1] /= N; @@ -12283,7 +12382,7 @@ void testWelford(DataType dtype, int red_axis, int odim, int rdim) { tv0_cast = castOp(DataType::Float, tv0); } fusion.addInput(tv0); - auto tv1 = mul(tv0_cast, new Double(1)); + auto tv1 = mul(tv0_cast, IrBuilder::create(1)); auto tvs = Welford(tv1, {axis}); auto tv_avg = tvs.avg; auto tv_M2 = tvs.var_sum; @@ -12324,8 +12423,8 @@ void testWelford(DataType dtype, int red_axis, int odim, int rdim) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); - auto outputs = fe.runFusion({aten_input}, reduction_params.value().lparams); + fe.compileFusion(&fusion, {aten_input}, lparams); + auto outputs = fe.runFusion({aten_input}, lparams); // by default Welford outputs sum of square diff so need to divide to // get var @@ -12351,7 +12450,7 @@ void testWelford(DataType dtype, int red_axis, int odim, int rdim) { } } // namespace -TEST(NVFuserTest, FusionWelfordShmoo_CUDA) { +TEST_F(NVFuserTest, FusionWelfordShmoo_CUDA) { std::vector dtypes = { DataType::Double, DataType::Float, DataType::Half}; #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 @@ -12393,7 +12492,7 @@ TEST(NVFuserTest, FusionWelfordShmoo_CUDA) { } } -TEST(NVFuserTest, FusionTranspose1_CUDA) { +TEST_F(NVFuserTest, FusionTranspose1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12414,7 +12513,7 @@ 
TEST(NVFuserTest, FusionTranspose1_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0.t(); @@ -12423,7 +12522,7 @@ TEST(NVFuserTest, FusionTranspose1_CUDA) { &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTranspose2_CUDA) { +TEST_F(NVFuserTest, FusionTranspose2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12447,7 +12546,7 @@ TEST(NVFuserTest, FusionTranspose2_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0.t(); @@ -12456,7 +12555,7 @@ TEST(NVFuserTest, FusionTranspose2_CUDA) { &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { +TEST_F(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12526,10 +12625,11 @@ TEST(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { at::Tensor t0 = at::randn({K, M}, options); at::Tensor t1 = at::randn({N, K}, options); - FusionExecutor fe; - fe.compileFusion(&fusion); // Lets specify a few bounds in launch params to make sure it works - fe.runFusion({t0, t1}, LaunchParams(1, -1, -1, 32, 4, 4)); + LaunchParams lparams(1, -1, -1, 32, 4, 4); + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}, lparams); + fe.runFusion({t0, t1}, lparams); // Don't specify any launch params auto cg_outputs = fe.runFusion({t0, t1}); @@ -12540,7 +12640,7 @@ TEST(NVFuserTest, FusionSimpleGemmTransposed_CUDA) { &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { +TEST_F(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12593,7 +12693,7 @@ TEST(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { at::Tensor cg_output = at::empty({dimx, dimy, dimz}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_input_t = at::transpose(input, 1, 2); @@ -12603,7 +12703,7 @@ TEST(NVFuserTest, FusionSoftmax3DTransposed_CUDA) { &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { // Case 1 // tv1 = tv0 * 0.5 // tv2 = tv1 * -1 @@ -12620,10 +12720,10 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { tv0 = transpose(tv0, {{0, 1}}); - TensorView* tv1 = mul(tv0, new Double(0.5)); - TensorView* tv2 = mul(tv1, new Double(-1.0)); - TensorView* tv3 = add(tv1, new Double(3.0)); - TensorView* tv4 = mul(tv1, new Double(2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(0.5)); + TensorView* tv2 = mul(tv1, IrBuilder::create(-1.0)); + TensorView* tv3 = add(tv1, IrBuilder::create(3.0)); + TensorView* tv4 = mul(tv1, IrBuilder::create(2.0)); TensorView* tv5 = add(tv3, tv2); TensorView* tv6 = add(tv5, tv4); @@ -12654,7 +12754,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { } for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); tv->axis(1)->parallelize(ParallelType::Unroll); @@ -12667,7 +12767,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { at::Tensor 
aten_input = at::randn({129, 127}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); at::Tensor aten_input_t = aten_input.t(); @@ -12686,7 +12786,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed1_CUDA) { &fusion, cg_outputs, {aten_input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { // Case 2 // tv1 = tv0 * -1 // tv2 = tv0 + 3 @@ -12702,9 +12802,9 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { tv0 = transpose(tv0, {{0, 1}}); - TensorView* tv1 = mul(tv0, new Double(-1.0)); - TensorView* tv2 = add(tv0, new Double(3.0)); - TensorView* tv3 = mul(tv0, new Double(2.0)); + TensorView* tv1 = mul(tv0, IrBuilder::create(-1.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(3.0)); + TensorView* tv3 = mul(tv0, IrBuilder::create(2.0)); TensorView* tv4 = add(tv2, tv1); TensorView* tv5 = add(tv4, tv3); @@ -12723,7 +12823,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { tv0->computeAt(tv6, 1); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -12736,7 +12836,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { at::Tensor input = at::randn({129, 127}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto cg_outputs = fe.runFusion({input}); auto input_t = input.t(); @@ -12752,7 +12852,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed2_CUDA) { testValidate(&fusion, cg_outputs, {input}, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { // Case 3 // T2 = T1 * 0.979361 // T3 = T2 * T0 @@ -12769,7 +12869,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { tv1 = transpose(tv1, {{0, 1}, {1, 2}, {2, 3}, {3, 0}}); - TensorView* tv2 = mul(tv1, new Double(.979361)); + TensorView* tv2 = mul(tv1, IrBuilder::create(.979361)); TensorView* tv3 = mul(tv2, tv0); fusion.addOutput(tv3); @@ -12786,7 +12886,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { tv3->axis(0)->parallelize(ParallelType::BIDx); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -12802,7 +12902,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2}); @@ -12814,7 +12914,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed3_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { // Case 4 // T4 = T2 - T3 // T5 = T1 + T4 @@ -12862,7 +12962,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { tv6->axis(0)->parallelize(ParallelType::BIDx); for (Val* val : fusion.vals()) { - if (!fusion.hasInput(val) && + if (!val->isFusionInput() && val->getValType().value() == ValType::TensorView) { TensorView* tv = static_cast(val); @@ -12880,7 +12980,7 @@ TEST(NVFuserTest, 
FusionAdvancedComputeAtTransposed4_CUDA) { std::vector aten_inputs = {t0, t1, t2, t3}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto t0_t = t0.permute({3, 0, 1, 2}); @@ -12895,7 +12995,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed4_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { // Case 5 // tv2 = tv0 + 2.0 // tv3 = tv1 * tv2 @@ -12909,7 +13009,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); tv1 = transpose(tv1, {{0, 1}}); - TensorView* tv2 = add(tv0, new Double(2.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(2.0)); TensorView* tv3 = mul(tv1, tv2); fusion.addOutput(tv3); @@ -12928,7 +13028,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto t2 = t0.t().add(2.0); @@ -12938,7 +13038,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed5_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { +TEST_F(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -12948,7 +13048,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { TensorView* tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); tv1 = transpose(tv1, {{0, 1}}); - TensorView* tv2 = add(tv0, new Double(2.0)); + TensorView* tv2 = add(tv0, IrBuilder::create(2.0)); TensorView* tv3 = mul(tv1, tv2); fusion.addOutput(tv3); @@ -12970,7 +13070,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto t2 = t0.t().add(2.0); @@ -12980,7 +13080,7 @@ TEST(NVFuserTest, FusionAdvancedComputeAtTransposed6_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSegmentReducePointwise_CUDA) { +TEST_F(NVFuserTest, FusionSegmentReducePointwise_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -12992,7 +13092,7 @@ TEST(NVFuserTest, FusionSegmentReducePointwise_CUDA) { fusion->addInput(tv1); fusion->addInput(tv2); - TensorView* tv3 = add(tv0, new Double(1)); // Group 0 + TensorView* tv3 = add(tv0, IrBuilder::create(1)); // Group 0 TensorView* tv4 = max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues) TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce, @@ -13029,7 +13129,7 @@ TEST(NVFuserTest, FusionSegmentReducePointwise_CUDA) { executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionMultipleVectorize_CUDA) { +TEST_F(NVFuserTest, FusionMultipleVectorize_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -13089,7 +13189,7 @@ TEST(NVFuserTest, FusionMultipleVectorize_CUDA) { TORCH_CHECK(runtime1 != runtime3); } -TEST(NVFuserTest, FusionVectorizeSimple_CUDA) { +TEST_F(NVFuserTest, FusionVectorizeSimple_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13123,7 +13223,7 @@ TEST(NVFuserTest, FusionVectorizeSimple_CUDA) { at::Tensor 
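// Aside (sketch, not part of the patch): the scalar-construction changes in these
// hunks replace `new Double(x)` / `new Int(x)` with IrBuilder factory calls; in the
// upstream sources those calls carry explicit template arguments, e.g.
// IrBuilder::create<Double>(2.0) and IrBuilder::create<Int>(1), and the input
// containers are declared as std::vector<IValue> (both assumed here from context).
// The same hunks also swap fusion.hasInput(val) for val->isFusionInput(), so the
// traversal reads roughly:
//
//   for (Val* val : fusion.vals()) {
//     if (!val->isFusionInput() &&
//         val->getValType().value() == ValType::TensorView) {
//       TensorView* tv = static_cast<TensorView*>(val);  // <TensorView*> assumed
//       tv->axis(1)->parallelize(ParallelType::Unroll);
//     }
//   }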
aten_input = at::empty({2, 6, 32}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {aten_input}); auto cg_outputs = fe.runFusion({aten_input}); at::Tensor aten_output = aten_input.sin(); @@ -13132,7 +13232,7 @@ TEST(NVFuserTest, FusionVectorizeSimple_CUDA) { &fusion, cg_outputs, {aten_input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { +TEST_F(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { Fusion fusion; FusionGuard fg(&fusion); // dimensionality of the problem @@ -13148,7 +13248,7 @@ TEST(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { // Do math with it, it returns a `Val*` but can be static_casted back to // TensorView - TensorView* tv2 = add(tv1, new Double(2.0)); + TensorView* tv2 = add(tv1, IrBuilder::create(2.0)); TensorView* tv3 = add(tv0, tv2); // Register your outputs @@ -13197,7 +13297,7 @@ TEST(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { at::Tensor output = at::empty_like(input1); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input1, input2}); fe.runFusion({input1, input2}, {output}); at::Tensor tv2_ref = input2 + 2.0; @@ -13206,7 +13306,7 @@ TEST(NVFuserTest, FusionSimpleVectorizeUnroll_CUDA) { TORCH_CHECK(output_ref.equal(output)); } -TEST(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) { +TEST_F(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -13220,7 +13320,7 @@ TEST(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) { fusion->addInput(tv0); - auto tv1 = add(tv0, new Double(1.0)); + auto tv1 = add(tv0, IrBuilder::create(1.0)); auto tv2 = sum(tv1, {2}); // Group 0 auto output = softmax(tv2, kReductionAxis); // Group 1 @@ -13247,14 +13347,14 @@ TEST(NVFuserTest, FusionSegmentReduceSoftmax_CUDA) { executor_cache.fusion(), outputs, {at_x}, {t3}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSwizzle1_CUDA) { +TEST_F(NVFuserTest, FusionSwizzle1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = mul(tv1, new Double(2)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = mul(tv1, IrBuilder::create(2)); fusion.addOutput(tv2); tv2->split(0, 7); @@ -13279,7 +13379,7 @@ TEST(NVFuserTest, FusionSwizzle1_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = (t0 + 1) * 2; @@ -13288,14 +13388,14 @@ TEST(NVFuserTest, FusionSwizzle1_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSwizzle2_CUDA) { +TEST_F(NVFuserTest, FusionSwizzle2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = mul(tv1, new Double(2)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = mul(tv1, IrBuilder::create(2)); fusion.addOutput(tv2); tv1->split(-1, 4); @@ -13323,7 +13423,7 @@ TEST(NVFuserTest, FusionSwizzle2_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = (t0 + 1) * 2; @@ -13332,7 +13432,7 @@ TEST(NVFuserTest, FusionSwizzle2_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTransposeWithSwizzle_CUDA) { +TEST_F(NVFuserTest, 
FusionTransposeWithSwizzle_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13385,8 +13485,7 @@ TEST(NVFuserTest, FusionTransposeWithSwizzle_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0.t(); @@ -13395,7 +13494,7 @@ TEST(NVFuserTest, FusionTransposeWithSwizzle_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTransposeWithSwizzle1DThreadBlock_CUDA) { +TEST_F(NVFuserTest, FusionTransposeWithSwizzle1DThreadBlock_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13452,8 +13551,7 @@ TEST(NVFuserTest, FusionTransposeWithSwizzle1DThreadBlock_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); - + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0.t(); @@ -13462,10 +13560,7 @@ TEST(NVFuserTest, FusionTransposeWithSwizzle1DThreadBlock_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGridPersistence_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionGridPersistence_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13490,7 +13585,7 @@ TEST(NVFuserTest, FusionGridPersistence_CUDA) { at::Tensor input = at::randn({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto out = fe.runFusion({input}); auto aten_output = input.sum({0}).unsqueeze(-1).add(input); @@ -13498,10 +13593,7 @@ TEST(NVFuserTest, FusionGridPersistence_CUDA) { testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGridPersistence2_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionGridPersistence2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13528,7 +13620,7 @@ TEST(NVFuserTest, FusionGridPersistence2_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto out = fe.runFusion({input}); auto aten_output = input.sum({0}).unsqueeze(0).add(input); @@ -13536,10 +13628,7 @@ TEST(NVFuserTest, FusionGridPersistence2_CUDA) { testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionWelfordPersistence_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionWelfordPersistence_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13567,7 +13656,7 @@ TEST(NVFuserTest, FusionWelfordPersistence_CUDA) { at::Tensor input = at::randn({numel_x}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); auto out = fe.runFusion({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) @@ -13577,10 +13666,7 @@ TEST(NVFuserTest, FusionWelfordPersistence_CUDA) { testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionWelfordPersistence2_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionWelfordPersistence2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13610,7 +13696,7 @@ TEST(NVFuserTest, FusionWelfordPersistence2_CUDA) { at::Tensor input = at::randn({numel_x, numel_y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, 
{input}); auto out = fe.runFusion({input}); auto aten_output = (input.mean({0}) + (input.var({0}, false) * numel_x)) @@ -13620,7 +13706,7 @@ TEST(NVFuserTest, FusionWelfordPersistence2_CUDA) { testValidate(&fusion, out, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue633_CUDA) { +TEST_F(NVFuserTest, FusionIssue633_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13642,14 +13728,13 @@ TEST(NVFuserTest, FusionIssue633_CUDA) { tv2->axis(0)->parallelize(ParallelType::BIDx); tv2->axis(1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({dx, dy, dz}, options); at::Tensor t1 = at::randn({dx, dy, 1}, options); std::vector aten_inputs = {t0, t1}; + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; @@ -13658,48 +13743,7 @@ TEST(NVFuserTest, FusionIssue633_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionKirScoping_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(2); - fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - fusion.addOutput(tv2); - - tv2->merge(0); - tv2->split(0, 4); - tv0->computeAt(tv2, -1); - - GpuLower gpulw(&fusion); - - auto kir_tv1 = gpulw.lowerValue(tv1); - auto tv1_scope = kir_tv1->definition()->scope(); - TORCH_CHECK(tv1_scope != nullptr); - TORCH_CHECK(tv1_scope->owner()->as()); - - auto kir_tv2 = gpulw.lowerValue(tv2); - auto tv2_scope = kir_tv2->definition()->scope(); - TORCH_CHECK(tv2_scope != nullptr); - TORCH_CHECK(tv2_scope->owner()->as()); - - TORCH_CHECK(tv1_scope != tv2_scope); - - // tv1 and tv2 should have the same inner-most ForLoop - auto parent_scope = tv1_scope->owner()->scope(); - TORCH_CHECK(parent_scope == tv2_scope->owner()->scope()); - TORCH_CHECK(parent_scope->owner()->as()); - // There should be one more loop - parent_scope = parent_scope->owner()->scope(); - TORCH_CHECK(parent_scope->owner()->as()); - - // scope() should return nullptr for top-level exprs - auto top_level_scope = parent_scope->owner()->scope(); - TORCH_CHECK(top_level_scope == nullptr); -} - -TEST(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { +TEST_F(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13726,7 +13770,7 @@ TEST(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto t3 = t0.unsqueeze(-1).expand(shape) + t1; @@ -13734,7 +13778,7 @@ TEST(NVFuserTest, FusionBroadcastAcrossComputeAt_CUDA) { testValidate(&fusion, cg_outputs, aten_inputs, {t3}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { +TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13777,7 +13821,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; @@ -13785,7 +13829,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedPointwise_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } 
-TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { +TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13835,7 +13879,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; @@ -13843,7 +13887,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeContig_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { +TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13896,7 +13940,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; @@ -13904,7 +13948,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicPass_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { +TEST_F(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -13963,7 +14007,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedPointwiseMergeSymbolicFail_CUDA) { ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } -TEST(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { +TEST_F(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14013,7 +14057,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0.add(t1).sum(1); @@ -14021,7 +14065,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedRFactor_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { +TEST_F(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14059,7 +14103,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedWrongDimFail_CUDA) { ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } -TEST(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { +TEST_F(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14100,7 +14144,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; @@ -14108,7 +14152,7 @@ TEST(NVFuserTest, FusionVectorizeMisalignedStride_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { +TEST_F(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14151,13 +14195,13 @@ TEST(NVFuserTest, FusionVectorizeMisalignedStrideFail_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, 
aten_inputs); // Failure because the input + output tensors do not have the same stride ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); } -TEST(NVFuserTest, FusionViewOutput_CUDA) { +TEST_F(NVFuserTest, FusionViewOutput_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14181,7 +14225,7 @@ TEST(NVFuserTest, FusionViewOutput_CUDA) { auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto outputs = fe.runFusion(aten_inputs, lparams); auto at_x_add_bias = at_x + at_bias; @@ -14190,7 +14234,7 @@ TEST(NVFuserTest, FusionViewOutput_CUDA) { testValidate(&fusion, outputs, aten_inputs, {at_x_view}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionViewFailMismatchSize_CUDA) { +TEST_F(NVFuserTest, FusionViewFailMismatchSize_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14210,7 +14254,7 @@ TEST(NVFuserTest, FusionViewFailMismatchSize_CUDA) { ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); } -TEST(NVFuserTest, FusionViewFailMulitDimInference_CUDA) { +TEST_F(NVFuserTest, FusionViewFailMulitDimInference_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14228,7 +14272,7 @@ TEST(NVFuserTest, FusionViewFailMulitDimInference_CUDA) { ASSERT_ANY_THROW(view(x_add_bias, input_shape, output_shape)); } -TEST(NVFuserTest, FusionViewFailReduction_CUDA) { +TEST_F(NVFuserTest, FusionViewFailReduction_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -14259,7 +14303,7 @@ TEST(NVFuserTest, FusionViewFailReduction_CUDA) { ASSERT_ANY_THROW(fusion_executor_cache.runFusionWithInputs({at_x, at_bias})); } -TEST(NVFuserTest, FusionViewFailPersistent_CUDA) { +TEST_F(NVFuserTest, FusionViewFailPersistent_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -14319,7 +14363,7 @@ void addViewGeluFusion( auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto outputs = fe.runFusion(aten_inputs, lparams); auto at_x_add_bias = at_x + at_bias; @@ -14330,25 +14374,25 @@ void addViewGeluFusion( } } -TEST(NVFuserTest, FusionViewSplit_CUDA) { +TEST_F(NVFuserTest, FusionViewSplit_CUDA) { std::vector input_shape{80}; std::vector output_shape{2, 4, 10}; addViewGeluFusion(input_shape, output_shape); } -TEST(NVFuserTest, FusionViewBroadcast_CUDA) { +TEST_F(NVFuserTest, FusionViewBroadcast_CUDA) { std::vector input_shape{80}; std::vector output_shape{1, 80}; addViewGeluFusion(input_shape, output_shape); } -TEST(NVFuserTest, FusionViewMerge_CUDA) { +TEST_F(NVFuserTest, FusionViewMerge_CUDA) { std::vector input_shape{2, 40, 7}; std::vector output_shape{560}; addViewGeluFusion(input_shape, output_shape); } -TEST(NVFuserTest, FusionViewAllShmoo_CUDA) { +TEST_F(NVFuserTest, FusionViewAllShmoo_CUDA) { typedef std::vector shape; typedef std::pair view_example; @@ -14373,7 +14417,7 @@ TEST(NVFuserTest, FusionViewAllShmoo_CUDA) { } } -TEST(NVFuserTest, FusionViewInferShmoo_CUDA) { +TEST_F(NVFuserTest, FusionViewInferShmoo_CUDA) { typedef std::vector shape; typedef std::pair view_example; @@ -14427,7 +14471,7 @@ void geluViewAddFusion( auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto outputs = fe.runFusion(aten_inputs, lparams); auto at_x_gelu = at::gelu(at_x); @@ -14438,7 +14482,7 @@ 
void geluViewAddFusion( } } -TEST(NVFuserTest, FusionViewStride_CUDA) { +TEST_F(NVFuserTest, FusionViewStride_CUDA) { typedef std::vector shape; typedef std::pair view_example; @@ -14483,7 +14527,7 @@ void geluViewBinaryAddFusion( auto lparams = schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs, lparams); auto outputs = fe.runFusion(aten_inputs, lparams); auto at_x_gelu = at::gelu(at_x); @@ -14495,11 +14539,11 @@ void geluViewBinaryAddFusion( } } -TEST(NVFuserTest, FusionViewBinary_CUDA) { +TEST_F(NVFuserTest, FusionViewBinary_CUDA) { geluViewBinaryAddFusion({27454, 2}, {54908}, {7844, 7}); } -TEST(NVFuserTest, FusionVectorization1_CUDA) { +TEST_F(NVFuserTest, FusionVectorization1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14540,7 +14584,7 @@ TEST(NVFuserTest, FusionVectorization1_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0 + t1; @@ -14548,7 +14592,7 @@ TEST(NVFuserTest, FusionVectorization1_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionVectorization2_CUDA) { +TEST_F(NVFuserTest, FusionVectorization2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14586,7 +14630,7 @@ TEST(NVFuserTest, FusionVectorization2_CUDA) { ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } -TEST(NVFuserTest, FusionVectorization3_CUDA) { +TEST_F(NVFuserTest, FusionVectorization3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14623,11 +14667,10 @@ TEST(NVFuserTest, FusionVectorization3_CUDA) { const int by = 2049; at::Tensor t0 = at::randn({bx, by}, options); at::Tensor t1 = at::randn({bx, by}, options); + std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); - - std::vector aten_inputs = {t0, t1}; + fe.compileFusion(&fusion, aten_inputs); ASSERT_ANY_THROW(fe.runFusion(aten_inputs)); aten_inputs[0] = t0.index({"...", Slice(1)}); @@ -14644,7 +14687,7 @@ TEST(NVFuserTest, FusionVectorization3_CUDA) { &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionVectorizationRFactor_CUDA) { +TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14692,7 +14735,7 @@ TEST(NVFuserTest, FusionVectorizationRFactor_CUDA) { std::vector aten_inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto aten_output = t0.add(t1).sum(1); @@ -14705,7 +14748,7 @@ TEST(NVFuserTest, FusionVectorizationRFactor_CUDA) { } // Unswitched loops with extent one may omit else clause. -TEST(NVFuserTest, FusionSizeOneLoop1_CUDA) { +TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14742,16 +14785,7 @@ TEST(NVFuserTest, FusionSizeOneLoop1_CUDA) { // Make sure the unswitched loop does not have an else clause. 
GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto fl = dynamic_cast(kir_node.get())) { - if (fl->iter_domain()->parallelType() != ParallelType::Unswitch) { - continue; - } - if (auto pred = dynamic_cast(fl->parentScope())) { - TORCH_CHECK(!pred->hasElse()); - } - } - } + TORCH_CHECK(!UnswitchInElseChecker::check(gpulw)); const int x = 11; const int y = 12; @@ -14763,7 +14797,7 @@ TEST(NVFuserTest, FusionSizeOneLoop1_CUDA) { std::vector aten_inputs = {t0, t1, t2}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto t6 = (t0.unsqueeze(-1) + t1).unsqueeze(0) + t2; @@ -14772,7 +14806,7 @@ TEST(NVFuserTest, FusionSizeOneLoop1_CUDA) { // The unswitched loop has extent one but inner loops don't. The else // part should not be omitted. -TEST(NVFuserTest, FusionSizeOneLoop2_CUDA) { +TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14780,7 +14814,7 @@ TEST(NVFuserTest, FusionSizeOneLoop2_CUDA) { auto tv0 = makeConcreteTensor({x}); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); fusion.addOutput(tv1); tv1->split(-1, 4); @@ -14790,38 +14824,29 @@ TEST(NVFuserTest, FusionSizeOneLoop2_CUDA) { // Make sure the size-one unswitched loop does not omit the else clause. GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto fl = dynamic_cast(kir_node.get())) { - if (fl->iter_domain()->parallelType() != ParallelType::Unswitch) { - continue; - } - if (auto pred = dynamic_cast(fl->parentScope())) { - TORCH_CHECK(pred->hasElse()); - } - } - } + TORCH_CHECK(UnswitchInElseChecker::check(gpulw)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({x}, options); std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto cg_outputs = fe.runFusion(aten_inputs); auto t1 = t0 + 1; testValidate(&fusion, cg_outputs, aten_inputs, {t1}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionValidateParallelize1_CUDA) { +TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); tv1->axis(-1)->parallelize(ParallelType::TIDx); @@ -14832,15 +14857,15 @@ TEST(NVFuserTest, FusionValidateParallelize1_CUDA) { ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } -TEST(NVFuserTest, FusionValidateParallelize2_CUDA) { +TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); tv1->axis(-1)->parallelize(ParallelType::TIDx); @@ -14853,15 +14878,15 @@ TEST(NVFuserTest, FusionValidateParallelize2_CUDA) { fe.compileFusion(&fusion); } -TEST(NVFuserTest, FusionValidateParallelize3_CUDA) { +TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto 
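// Aside (sketch, not the actual helper): the hand-written kernel-IR walks deleted
// above are folded into shared test utilities. Based on the removed code, the
// UnswitchInElseChecker::check(gpulw) call presumably does something like this
// (pointer types assumed from the member calls made on them):
//
//   for each node in gpulw.kernel()'s IR nodes:
//     if (auto fl = dynamic_cast<kir::ForLoop*>(node)) {
//       if (fl->iter_domain()->parallelType() != ParallelType::Unswitch) continue;
//       if (auto pred = dynamic_cast<kir::IfThenElse*>(fl->parentScope()))
//         // report whether the unswitched predicate has an else branch
//         // (pred->hasElse()), which is what the two tests assert on.
//     }
//
// The real implementation lives in the test utilities and may differ in detail.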
tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); tv1->split(-1, 4); @@ -14876,15 +14901,15 @@ TEST(NVFuserTest, FusionValidateParallelize3_CUDA) { fe.compileFusion(&fusion); } -TEST(NVFuserTest, FusionValidateParallelize4_CUDA) { +TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); tv1->split(-1, 4); @@ -14899,15 +14924,15 @@ TEST(NVFuserTest, FusionValidateParallelize4_CUDA) { ASSERT_ANY_THROW(fe.compileFusion(&fusion)); } -TEST(NVFuserTest, FusionValidateParallelize5_CUDA) { +TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); tv1->split(-1, 4); @@ -14924,7 +14949,7 @@ TEST(NVFuserTest, FusionValidateParallelize5_CUDA) { } // See issue #995 -TEST(NVFuserTest, FusionValidateParallelize6_CUDA) { +TEST_F(NVFuserTest, FusionValidateParallelize6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14933,7 +14958,7 @@ TEST(NVFuserTest, FusionValidateParallelize6_CUDA) { fusion.addInput(tv0); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); auto tv3 = broadcast(tv2, {true, false, false, false}); auto tv4 = add(tv3, tv1); fusion.addOutput(tv4); @@ -14960,7 +14985,7 @@ TEST(NVFuserTest, FusionValidateParallelize6_CUDA) { ASSERT_ANY_THROW(fusion.printKernel()); } -TEST(NVFuserTest, FusionDAGMerging_CUDA) { +TEST_F(NVFuserTest, FusionDAGMerging_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -14976,7 +15001,7 @@ TEST(NVFuserTest, FusionDAGMerging_CUDA) { auto tv5 = sum(tv4, {0}); // 3 // Branch 1 - auto tv6 = add(tv1, new Double(1)); // 4 + auto tv6 = add(tv1, IrBuilder::create(1)); // 4 // Merge auto tv7 = add(tv6, tv5); // 5 @@ -14995,17 +15020,17 @@ TEST(NVFuserTest, FusionDAGMerging_CUDA) { TORCH_CHECK(fusion_segments->groups().size() <= 4); } -TEST(NVFuserTest, FusionDAGScalarMerging_CUDA) { +TEST_F(NVFuserTest, FusionDAGScalarMerging_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(3); - auto i0 = new Double(); + auto i0 = IrBuilder::create(); fusion->addInput(tv0); fusion->addInput(i0); - auto i1 = add(i0, new Double(1.0)); + auto i1 = add(i0, IrBuilder::create(1.0)); auto i2 = mul(i1, i1); auto i3 = add(i2, i1); @@ -15051,7 +15076,7 @@ TEST(NVFuserTest, FusionDAGScalarMerging_CUDA) { executor_cache.fusion(), outputs, {t0, s0}, {t5}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { +TEST_F(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15073,14 +15098,14 @@ TEST(NVFuserTest, FusionBlockReduceInSerialLoop_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_output = t0.sum({1, 2}); testValidate( &fusion, outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { +TEST_F(NVFuserTest, 
FusionBlockWelfordInSerialLoop_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15106,7 +15131,7 @@ TEST(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); at::Tensor aten_avg = t0.mean({1, 2}); at::Tensor aten_M2 = t0.var({1, 2}, false) * N * K; @@ -15115,7 +15140,7 @@ TEST(NVFuserTest, FusionBlockWelfordInSerialLoop_CUDA) { } // See Issue #716 -TEST(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) { +TEST_F(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15129,7 +15154,7 @@ TEST(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) { std::vector broadcast_mask = {false, true}; auto tv0_bcast = broadcast(tv0, broadcast_mask); - auto path1_bcast = add(tv0_bcast, new Double(1.0)); + auto path1_bcast = add(tv0_bcast, IrBuilder::create(1.0)); auto path1 = sum(path1_bcast, reduction_axes); fusion.addOutput(path1); @@ -15145,7 +15170,7 @@ TEST(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) { std::vector aten_inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); // inplace op, we are adding t0 to itself auto outputs = fe.runFusion(aten_inputs, {t0}); @@ -15153,7 +15178,7 @@ TEST(NVFuserTest, FusionIOTensorTrivialReductionRepro_CUDA) { TORCH_CHECK(outputs[0].allclose(t0_ref.add(1))); } -TEST(NVFuserTest, FusionReductionPredicate_CUDA) { +TEST_F(NVFuserTest, FusionReductionPredicate_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15184,7 +15209,7 @@ TEST(NVFuserTest, FusionReductionPredicate_CUDA) { at::Tensor cg_output = at::empty({numel_y}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}); fe.runFusion({input}, {cg_output}); auto aten_output = input.to(at::kDouble).sum({0}); @@ -15193,7 +15218,7 @@ TEST(NVFuserTest, FusionReductionPredicate_CUDA) { &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue728_CUDA) { +TEST_F(NVFuserTest, FusionIssue728_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15204,10 +15229,10 @@ TEST(NVFuserTest, FusionIssue728_CUDA) { auto tv2 = makeSymbolicTensor(1); fusion.addOutput(tv2); - auto tv3 = add(tv0, new Double(1)); + auto tv3 = add(tv0, IrBuilder::create(1)); auto tv4 = add(tv3, tv1); - auto tv5 = add(tv4, new Double(1)); - auto tv6 = add(tv2, new Double(1)); + auto tv5 = add(tv4, IrBuilder::create(1)); + auto tv6 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv5); fusion.addOutput(tv6); @@ -15253,7 +15278,7 @@ TEST(NVFuserTest, FusionIssue728_CUDA) { "Only tv3 should be included"); } -TEST(NVFuserTest, FusionIssue757_CUDA) { +TEST_F(NVFuserTest, FusionIssue757_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15281,7 +15306,7 @@ TEST(NVFuserTest, FusionIssue757_CUDA) { std::vector inputs = {t0, t3}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0.sum({1}); @@ -15292,7 +15317,7 @@ TEST(NVFuserTest, FusionIssue757_CUDA) { } // See issue #759 -TEST(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { +TEST_F(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15323,7 +15348,7 @@ TEST(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { std::vector inputs = {t0, t3}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); 
auto outputs = fe.runFusion(inputs); auto t1 = t0.sum({1}); @@ -15333,7 +15358,7 @@ TEST(NVFuserTest, FusionPredicatedBlockBroadcast_CUDA) { testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSegmentVerticalMerge_CUDA) { +TEST_F(NVFuserTest, FusionSegmentVerticalMerge_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -15367,12 +15392,12 @@ TEST(NVFuserTest, FusionSegmentVerticalMerge_CUDA) { TORCH_CHECK(segmented_fusion->groups().size() == 2); } -TEST(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) { +TEST_F(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(3); - auto i0 = new Double(); + auto i0 = IrBuilder::create(); fusion->addInput(tv0); fusion->addInput(i0); @@ -15407,7 +15432,7 @@ TEST(NVFuserTest, FusionSegmentHorizontalMerge_CUDA) { TORCH_CHECK(segmented_fusion->groups().size() == 2); } -TEST(NVFuserTest, FusionSegmentMixReduction_CUDA) { +TEST_F(NVFuserTest, FusionSegmentMixReduction_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -15446,7 +15471,7 @@ TEST(NVFuserTest, FusionSegmentMixReduction_CUDA) { TORCH_CHECK(segmented_fusion->groups().size() <= 2); } -TEST(NVFuserTest, FusionSBAR_CUDA) { +TEST_F(NVFuserTest, FusionSBAR_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15492,12 +15517,11 @@ TEST(NVFuserTest, FusionSBAR_CUDA) { // outputs std::vector outputs; - auto lparams = schedulePointwise(&fusion, c10::ArrayRef(inputs)); + auto lparams = schedulePointwise(&fusion, inputs); FusionExecutor executor; - executor.compileFusion(&fusion); - - outputs = executor.runFusion(c10::ArrayRef(inputs), lparams); + executor.compileFusion(&fusion, inputs, lparams); + outputs = executor.runFusion(inputs, lparams); auto at_scale = at::mul(at_x, at_weight); auto at_scale_bias = at::add(at_scale, at_bias); @@ -15507,16 +15531,16 @@ TEST(NVFuserTest, FusionSBAR_CUDA) { testValidate(&fusion, outputs, inputs, {output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSingleElement_CUDA) { +TEST_F(NVFuserTest, FusionSingleElement_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(0); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(2.5)); + auto tv1 = add(tv0, IrBuilder::create(2.5)); - auto tv2 = add(tv1, new Double(3.5)); + auto tv2 = add(tv1, IrBuilder::create(3.5)); fusion.addOutput(tv2); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); @@ -15527,7 +15551,7 @@ TEST(NVFuserTest, FusionSingleElement_CUDA) { auto lparams = schedulePointwise(&fusion, {input}); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input}, lparams); fe.runFusion({input}, {cg_output}, lparams); auto aten_output = input.add(2.5).add(3.5); @@ -15536,7 +15560,7 @@ TEST(NVFuserTest, FusionSingleElement_CUDA) { &fusion, {cg_output}, {input}, {aten_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBNBackwardRepro_CUDA) { +TEST_F(NVFuserTest, FusionBNBackwardRepro_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -15566,12 +15590,12 @@ TEST(NVFuserTest, FusionBNBackwardRepro_CUDA) { makeSymbolicTensor(numDims); // single tensor broadcasted is dangerous. 
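// Aside (illustrative, not part of the patch): dropping the explicit c10::ArrayRef
// wrappers in FusionSBAR works because c10::ArrayRef<T> has an implicit converting
// constructor from std::vector<T>, so both calls below see the same view:
//
//   std::vector<c10::IValue> inputs = {at_x, at_weight, at_bias, at_y};
//   schedulePointwise(&fusion, inputs);                              // implicit
//   schedulePointwise(&fusion, c10::ArrayRef<c10::IValue>(inputs));  // explicit
//
// (the element type is written out here; it is implied in the patch itself).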
fusion.addInput(gt_0); - auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, new Int(1)); + auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create(1)); auto gt_float = castOp(DataType::Float, gt_bool); auto grad_out = mul(grad_out_prev, gt_float); - Val* eps_ptr = new Double(1e-5); + Val* eps_ptr = IrBuilder::create(1e-5); auto grads = batch_norm_backward( input, @@ -15606,7 +15630,7 @@ TEST(NVFuserTest, FusionBNBackwardRepro_CUDA) { } // TODO: We only changed inputs, merge this with the test above. -TEST(NVFuserTest, FusionBNBackwardRepro2_CUDA) { +TEST_F(NVFuserTest, FusionBNBackwardRepro2_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -15639,12 +15663,12 @@ TEST(NVFuserTest, FusionBNBackwardRepro2_CUDA) { auto gt_0 = makeConcreteTensor({-1, -1, 1, 1}); fusion.addInput(gt_0); - auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, new Int(1)); + auto gt_bool = binaryOp(BinaryOpType::GT, gt_0, IrBuilder::create(1)); auto gt_float = castOp(DataType::Float, gt_bool); auto grad_out = mul(grad_out_prev, gt_float); - Val* eps_ptr = new Double(1e-5); + Val* eps_ptr = IrBuilder::create(1e-5); auto grads = batch_norm_backward( input, @@ -15678,7 +15702,7 @@ TEST(NVFuserTest, FusionBNBackwardRepro2_CUDA) { auto outputs = fec.runFusionWithInputs(inputs); } -TEST(NVFuserTest, FusionBNRepro_CUDA) { +TEST_F(NVFuserTest, FusionBNRepro_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -15704,8 +15728,8 @@ TEST(NVFuserTest, FusionBNRepro_CUDA) { auto running_var = makeSymbolicTensor(1); fusion.addInput(running_var); - auto momentum_ptr = new Double(kMomentum); - auto eps_ptr = new Double(kEps); + auto momentum_ptr = IrBuilder::create(kMomentum); + auto eps_ptr = IrBuilder::create(kEps); auto result = batch_norm( input, @@ -15759,7 +15783,7 @@ TEST(NVFuserTest, FusionBNRepro_CUDA) { &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBNRepro2_CUDA) { +TEST_F(NVFuserTest, FusionBNRepro2_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -15777,8 +15801,8 @@ TEST(NVFuserTest, FusionBNRepro2_CUDA) { auto input = makeSymbolicTensor(numDims); fusion.addInput(input); - Val* momentum_ptr = new Double(kMomentum); - Val* eps_ptr = new Double(kEps); + Val* momentum_ptr = IrBuilder::create(kMomentum); + Val* eps_ptr = IrBuilder::create(kEps); auto result = batch_norm( input, @@ -15820,7 +15844,7 @@ TEST(NVFuserTest, FusionBNRepro2_CUDA) { &fusion, cg_outputs, aten_inputs, aten_outputs, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionZeroSizeTensorPW_CUDA) { +TEST_F(NVFuserTest, FusionZeroSizeTensorPW_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15830,7 +15854,7 @@ TEST(NVFuserTest, FusionZeroSizeTensorPW_CUDA) { auto tv1 = makeConcreteTensor({0}); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(2.5)); + auto tv2 = add(tv0, IrBuilder::create(2.5)); fusion.addOutput(tv2); auto tv3 = makeConcreteTensor({0}); @@ -15846,7 +15870,7 @@ TEST(NVFuserTest, FusionZeroSizeTensorPW_CUDA) { auto lparams = schedulePointwise(&fusion, {input0, input1}); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input0, input1}); fe.runFusion({input0, input1}, {cg_output2, cg_output3}, lparams); auto aten_output2 = input0.add(2.5); @@ -15861,7 +15885,7 @@ TEST(NVFuserTest, FusionZeroSizeTensorPW_CUDA) { __FILE__); } -TEST(NVFuserTest, 
FusionZeroSizeTensorReduction_CUDA) { +TEST_F(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15891,7 +15915,7 @@ TEST(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input0, input1}, lparams); auto cg_outputs = fe.runFusion({input0, input1}, lparams); auto aten_output2 = input0.sum({1}); at::Tensor aten_output3 = at::empty({0}, options); @@ -15907,7 +15931,7 @@ TEST(NVFuserTest, FusionZeroSizeTensorReduction_CUDA) { lparams); } -TEST(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) { +TEST_F(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -15938,7 +15962,7 @@ TEST(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) { auto lparams = reduction_params.value().lparams; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input0, input1}, lparams); auto cg_outputs = fe.runFusion({input0, input1}, lparams); auto aten_output2 = input0.sum({0}).add(input0); at::Tensor aten_output3 = at::empty({0}, options); @@ -15954,7 +15978,7 @@ TEST(NVFuserTest, FusionZeroSizeTensorNormalization_CUDA) { lparams); } -TEST(NVFuserTest, FusionSegmentIoAlias_CUDA) { +TEST_F(NVFuserTest, FusionSegmentIoAlias_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -15966,7 +15990,7 @@ TEST(NVFuserTest, FusionSegmentIoAlias_CUDA) { fusion->addInput(tv1); fusion->addInput(tv2); - TensorView* tv3 = add(tv0, new Double(1)); // Group 0 + TensorView* tv3 = add(tv0, IrBuilder::create(1)); // Group 0 TensorView* tv4 = max(tv3, {0}); // Group 0 (use max instead to avoid numerical issues) TensorView* tv5 = add(tv4, tv1); // Group 0 (Non Broadcast after reduce, @@ -16008,7 +16032,7 @@ TEST(NVFuserTest, FusionSegmentIoAlias_CUDA) { executor_cache.fusion(), outputs, {t0, t1, t2}, {t6}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionWelford1Output_CUDA) { +TEST_F(NVFuserTest, FusionWelford1Output_CUDA) { auto fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -16028,7 +16052,7 @@ TEST(NVFuserTest, FusionWelford1Output_CUDA) { testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTranslate1Welford_CUDA) { +TEST_F(NVFuserTest, FusionTranslate1Welford_CUDA) { auto fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -16037,7 +16061,8 @@ TEST(NVFuserTest, FusionTranslate1Welford_CUDA) { fusion->addInput(tv0); auto tvs = Welford(tv0, {1}); - fusion->addOutput(tvs.var_sum); + auto tv_out = add(tv0, broadcast(tvs.avg, {false, true})); + fusion->addOutput(tv_out); FusionExecutorCache executor_cache(std::move(fusion_ptr)); auto run_test = [&executor_cache, @@ -16047,9 +16072,13 @@ TEST(NVFuserTest, FusionTranslate1Welford_CUDA) { auto outputs = executor_cache.runFusionWithInputs({t0}); // Square sums does not fit well in the testValidate assumptions, // so we just compare the divided output here. 
- outputs[0] /= inner_size; - auto t1 = t0.var({1}, false); - testValidate(fusion, outputs, {t0}, {t1}, __LINE__, __FILE__); + testValidate( + fusion, + outputs, + {t0}, + {t0.add(t0.mean({1}).unsqueeze(1))}, + __LINE__, + __FILE__); return executor_cache.getMostRecentKernelRuntime(); }; @@ -16057,21 +16086,25 @@ TEST(NVFuserTest, FusionTranslate1Welford_CUDA) { // Run a translated welford auto runtime1 = run_test(64); // Check it was translated - TORCH_CHECK(runtime1->singleKernelFusion()->unordered_exprs().size() > 2); TORCH_CHECK( - runtime1->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == - ScheduleHeuristic::Persistent); + runtime1->fusionSegments()->groups().size() == 1 && + runtime1->fusionSegments()->groups()[0]->exprs().size() > 2); // Run an un-translated welford auto runtime2 = run_test(65536); - // Check it was not translated - TORCH_CHECK(runtime2->singleKernelFusion()->unordered_exprs().size() == 1); - TORCH_CHECK( - runtime2->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == - ScheduleHeuristic::Reduction); + + bool found_welford = false; + for (auto group : runtime2->fusionSegments()->groups()) { + for (auto expr : group->exprs()) { + if (expr->isA()) { + found_welford = true; + } + } + } + TORCH_CHECK(found_welford); } -TEST(NVFuserTest, FusionTranslate2Welford_CUDA) { +TEST_F(NVFuserTest, FusionTranslate2Welford_CUDA) { auto fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -16080,10 +16113,12 @@ TEST(NVFuserTest, FusionTranslate2Welford_CUDA) { fusion->addInput(tv0); auto tvs1 = Welford(tv0, {1}); - auto tvs2 = Welford(tv0, {1}); + auto tv_out1 = add(tv0, broadcast(tvs1.avg, {false, true})); + fusion->addOutput(tv_out1); - fusion->addOutput(tvs1.var_sum); - fusion->addOutput(tvs2.var_sum); + auto tvs2 = Welford(tv0, {1}); + auto tv_out2 = add(tv0, broadcast(tvs2.avg, {false, true})); + fusion->addOutput(tv_out2); FusionExecutorCache executor_cache(std::move(fusion_ptr)); @@ -16095,10 +16130,8 @@ TEST(NVFuserTest, FusionTranslate2Welford_CUDA) { // Square sums does not fit well in the testValidate assumptions, // so we just compare the divided output here. 
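// Aside (illustrative, not part of the patch): the translated-vs-untranslated
// Welford checks above no longer inspect a single-kernel fusion; they walk the
// segmented runtime instead. With the elided type argument restored (WelfordOp,
// assumed from context), the scan reads:
//
//   bool found_welford = false;
//   for (auto group : runtime2->fusionSegments()->groups()) {
//     for (auto expr : group->exprs()) {
//       if (expr->isA<WelfordOp>()) {
//         found_welford = true;
//       }
//     }
//   }
//   TORCH_CHECK(found_welford);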
- outputs[0] /= inner_size; - outputs[1] /= inner_size; - auto t1 = t0.var({1}, false); - testValidate(fusion, outputs, {t0}, {t1, t1}, __LINE__, __FILE__); + auto out = t0.add(t0.mean({1}).unsqueeze(1)); + testValidate(fusion, outputs, {t0}, {out, out}, __LINE__, __FILE__); return executor_cache.getMostRecentKernelRuntime(); }; @@ -16106,18 +16139,25 @@ TEST(NVFuserTest, FusionTranslate2Welford_CUDA) { // Run a translated welford auto runtime1 = run_test(64); // Check it was translated - TORCH_CHECK(runtime1->singleKernelFusion()->unordered_exprs().size() > 4); TORCH_CHECK( - runtime1->schedulerHeuristics()->singleKernelHeuristics()->heuristc() == - ScheduleHeuristic::Persistent); + runtime1->fusionSegments()->groups().size() == 1 && + runtime1->fusionSegments()->groups()[0]->exprs().size() > 4); // Run an un-translated welford auto runtime2 = run_test(65536); // // Check it was not translated - TORCH_CHECK(runtime2->singleKernelFusion()->unordered_exprs().size() == 2); + bool found_welford = false; + for (auto group : runtime2->fusionSegments()->groups()) { + for (auto expr : group->exprs()) { + if (expr->isA()) { + found_welford = true; + } + } + } + TORCH_CHECK(found_welford); } -TEST(NVFuserTest, FusionLargeWelfordNormalization_CUDA) { +TEST_F(NVFuserTest, FusionLargeWelfordNormalization_CUDA) { auto fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -16150,7 +16190,7 @@ TEST(NVFuserTest, FusionLargeWelfordNormalization_CUDA) { TORCH_CHECK(!runtime->isSegmented()); } -TEST(NVFuserTest, FusionWelfordOtherPersistence_CUDA) { +TEST_F(NVFuserTest, FusionWelfordOtherPersistence_CUDA) { auto fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -16176,20 +16216,22 @@ TEST(NVFuserTest, FusionWelfordOtherPersistence_CUDA) { at::Tensor t0 = at::randn({128, inner_size}, options); auto outputs = executor_cache.runFusionWithInputs({t0}); - auto t1 = t0.mean({1}).unsqueeze(1) + t0; - auto t2 = t0.sum({1}).unsqueeze(1) + t0; + auto t1 = t0.to(c10::kDouble).mean({1}).unsqueeze(1) + t0; + auto t2 = t0.to(c10::kDouble).sum({1}).unsqueeze(1) + t0; testValidate(fusion, outputs, {t0}, {t2, t1}, __LINE__, __FILE__); return executor_cache.getMostRecentKernelRuntime(); }; for (auto inner_size : {4096, 8192, 32768}) { - auto runtime = run_test(4096); - TORCH_CHECK(!runtime->isSegmented()); + auto runtime = run_test(inner_size); + TORCH_CHECK( + !runtime->isSegmented() || + runtime->fusionSegments()->groups().size() == 1); } } -TEST(NVFuserTest, FusionSegmentIslands_CUDA) { +TEST_F(NVFuserTest, FusionSegmentIslands_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16211,7 +16253,7 @@ TEST(NVFuserTest, FusionSegmentIslands_CUDA) { fusion_executor_cache.runFusionWithInputs({t0, t1}); } -TEST(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) { +TEST_F(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16247,7 +16289,7 @@ TEST(NVFuserTest, FusionBackOffInnerBroadcast_CUDA) { TORCH_CHECK(tv8->getMaxProducerPosition() == 2); } -TEST(NVFuserTest, FusionBackOffInnerBroadcast2_CUDA) { +TEST_F(NVFuserTest, FusionBackOffInnerBroadcast2_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16267,7 +16309,7 @@ TEST(NVFuserTest, FusionBackOffInnerBroadcast2_CUDA) { TORCH_CHECK(tv3->getMaxProducerPosition() == 2); } -TEST(NVFuserTest, FusionBackOffInnerBroadcast3_CUDA) { +TEST_F(NVFuserTest, FusionBackOffInnerBroadcast3_CUDA) { auto 
fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16286,7 +16328,7 @@ TEST(NVFuserTest, FusionBackOffInnerBroadcast3_CUDA) { TORCH_CHECK(tv3->getMaxProducerPosition() == 3); } -TEST(NVFuserTest, FusionSimpleWarp_CUDA) { +TEST_F(NVFuserTest, FusionSimpleWarp_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16317,14 +16359,14 @@ TEST(NVFuserTest, FusionSimpleWarp_CUDA) { auto at_output = input1.sum({1}, true).add(input1); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSimpleWarpPad_CUDA) { +TEST_F(NVFuserTest, FusionSimpleWarpPad_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16365,13 +16407,13 @@ TEST(NVFuserTest, FusionSimpleWarpPad_CUDA) { auto at_output = input1.sum({1}, true).add(input1); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionWarpPadMergeSplit_CUDA) { +TEST_F(NVFuserTest, FusionWarpPadMergeSplit_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16409,13 +16451,13 @@ TEST(NVFuserTest, FusionWarpPadMergeSplit_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSerialWarpReduction_CUDA) { +TEST_F(NVFuserTest, FusionSerialWarpReduction_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16450,13 +16492,13 @@ TEST(NVFuserTest, FusionSerialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2}, true).add(input1); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTrivialWarpReduction_CUDA) { +TEST_F(NVFuserTest, FusionTrivialWarpReduction_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16494,13 +16536,13 @@ TEST(NVFuserTest, FusionTrivialWarpReduction_CUDA) { auto at_output = input1.sum({1, 2, 3}, true).add(input1); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionMultipleDimBinding_CUDA) { +TEST_F(NVFuserTest, FusionMultipleDimBinding_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16548,7 +16590,7 @@ TEST(NVFuserTest, FusionMultipleDimBinding_CUDA) { auto at_output = input1.sum({1}, true).add(input1); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1, input2}); auto outputs = fe.runFusion({input1, input2}); testValidate( fusion.get(), @@ -16559,7 +16601,7 @@ TEST(NVFuserTest, FusionMultipleDimBinding_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionPadNoWarpReduce_CUDA) { +TEST_F(NVFuserTest, FusionPadNoWarpReduce_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16588,19 +16630,19 @@ TEST(NVFuserTest, 
FusionPadNoWarpReduce_CUDA) { auto at_output = input1.sum({1}, true).add(input1); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) { +TEST_F(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(2); fusion->addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); fusion->addOutput(tv2); @@ -16623,13 +16665,13 @@ TEST(NVFuserTest, FusionWarpMutipleThreadDim_CUDA) { auto at_output = (input1 + 1).sum({1}); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) { +TEST_F(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -16673,13 +16715,13 @@ TEST(NVFuserTest, FusionWarpReduceUnrollOuterLoop_CUDA) { auto at_output = input1.sum({1}, true).add(input1); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSegfaultReduction_CUDA) { +TEST_F(NVFuserTest, FusionSegfaultReduction_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -16698,7 +16740,7 @@ TEST(NVFuserTest, FusionSegfaultReduction_CUDA) { std::vector at_sum_axes; std::vector outer_reduction_axes; std::vector outer_broadcast_mask(numDims, false); - Val* N = new Double(1); + Val* N = IrBuilder::create(1); for (const auto axis : c10::irange(numDims)) { if (axis != 1) { outer_reduction_axes.push_back(axis); @@ -16728,16 +16770,16 @@ TEST(NVFuserTest, FusionSegfaultReduction_CUDA) { &fusion, outputs, inputs, {at_output0, at_output1}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionPredicateElimination_CUDA) { +TEST_F(NVFuserTest, FusionPredicateElimination_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); - auto tv3 = add(tv2, new Double(3)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(2)); + auto tv3 = add(tv2, IrBuilder::create(3)); fusion.addOutput(tv3); @@ -16748,7 +16790,7 @@ TEST(NVFuserTest, FusionPredicateElimination_CUDA) { { GpuLower gpulw(&fusion); - TORCH_CHECK(!isPredicated(tv2, gpulw)); + TORCH_CHECK(!PredicatedChecker::isPredicated(tv2, gpulw)); } tv2->axis(1)->parallelize(ParallelType::Serial); @@ -16756,11 +16798,11 @@ TEST(NVFuserTest, FusionPredicateElimination_CUDA) { { GpuLower gpulw(&fusion); - TORCH_CHECK(isPredicated(tv2, gpulw)); + TORCH_CHECK(PredicatedChecker::isPredicated(tv2, gpulw)); } } -TEST(NVFuserTest, FusionForceFp16Simple_CUDA) { +TEST_F(NVFuserTest, FusionForceFp16Simple_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -16798,53 +16840,55 @@ TEST(NVFuserTest, FusionForceFp16Simple_CUDA) { } } -TEST(NVFuserTest, FusionForceBf16Simple_CUDA) { +TEST_F(NVFuserTest, 
FusionForceBf16Simple_CUDA) { #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + // requires ampere+ GPU + if (!deviceMajorMinorCheck(8)) { + GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; + return; + } - auto tv0 = makeSymbolicTensor(2); - auto tv1 = makeSymbolicTensor(2); + std::unique_ptr fusion_ptr = std::make_unique(); + auto fusion = fusion_ptr.get(); + FusionGuard fg(fusion); - fusion->addInput(tv0); - fusion->addInput(tv1); + auto tv0 = makeSymbolicTensor(2); + auto tv1 = makeSymbolicTensor(2); - // Group 1 - auto tv2 = sum(tv0, {1}); - auto tv3 = broadcast(tv2, {false, true}); + fusion->addInput(tv0); + fusion->addInput(tv1); - // Group 2 - auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast - auto tv5 = castOp(DataType::BFloat16, tv4); + // Group 1 + auto tv2 = sum(tv0, {1}); + auto tv3 = broadcast(tv2, {false, true}); - fusion->addOutput(tv5); + // Group 2 + auto tv4 = add(tv3, tv1); // Edge: tv3: expect cast + auto tv5 = castOp(DataType::BFloat16, tv4); - FusionExecutorCache fec(std::move(fusion_ptr)); + fusion->addOutput(tv5); - std::vector shape{15, 16}; + FusionExecutorCache fec(std::move(fusion_ptr)); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + std::vector shape{15, 16}; - // Check the segmented edge is bf16 - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - TORCH_CHECK(edge_tv->getDataType() == DataType::BFloat16); - } - } else { - GTEST_SKIP(); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto in0 = at::randn(shape, options); + auto in1 = at::randn(shape, options); + fec.runFusionWithInputs({in0, in1}); + + // Check the segmented edge is bf16 + auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); + for (auto edge : segmented_fusion->edges()) { + auto edge_tv = edge->val->as(); + TORCH_CHECK(edge_tv->getDataType() == DataType::BFloat16); } #else - GTEST_SKIP(); + GTEST_SKIP() << "requires cuda 11.0 or newer toolkit"; #endif } -TEST(NVFuserTest, FusionForceFp16NotAllCast_CUDA) { +TEST_F(NVFuserTest, FusionForceFp16NotAllCast_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -16893,64 +16937,66 @@ TEST(NVFuserTest, FusionForceFp16NotAllCast_CUDA) { } } -TEST(NVFuserTest, FusionForceBf16NotAllCast_CUDA) { +TEST_F(NVFuserTest, FusionForceBf16NotAllCast_CUDA) { #if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - if (at::cuda::getDeviceProperties(0)->major >= 8) { - std::unique_ptr fusion_ptr = std::make_unique(); - auto fusion = fusion_ptr.get(); - FusionGuard fg(fusion); + // requires ampere+ GPU + if (!deviceMajorMinorCheck(8)) { + GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs"; + return; + } - auto tv0 = makeSymbolicTensor(3); - auto tv1 = makeSymbolicTensor(3); + std::unique_ptr fusion_ptr = std::make_unique(); + auto fusion = fusion_ptr.get(); + FusionGuard fg(fusion); - fusion->addInput(tv0); - fusion->addInput(tv1); + auto tv0 = makeSymbolicTensor(3); + auto tv1 = makeSymbolicTensor(3); - // Group 1 - auto tv3 = sum(tv0, {1}); - auto tv4 = broadcast(tv3, {false, true, false}); - auto tv5 = sum(tv0, {1}); + 
fusion->addInput(tv0); + fusion->addInput(tv1); - // Group 2 - auto tv6 = add(tv4, tv1); // edge tv4, expect cast - auto tv7 = castOp(DataType::BFloat16, tv6); + // Group 1 + auto tv3 = sum(tv0, {1}); + auto tv4 = broadcast(tv3, {false, true, false}); + auto tv5 = sum(tv0, {1}); - // Group 3 - auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast + // Group 2 + auto tv6 = add(tv4, tv1); // edge tv4, expect cast + auto tv7 = castOp(DataType::BFloat16, tv6); - fusion->addOutput(tv7); - fusion->addOutput(tv8); + // Group 3 + auto tv8 = sum(tv5, {1}); // edge tv5, don't expect cast - FusionExecutorCache fec(std::move(fusion_ptr)); + fusion->addOutput(tv7); + fusion->addOutput(tv8); - std::vector shape{16, 16, 16}; + FusionExecutorCache fec(std::move(fusion_ptr)); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - auto in0 = at::randn(shape, options); - auto in1 = at::randn(shape, options); - fec.runFusionWithInputs({in0, in1}); + std::vector shape{16, 16, 16}; - auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); - auto complete_fusion = segmented_fusion->completeFusion(); + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + auto in0 = at::randn(shape, options); + auto in1 = at::randn(shape, options); + fec.runFusionWithInputs({in0, in1}); - // Check that the edge that wasn't fp16 is the producer of the - // reduction op, i.e. tv8 = sum(tv5,{1});. - for (auto edge : segmented_fusion->edges()) { - auto edge_tv = edge->val->as(); - if (edge_tv->getDataType() == DataType::Float) { - auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin()); - TORCH_CHECK(consumer->isA()); - } + auto segmented_fusion = fec.getMostRecentKernelRuntime()->fusionSegments(); + auto complete_fusion = segmented_fusion->completeFusion(); + + // Check that the edge that wasn't fp16 is the producer of the + // reduction op, i.e. tv8 = sum(tv5,{1});. 
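
A recurring change in these hunks: instead of wrapping the whole BFloat16 test body in an at::cuda::getDeviceProperties(0)->major >= 8 check, the tests now bail out early through a deviceMajorMinorCheck(8) helper and report why they were skipped. The helper's definition is not part of this section; the sketch below is only one plausible shape for it, built from the device-property query the removed lines already used.

// Hedged sketch -- deviceMajorMinorCheck is called by the patch but defined
// elsewhere; this reconstruction is an assumption, not the patch's code.
static bool deviceMajorMinorCheckSketch(int major, int minor = 0) {
  const auto* prop = at::cuda::getDeviceProperties(0);
  if (prop->major != major) {
    return prop->major > major;
  }
  return prop->minor >= minor;
}

// Usage mirroring the updated tests: skip with a reason on pre-Ampere GPUs.
//   if (!deviceMajorMinorCheckSketch(8)) {
//     GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
//     return;
//   }
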
+ for (auto edge : segmented_fusion->edges()) { + auto edge_tv = edge->val->as(); + if (edge_tv->getDataType() == DataType::Float) { + auto consumer = *(complete_fusion->unordered_uses(edge_tv).begin()); + TORCH_CHECK(consumer->isA()); } - } else { - GTEST_SKIP(); } #else - GTEST_SKIP(); + GTEST_SKIP() << "requires cuda 11.0 or newer toolkit"; #endif } -TEST(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) { +TEST_F(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -16961,10 +17007,10 @@ TEST(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) { fusion->addInput(tv0); fusion->addInput(tv1); - auto tv2 = mul(tv0, new Double(2)); + auto tv2 = mul(tv0, IrBuilder::create(2)); auto tv3 = broadcast(tv2, {false, false, true}); auto tv4 = add(tv3, tv1); - auto tv5 = mul(tv4, new Double(3)); + auto tv5 = mul(tv4, IrBuilder::create(3)); fusion->addOutput(tv5); // t4 cannot inner re-use t2, because there's a broadcast @@ -16978,13 +17024,13 @@ TEST(NVFuserTest, FusionBufferReuseBroadCastMultiVisit_CUDA) { auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3; FusionExecutor fe; - fe.compileFusion(fusion); + fe.compileFusion(fusion, {in0, in1}); auto outputs = fe.runFusion({in0, in1}); testValidate(fusion, outputs, {in0, in1}, {at_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBufferReuseStressTest_CUDA) { +TEST_F(NVFuserTest, FusionBufferReuseStressTest_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -16995,17 +17041,17 @@ TEST(NVFuserTest, FusionBufferReuseStressTest_CUDA) { fusion->addInput(tv0); fusion->addInput(tv1); - auto tv2 = mul(tv0, new Double(2)); - auto tv3 = mul(tv0, new Double(3)); + auto tv2 = mul(tv0, IrBuilder::create(2)); + auto tv3 = mul(tv0, IrBuilder::create(3)); auto tv4 = mul(tv2, tv3); // Broadcast buffer can be reused through outer sharing auto tv5 = broadcast(tv4, {true, false, false}); - auto tv6 = mul(tv5, new Double(5)); + auto tv6 = mul(tv5, IrBuilder::create(5)); auto tv7 = mul(tv6, tv1); - auto tv8 = mul(tv7, new Double(7)); + auto tv8 = mul(tv7, IrBuilder::create(7)); // tv9 shouldn't alias to avoid buffer over-subscription auto tv9 = broadcast(tv4, {true, false, false}); - auto tv10 = mul(tv9, new Double(9)); + auto tv10 = mul(tv9, IrBuilder::create(9)); auto tv11 = add(tv5, tv9); fusion->addOutput(tv7); fusion->addOutput(tv11); @@ -17031,7 +17077,7 @@ TEST(NVFuserTest, FusionBufferReuseStressTest_CUDA) { auto t10 = t9 * 9; auto t11 = t5 + t9; FusionExecutor fe; - fe.compileFusion(fusion); + fe.compileFusion(fusion, {in0, in1}); auto at_output = ((in0 * 2).unsqueeze(2) + in1) * 3; auto outputs = fe.runFusion({in0, in1}); @@ -17039,7 +17085,7 @@ TEST(NVFuserTest, FusionBufferReuseStressTest_CUDA) { testValidate(fusion, outputs, {in0, in1}, {t7, t11}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { +TEST_F(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -17048,12 +17094,12 @@ TEST(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { fusion->addInput(tv0); - auto tv1 = mul(tv0, new Double(2)); - auto tv2 = mul(tv1, new Double(2)); - auto tv3 = mul(tv2, new Double(2)); - auto tv4 = mul(tv3, new Double(2)); - auto tv5 = mul(tv4, new Double(2)); - auto tv6 = mul(tv5, new Double(2)); + auto tv1 = mul(tv0, IrBuilder::create(2)); + auto tv2 
= mul(tv1, IrBuilder::create(2)); + auto tv3 = mul(tv2, IrBuilder::create(2)); + auto tv4 = mul(tv3, IrBuilder::create(2)); + auto tv5 = mul(tv4, IrBuilder::create(2)); + auto tv6 = mul(tv5, IrBuilder::create(2)); fusion->addOutput(tv6); @@ -17064,7 +17110,7 @@ TEST(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { auto in0 = at::randn({256, 512}, options); FusionExecutor fe; - fe.compileFusion(fusion); + fe.compileFusion(fusion, {in0}); auto outputs = fe.runFusion({in0}); auto at_out = in0.mul(2).mul(2).mul(2).mul(2).mul(2).mul(2); @@ -17072,7 +17118,7 @@ TEST(NVFuserTest, FusionBufferReuseLargeBuffer_CUDA) { testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { +TEST_F(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -17083,12 +17129,12 @@ TEST(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { fusion->addInput(tv0); fusion->addInput(tv1); - auto tv2 = mul(tv0, new Double(2)); + auto tv2 = mul(tv0, IrBuilder::create(2)); auto tv3 = broadcast(tv2, {false, false, true}); auto tv4 = add(tv3, tv1); // T4 to be inner aliased first, and // shouldn't outer alias on top - auto tv5 = mul(tv4, new Double(3)); - auto tv6 = mul(tv5, new Double(3)); + auto tv5 = mul(tv4, IrBuilder::create(3)); + auto tv6 = mul(tv5, IrBuilder::create(3)); fusion->addOutput(tv6); tv0->computeAt(tv6, 1, ComputeAtMode::BestEffort); @@ -17098,7 +17144,7 @@ TEST(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); FusionExecutor fe; - fe.compileFusion(fusion); + fe.compileFusion(fusion, {in0, in1}); auto outputs = fe.runFusion({in0, in1}); auto at_out = (in0.mul(2.0).unsqueeze(2) + in1).mul(3.0).mul(3.0); @@ -17106,7 +17152,7 @@ TEST(NVFuserTest, FusionBufferReuseNo2hop_CUDA) { testValidate(fusion, outputs, {in0, in1}, {at_out}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { +TEST_F(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -17116,8 +17162,8 @@ TEST(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { fusion->addInput(tv0); auto tv1 = sum(tv0, {1}); - auto tv2 = mul(tv1, new Double(2)); - auto tv3 = mul(tv2, new Double(2)); + auto tv2 = mul(tv1, IrBuilder::create(2)); + auto tv3 = mul(tv2, IrBuilder::create(2)); fusion->addOutput(tv3); @@ -17134,7 +17180,7 @@ TEST(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { auto in0 = at::randn({3, 3, 3}, options); FusionExecutor fe; - fe.compileFusion(fusion); + fe.compileFusion(fusion, {in0}); auto outputs = fe.runFusion({in0}); auto at_out = in0.sum(1).mul(2).mul(2); @@ -17142,7 +17188,7 @@ TEST(NVFuserTest, FusionBufferReuseAllocationOrder_CUDA) { testValidate(fusion, outputs, {in0}, {at_out}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { +TEST_F(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -17151,9 +17197,9 @@ TEST(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { fusion->addInput(tv0); - auto tv1 = mul(tv0, new Double(3)); - auto tv2 = mul(tv1, new Double(2)); - auto tv3 = mul(tv2, new Double(2)); + auto tv1 = mul(tv0, IrBuilder::create(3)); + auto tv2 = mul(tv1, IrBuilder::create(2)); + auto tv3 = mul(tv2, IrBuilder::create(2)); // tv1 
used till here, cannot be reused by tv2 or tv3 auto tv4 = mul(tv3, tv1); @@ -17165,7 +17211,7 @@ TEST(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { auto in0 = at::randn({16, 16}, options); FusionExecutor fe; - fe.compileFusion(fusion); + fe.compileFusion(fusion, {in0}); auto cg_outputs = fe.runFusion({in0}); auto at_t0 = in0 * 3.0; @@ -17174,7 +17220,7 @@ TEST(NVFuserTest, FusionBufferReuseLiveInterval_CUDA) { testValidate(fusion, cg_outputs, {in0}, {at_out}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { +TEST_F(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); auto fusion = fusion_ptr.get(); FusionGuard fg(fusion); @@ -17185,12 +17231,12 @@ TEST(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { fusion->addInput(tv0); fusion->addInput(tv1); - auto tv2 = mul(tv0, new Double(2)); - auto tv3 = mul(tv0, new Double(3)); + auto tv2 = mul(tv0, IrBuilder::create(2)); + auto tv3 = mul(tv0, IrBuilder::create(3)); auto tv4 = mul(tv2, tv3); auto tv5 = broadcast(tv4, {false, false, true}); auto tv6 = mul(tv5, tv1); - auto tv7 = mul(tv6, new Double(7)); + auto tv7 = mul(tv6, IrBuilder::create(7)); fusion->addOutput(tv7); // tv6 shouldn't re-use t2 or t3 because of @@ -17202,7 +17248,7 @@ TEST(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { auto in0 = at::randn({2, 2}, options); auto in1 = at::randn({2, 2, 2}, options); FusionExecutor fe; - fe.compileFusion(fusion); + fe.compileFusion(fusion, {in0, in1}); auto outputs = fe.runFusion({in0, in1}); auto t2 = in0 * 2; @@ -17214,7 +17260,7 @@ TEST(NVFuserTest, FusionBufferReuseNoAcrossBroadcast_CUDA) { testValidate(fusion, outputs, {in0, in1}, {t7}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue970_CUDA) { +TEST_F(NVFuserTest, FusionIssue970_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -17230,14 +17276,13 @@ TEST(NVFuserTest, FusionIssue970_CUDA) { tv1->split(1, 4); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor t0 = at::randn({nelm, nelm}, options); + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); auto outputs = fe.runFusion({t0}); auto ref = sum(t0, {1}).unsqueeze(-1).expand({nelm, nelm}) + t0; @@ -17246,15 +17291,15 @@ TEST(NVFuserTest, FusionIssue970_CUDA) { } // Reproducer of #1016 -TEST(NVFuserTest, FusionIssue1016_CUDA) { +TEST_F(NVFuserTest, FusionIssue1016_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(2)); fusion.addOutput(tv2); @@ -17262,15 +17307,15 @@ TEST(NVFuserTest, FusionIssue1016_CUDA) { tv2->split(-1, 8); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 10; int numel_y = 11; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = t0 + 1 + 2; @@ -17279,13 +17324,13 @@ TEST(NVFuserTest, FusionIssue1016_CUDA) { } // Reproducer of #1021 -TEST(NVFuserTest, FusionIssue1021_CUDA) { +TEST_F(NVFuserTest, FusionIssue1021_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = 
makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = broadcast(tv1, {false, true}); fusion.addOutput(tv2); @@ -17298,12 +17343,12 @@ TEST(NVFuserTest, FusionIssue1021_CUDA) { tv2->axis(0)->parallelize(ParallelType::TIDx); tv2->axis(1)->parallelize(ParallelType::Vectorize); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({10}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = (t0 + 1).unsqueeze(-1); @@ -17312,7 +17357,7 @@ TEST(NVFuserTest, FusionIssue1021_CUDA) { } // Reproducer of issue #1053 -TEST(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { +TEST_F(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -17321,7 +17366,7 @@ TEST(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { auto tv1 = sum(tv0, {0}); fusion->addOutput(tv1); - auto tv2 = add(tv0, new Double(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); fusion->addOutput(tv2); tv1->split(0, 8); @@ -17340,20 +17385,20 @@ TEST(NVFuserTest, FusionNonUniqueThreadDim_CUDA) { auto at_tv2 = input1 + 1; FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( fusion.get(), outputs, {input1}, {at_tv1, at_tv2}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionParallelDimensionMap1_CUDA) { +TEST_F(NVFuserTest, FusionParallelDimensionMap1_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(1); fusion->addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); fusion->addOutput(tv1); fusion->addOutput(tv2); @@ -17366,25 +17411,22 @@ TEST(NVFuserTest, FusionParallelDimensionMap1_CUDA) { // actual values are not statically known GpuLower gpulw(fusion.get()); const auto& pdmap = gpulw.parallelDimensionMap(); - auto kir_tv1 = gpulw.lowerValue(tv1)->as(); - auto kir_tv2 = gpulw.lowerValue(tv2)->as(); - for (const auto i : c10::irange(kir_tv1->domain()->domain().size())) { - auto dom1 = kir_tv1->domain()->domain()[i]; - auto dom2 = kir_tv2->domain()->domain()[i]; + for (const auto i : c10::irange(tv1->domain()->domain().size())) { + auto dom1 = tv1->domain()->domain()[i]; + auto dom2 = tv2->domain()->domain()[i]; TORCH_INTERNAL_ASSERT(pdmap.equalDim(dom1->extent(), dom2->extent())); } TORCH_CHECK(pdmap.isExact(ParallelType::TIDx)); TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); + pdmap.get(ParallelType::TIDx)->isA() && + pdmap.get(ParallelType::TIDx)->as()->name() == "blockDim.x"); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({32}, options); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( @@ -17396,7 +17438,7 @@ TEST(NVFuserTest, FusionParallelDimensionMap1_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionParallelDimensionMap2_CUDA) { +TEST_F(NVFuserTest, FusionParallelDimensionMap2_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -17418,16 +17460,15 @@ TEST(NVFuserTest, FusionParallelDimensionMap2_CUDA) 
{ const auto& pdmap = gpulw.parallelDimensionMap(); TORCH_CHECK(pdmap.isExact(ParallelType::TIDx)); TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); + pdmap.get(ParallelType::TIDx)->isA() && + pdmap.get(ParallelType::TIDx)->as()->name() == "blockDim.x"); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({11}, options); at::Tensor input2 = at::randn({11, 13}, options); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1, input2}); auto outputs = fe.runFusion({input1, input2}); auto ref = input1.unsqueeze(-1) + input2; @@ -17437,24 +17478,24 @@ TEST(NVFuserTest, FusionParallelDimensionMap2_CUDA) { } // Mix symbolic and concrete tensors -TEST(NVFuserTest, FusionParallelDimensionMap3_CUDA) { +TEST_F(NVFuserTest, FusionParallelDimensionMap3_CUDA) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); auto tv0 = makeSymbolicTensor(1); fusion->addInput(tv0); - auto tv2 = add(tv0, new Double(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); fusion->addOutput(tv2); - auto tv3 = add(tv0, new Double(1)); + auto tv3 = add(tv0, IrBuilder::create(1)); fusion->addOutput(tv3); tv2->split(0, 10); tv3->split(0, 20); - auto tv4 = add(tv0, new Double(1)); + auto tv4 = add(tv0, IrBuilder::create(1)); fusion->addOutput(tv4); - auto tv5 = add(tv0, new Double(1)); + auto tv5 = add(tv0, IrBuilder::create(1)); fusion->addOutput(tv5); // Not mapped but equal extent @@ -17471,19 +17512,18 @@ TEST(NVFuserTest, FusionParallelDimensionMap3_CUDA) { const auto& pdmap = gpulw.parallelDimensionMap(); TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx)); TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); + pdmap.get(ParallelType::TIDx)->isA() && + pdmap.get(ParallelType::TIDx)->as()->name() == "blockDim.x"); TORCH_CHECK(pdmap.isExact(ParallelType::TIDy)); TORCH_CHECK( pdmap.get(ParallelType::TIDy)->isConst() && - pdmap.get(ParallelType::TIDy)->as()->value().value() == 10); + pdmap.get(ParallelType::TIDy)->as()->value().value() == 10); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({13}, options); FusionExecutor fe; - fe.compileFusion(fusion.get()); + fe.compileFusion(fusion.get(), {input1}); auto outputs = fe.runFusion({input1}); testValidate( @@ -17496,7 +17536,7 @@ TEST(NVFuserTest, FusionParallelDimensionMap3_CUDA) { } // Parallelizing merged broadcast domains -TEST(NVFuserTest, FusionParallelDimensionMap4_CUDA) { +TEST_F(NVFuserTest, FusionParallelDimensionMap4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -17504,7 +17544,7 @@ TEST(NVFuserTest, FusionParallelDimensionMap4_CUDA) { fusion.addInput(tv0); auto tv1 = makeSymbolicTensor(2); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); auto tv3 = broadcast(tv2, {true, false}); auto tv4 = add(tv3, tv1); fusion.addOutput(tv4); @@ -17526,16 +17566,15 @@ TEST(NVFuserTest, FusionParallelDimensionMap4_CUDA) { const auto& pdmap = gpulw.parallelDimensionMap(); TORCH_CHECK(!pdmap.isExact(ParallelType::TIDx)); TORCH_CHECK( - pdmap.get(ParallelType::TIDx)->isA() && - pdmap.get(ParallelType::TIDx)->as()->name() == - "blockDim.x"); + pdmap.get(ParallelType::TIDx)->isA() && + pdmap.get(ParallelType::TIDx)->as()->name() == "blockDim.x"); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = 
at::randn({13}, options); at::Tensor input2 = at::randn({15, 13}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input1, input2}); auto outputs = fe.runFusion({input1, input2}); auto ref = (input1 + 1).unsqueeze(0) + input2; @@ -17543,7 +17582,7 @@ TEST(NVFuserTest, FusionParallelDimensionMap4_CUDA) { testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionParallelDimensionMap5_CUDA) { +TEST_F(NVFuserTest, FusionParallelDimensionMap5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -17570,18 +17609,17 @@ TEST(NVFuserTest, FusionParallelDimensionMap5_CUDA) { TORCH_CHECK(pdmap.isExact(ParallelType::TIDy)); TORCH_CHECK( pdmap.get(ParallelType::TIDx)->isConst() && - pdmap.get(ParallelType::TIDx)->as()->value().value() == 4); + pdmap.get(ParallelType::TIDx)->as()->value().value() == 4); TORCH_CHECK( - pdmap.get(ParallelType::TIDy)->isA() && - pdmap.get(ParallelType::TIDy)->as()->name() == - "blockDim.y"); + pdmap.get(ParallelType::TIDy)->isA() && + pdmap.get(ParallelType::TIDy)->as()->name() == "blockDim.y"); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor input1 = at::randn({13}, options); at::Tensor input2 = at::randn({13, 15}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {input1, input2}); auto outputs = fe.runFusion({input1, input2}); auto ref = (input1).unsqueeze(-1) + input2; @@ -17589,7 +17627,7 @@ TEST(NVFuserTest, FusionParallelDimensionMap5_CUDA) { testValidate(&fusion, outputs, {input1, input2}, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) { +TEST_F(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) { auto fusion_ptr = std::make_unique(); auto& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -17603,7 +17641,7 @@ TEST(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) { auto t13 = makeSymbolicTensor(3, DataType::Half); auto t15 = makeSymbolicTensor(3, DataType::Half); auto t17 = makeSymbolicTensor(3, DataType::Half); - auto d56 = new Double(); + auto d56 = IrBuilder::create(); fusion.addInput(t0); fusion.addInput(t1); @@ -17636,9 +17674,10 @@ TEST(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) { auto t29 = mul(t25, t23); auto t30 = sum(t29, {2}); auto t31 = broadcast(t30, {false, false, true}); - auto d59 = mul(t1->getRootDomain()[2]->extent(), new Double(1)); + auto d59 = + mul(t1->getRootDomain()[2]->extent(), IrBuilder::create(1)); auto t26 = mul(d59, t25); - auto txx = mul(t26, new Double(1)); + auto txx = mul(t26, IrBuilder::create(1)); auto t33 = sub(txx, t28); auto d70 = unaryOp(UnaryOpType::Reciprocal, d59); auto t35 = mul(d70, t6); @@ -17694,23 +17733,23 @@ TEST(NVFuserTest, FusionSegmenterCombineReductionsCycleRepro_CUDA) { } } -TEST(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { +TEST_F(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); - auto tv3 = add(tv0, new Double(1)); - auto tv4 = add(tv3, new Double(1)); + auto tv3 = add(tv0, IrBuilder::create(1)); + auto tv4 = add(tv3, IrBuilder::create(1)); fusion.addOutput(tv4); - auto tv5 = add(tv0, new Double(1)); - auto tv6 = add(tv5, new Double(1)); + auto tv5 = add(tv0, 
IrBuilder::create(1)); + auto tv6 = add(tv5, IrBuilder::create(1)); fusion.addOutput(tv6); // Case 1: local memory tensor computed serially and used by @@ -17732,13 +17771,13 @@ TEST(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { tv5->axis(-1)->parallelize(ParallelType::TIDx); tv5->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int nx = 11; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({nx}, options); std::vector aten_inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref = t0 + 2; @@ -17748,16 +17787,16 @@ TEST(NVFuserTest, FusionSerialAndParallelIndexing_CUDA) { } // Repro of issue #1105 -TEST(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { +TEST_F(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); @@ -17783,12 +17822,12 @@ TEST(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { } } - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({17}, options); std::vector aten_inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref1 = t0 + 3; @@ -17796,24 +17835,24 @@ TEST(NVFuserTest, FusionWARSyncAliasedSmem_CUDA) { testValidate(&fusion, outputs, aten_inputs, {ref1}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue1099_CUDA) { +TEST_F(NVFuserTest, FusionIssue1099_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); auto tv3 = makeSymbolicTensor(1); fusion.addInput(tv3); // Just to make TIDx/y/z non-exact - auto tv4 = add(tv3, new Double(1)); - auto tv5 = add(tv4, new Double(1)); - auto tv6 = add(tv5, new Double(1)); + auto tv4 = add(tv3, IrBuilder::create(1)); + auto tv5 = add(tv4, IrBuilder::create(1)); + auto tv6 = add(tv5, IrBuilder::create(1)); fusion.addOutput(tv6); tv2->split(0, 4); @@ -17835,13 +17874,13 @@ TEST(NVFuserTest, FusionIssue1099_CUDA) { tv6->split(0, 7); tv6->axis(-1)->parallelize(ParallelType::TIDz); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({17}, options); at::Tensor t3 = at::randn({19}, options); std::vector aten_inputs = {t0, t3}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref_t2 = t0 + 2; @@ -17852,18 +17891,15 @@ TEST(NVFuserTest, FusionIssue1099_CUDA) { } // Repro of issue #1080 -TEST(NVFuserTest, FusionUnswitchPredicate_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionUnswitchPredicate_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, 
IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); tv2->split(0, 4); @@ -17883,14 +17919,14 @@ TEST(NVFuserTest, FusionUnswitchPredicate_CUDA) { tv1->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int nx = 4; const int ny = 10; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({nx, ny}, options); std::vector aten_inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref = t0 + 2; @@ -17898,7 +17934,7 @@ TEST(NVFuserTest, FusionUnswitchPredicate_CUDA) { testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue1189_CUDA) { +TEST_F(NVFuserTest, FusionIssue1189_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -17926,12 +17962,12 @@ TEST(NVFuserTest, FusionIssue1189_CUDA) { parallelize(tv2); parallelize(tv3); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({16, 16, 1}, options); at::Tensor t1 = at::randn({16, 16, 1}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}); auto outputs = fe.runFusion({t0, t1}); auto ref = (t0 + t1).sum({1}); @@ -17939,7 +17975,7 @@ TEST(NVFuserTest, FusionIssue1189_CUDA) { testValidate(&fusion, outputs, {t0, t1}, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue1052_CUDA) { +TEST_F(NVFuserTest, FusionIssue1052_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -17948,10 +17984,10 @@ TEST(NVFuserTest, FusionIssue1052_CUDA) { auto tv1 = makeSymbolicTensor(1); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); fusion.addOutput(tv2); - auto tv3 = add(tv1, new Double(1)); + auto tv3 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv3); tv2->axis(-1)->parallelize(ParallelType::TIDx); @@ -17960,13 +17996,13 @@ TEST(NVFuserTest, FusionIssue1052_CUDA) { scheduler_utils::parallelizeAllLike(tv2, {tv0}); scheduler_utils::parallelizeAllLike(tv3, {tv1}); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({10}, options); at::Tensor t1 = at::randn({100}, options); std::vector aten_inputs = {t0, t1}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref_t2 = t0 + 1; @@ -17977,7 +18013,7 @@ TEST(NVFuserTest, FusionIssue1052_CUDA) { } // Repro of issue #1115 -TEST(NVFuserTest, FusionPointwiseBroadcast_CUDA) { +TEST_F(NVFuserTest, FusionPointwiseBroadcast_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18002,7 +18038,7 @@ TEST(NVFuserTest, FusionPointwiseBroadcast_CUDA) { schedulePointwise(&fusion, aten_inputs); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto at_x_add_bias = at_x + at_bias; @@ -18012,23 +18048,23 @@ TEST(NVFuserTest, FusionPointwiseBroadcast_CUDA) { testValidate(&fusion, outputs, aten_inputs, {aten_y}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionSmemAliasSerial_CUDA) { +TEST_F(NVFuserTest, FusionSmemAliasSerial_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); 
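
Throughout these hunks, scalar IR nodes switch from direct new Double(...) / new Int(...) allocation to the IrBuilder::create factory. In this copy of the patch the angle-bracketed template arguments appear to have been stripped by text extraction, so the added lines presumably read IrBuilder::create<Double>(...) and IrBuilder::create<Int>(...) in the actual source. A minimal sketch of the pattern with the assumed template parameters restored:

// Hedged sketch only: the <Double> / <Int> template arguments are inferred
// from the removed "new Double(...)" / "new Int(...)" lines and are not
// visible in this rendering of the patch.
using namespace torch::jit::fuser::cuda;

static void irBuilderSketch(Fusion* fusion) {
  FusionGuard fg(fusion);
  auto tv0 = makeSymbolicTensor(1);
  fusion->addInput(tv0);
  // Old style (removed lines):  auto tv1 = add(tv0, new Double(1));
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  // Old style (removed lines):  auto tv2 = binaryOp(BinaryOpType::Pow, tv1, new Int(2));
  auto tv2 = binaryOp(BinaryOpType::Pow, tv1, IrBuilder::create<Int>(2));
  fusion->addOutput(tv2);
}
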
+ auto tv2 = add(tv1, IrBuilder::create(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); // Just set the dimension of TIDx auto tv4 = makeSymbolicTensor(1); fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); + auto tv5 = add(tv4, IrBuilder::create(1)); fusion.addOutput(tv5); tv1->setMemoryType(MemoryType::Shared); @@ -18040,14 +18076,13 @@ TEST(NVFuserTest, FusionSmemAliasSerial_CUDA) { // TIDx. They should be predicated as they are redundant and can // interfere with smem aliasing (issue #1100). - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({10}, options); - at::Tensor t4 = at::randn({1024}, options); std::vector aten_inputs = {t0, t4}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref1 = t0 + 3; @@ -18056,14 +18091,14 @@ TEST(NVFuserTest, FusionSmemAliasSerial_CUDA) { testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { +TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); fusion.addOutput(tv1); auto tv2 = makeSymbolicTensor(1); @@ -18074,13 +18109,13 @@ TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { tv1->axis(0)->parallelize(ParallelType::TIDx); tv3->axis(0)->parallelize(ParallelType::BIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({17}, options); at::Tensor t2 = at::randn({19}, options); std::vector aten_inputs = {t0, t2}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref1 = t0 + 1; @@ -18089,14 +18124,14 @@ TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions_CUDA) { testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { +TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); fusion.addOutput(tv1); auto tv2 = makeSymbolicTensor(1); @@ -18107,13 +18142,13 @@ TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { tv1->axis(0)->parallelize(ParallelType::TIDx); tv3->axis(0)->parallelize(ParallelType::BIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({17}, options); at::Tensor t2 = at::randn({19}, options); std::vector aten_inputs = {t0, t2}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref1 = t0 + 1; @@ -18122,7 +18157,7 @@ TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions_CUDA) { testValidate(&fusion, outputs, aten_inputs, {ref1, ref2}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { +TEST_F(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ 
-18134,12 +18169,12 @@ TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { auto tv2 = makeSymbolicTensor(3); fusion.addInput(tv2); - auto tv3 = add(tv2, new Double(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); auto tv4 = makeSymbolicTensor(3); fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); + auto tv5 = add(tv4, IrBuilder::create(1)); fusion.addOutput(tv5); tv1->axis(0)->parallelize(ParallelType::BIDx); @@ -18175,7 +18210,7 @@ TEST(NVFuserTest, FusionGridReductionWithNonExactParallelDimensions2_CUDA) { #endif } -TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { +TEST_F(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18187,12 +18222,12 @@ TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { auto tv2 = makeSymbolicTensor(3); fusion.addInput(tv2); - auto tv3 = add(tv2, new Double(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); auto tv4 = makeSymbolicTensor(3); fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); + auto tv5 = add(tv4, IrBuilder::create(1)); fusion.addOutput(tv5); tvs.avg->axis(0)->parallelize(ParallelType::BIDx); @@ -18229,7 +18264,7 @@ TEST(NVFuserTest, FusionGridWelfordWithNonExactParallelDimensions2_CUDA) { } // Repro of issue #1102 -TEST(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { +TEST_F(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18237,18 +18272,18 @@ TEST(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { fusion.addInput(tv0); // Just to make TIDx/y/z non-exact - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); auto tv4 = makeSymbolicTensor(1); fusion.addInput(tv4); - auto tv5 = add(tv4, new Double(1)); - auto tv6 = add(tv5, new Double(1)); - auto tv7 = add(tv6, new Double(1)); - auto tv8 = add(tv7, new Double(1)); + auto tv5 = add(tv4, IrBuilder::create(1)); + auto tv6 = add(tv5, IrBuilder::create(1)); + auto tv7 = add(tv6, IrBuilder::create(1)); + auto tv8 = add(tv7, IrBuilder::create(1)); auto tv9 = sum(tv8, {0}); fusion.addOutput(tv9); @@ -18274,13 +18309,13 @@ TEST(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { tv5->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({17}, options); at::Tensor t4 = at::randn({19}, options); std::vector aten_inputs = {t0, t4}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref1 = t0 + 3; @@ -18290,8 +18325,9 @@ TEST(NVFuserTest, FusionPredicateParallelizedDomains_CUDA) { } // Repro of #1102 and #1129 -TEST(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { +TEST_F(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { + if (!deviceMajorMinorCheck(7)) { + GTEST_SKIP() << "skipping tests on pre-Volta GPUs"; return; } Fusion fusion; @@ -18302,16 +18338,16 @@ TEST(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { auto tv1 = makeSymbolicTensor(1); fusion.addInput(tv1); - auto tv2 = add(tv0, new Double(1)); - auto tv3 = add(tv2, new Double(1)); - auto tv4 = add(tv3, new Double(1)); - auto tv5 = 
add(tv4, new Double(1)); + auto tv2 = add(tv0, IrBuilder::create(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); + auto tv4 = add(tv3, IrBuilder::create(1)); + auto tv5 = add(tv4, IrBuilder::create(1)); fusion.addOutput(tv5); // Just to make TIDx/y/z non-exact - auto tvx = add(tv1, new Double(1)); - auto tvy = add(tvx, new Double(1)); - auto tvz = add(tvy, new Double(1)); + auto tvx = add(tv1, IrBuilder::create(1)); + auto tvy = add(tvx, IrBuilder::create(1)); + auto tvz = add(tvy, IrBuilder::create(1)); fusion.addOutput(tvz); tv5->split(0, 4); @@ -18335,13 +18371,13 @@ TEST(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { tv->setMemoryType(MemoryType::Shared); } - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({17}, options); at::Tensor t1 = at::randn({19}, options); std::vector aten_inputs = {t0, t1}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref1 = t0 + 4; @@ -18351,21 +18387,24 @@ TEST(NVFuserTest, FusionSmemPredicateUnswitch_CUDA) { } // Repro of issue #1136 -TEST(NVFuserTest, FusionFloatPow_CUDA) { +TEST_F(NVFuserTest, FusionFloatPow_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = binaryOp(BinaryOpType::Pow, tv0, new Int(4)); + auto tv1 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create(4)); // To check if pow(tv0, 2) is replaced with tv0 * tv0 - auto tv2 = binaryOp(BinaryOpType::Pow, tv0, new Int(2)); + auto tv2 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create(2)); // To check if pow(tv0, 2.0) is replaced with tv0 * tv0 - auto tv3 = binaryOp(BinaryOpType::Pow, tv0, new Double(2)); - auto tv4 = binaryOp(BinaryOpType::Pow, tv0, new Int(3)); - auto tv5 = binaryOp(BinaryOpType::Pow, tv0, new Double(3)); - auto s = binaryOp(BinaryOpType::Pow, new Double(3), new Double(3)); + auto tv3 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create(2)); + auto tv4 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create(3)); + auto tv5 = binaryOp(BinaryOpType::Pow, tv0, IrBuilder::create(3)); + auto s = binaryOp( + BinaryOpType::Pow, + IrBuilder::create(3), + IrBuilder::create(3)); auto tv6 = add(tv0, s); fusion.addOutput(tv1); @@ -18382,14 +18421,14 @@ TEST(NVFuserTest, FusionFloatPow_CUDA) { TransformPropagator::from(tv1); scheduler_utils::parallelizeAllLike(tv1, {tv2, tv3, tv4, tv5, tv6}); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({1000}, options); // Negative inputs cause nan in Fuesr as use_fast_math is enabled t0 = abs(t0); std::vector aten_inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto p4 = at::pow(t0, 4); @@ -18406,7 +18445,7 @@ TEST(NVFuserTest, FusionFloatPow_CUDA) { __FILE__); } -TEST(NVFuserTest, FusionIssue1127_CUDA) { +TEST_F(NVFuserTest, FusionIssue1127_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18435,7 +18474,7 @@ TEST(NVFuserTest, FusionIssue1127_CUDA) { ASSERT_ANY_THROW(fusion.printKernel()); } -TEST(NVFuserTest, FusionChannelsLastParser_CUDA) { +TEST_F(NVFuserTest, FusionChannelsLastParser_CUDA) { // This test may not pass if using a custom block sync as there may // be additional calls. Skip the test as it's not specifically // relevant with block synchronizatin. 
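
Every test in these hunks is also converted from gtest's TEST macro to TEST_F, which requires an NVFuserTest fixture class. The fixture itself is defined outside this section, so the sketch below only shows the general shape such a fixture could take and how the converted tests pick it up; the SetUp body (skipping when no CUDA device is present) is an illustrative guess, not the patch's actual code.

// Hedged sketch of the TEST -> TEST_F conversion; NVFuserTestSketch and
// FusionExampleSketch_CUDA are hypothetical names for illustration only.
#include <gtest/gtest.h>

class NVFuserTestSketch : public ::testing::Test {
 protected:
  void SetUp() override {
    // Assumed guard: skip all fixture tests when no CUDA device is available.
    if (!at::cuda::is_available()) {
      GTEST_SKIP() << "requires a CUDA-capable device";
    }
  }
};

// Converted tests then inherit the fixture's SetUp/TearDown automatically:
TEST_F(NVFuserTestSketch, FusionExampleSketch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // ... build, schedule, compile, and validate as in the hunks above ...
}
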
@@ -18486,30 +18525,30 @@ TEST(NVFuserTest, FusionChannelsLastParser_CUDA) { const std::string expected_kernel = R"( __global__ void CUDAGeneratedKernel(Tensor<__half, 4> T0, Tensor<__half, 4> T2, Tensor<__half, 4> T7) { if ((((((((((nvfuser_index_t)blockIdx.x) * 1) + 0) * 1) + 0) * 128) + ((nvfuser_index_t)threadIdx.x)) < (T0.size[0] * (T0.size[1] * (T0.size[2] * T0.size[3]))))) { - constexpr nvfuser_index_t ki674 = 0; + constexpr nvfuser_index_t i120 = 0; __half T9[1]; - constexpr nvfuser_index_t ki716 = 0; - T9[ki716] = 0; - constexpr nvfuser_index_t ki707 = 0; - T9[ki707] - = T2[((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) / (T0.size[1] * (T0.size[2] * T0.size[3]))) * (((1 * T0.size[2]) * T0.size[1]) * T0.size[3])) + ((((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) % T0.size[3]) * ((1 * T0.size[2]) * T0.size[1])) + (((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) / (T0.size[2] * T0.size[3])) * (1 * T0.size[2])) + ((((((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki707) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) / T0.size[3]) * 1)]; + constexpr nvfuser_index_t i132 = 0; + T9[i132] = 0; + constexpr nvfuser_index_t i128 = 0; + T9[i128] + = T2[((((((((((nvfuser_index_t)blockIdx.x) * 1) + i120) * 1) + i128) * 128) + ((nvfuser_index_t)threadIdx.x)) / (T0.size[1] * (T0.size[2] * T0.size[3]))) * (((1 * T0.size[2]) * T0.size[1]) * T0.size[3])) + ((((((((((((nvfuser_index_t)blockIdx.x) * 1) + i120) * 1) + i128) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) % T0.size[3]) * ((1 * T0.size[2]) * T0.size[1])) + (((((((((((nvfuser_index_t)blockIdx.x) * 1) + i120) * 1) + i128) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) / (T0.size[2] * T0.size[3])) * (1 * T0.size[2])) + ((((((((((((nvfuser_index_t)blockIdx.x) * 1) + i120) * 1) + i128) * 128) + ((nvfuser_index_t)threadIdx.x)) % (T0.size[1] * (T0.size[2] * T0.size[3]))) % (T0.size[2] * T0.size[3])) / T0.size[3]) * 1)]; __half T8[1]; - constexpr nvfuser_index_t ki722 = 0; - T8[ki722] = 0; - constexpr nvfuser_index_t ki702 = 0; - T8[ki702] - = T0[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki702) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; + constexpr nvfuser_index_t i134 = 0; + T8[i134] = 0; + constexpr nvfuser_index_t i126 = 0; + T8[i126] + = T0[(((((((((nvfuser_index_t)blockIdx.x) * 1) + i120) * 1) + i126) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)]; __half T10[1]; - constexpr nvfuser_index_t ki683 = 0; + constexpr nvfuser_index_t i124 = 0; float T3[1]; T3[0] - = __half2float(T9[ki683]); + = __half2float(T9[i124]); float T4[1]; T4[0] = T3[0]; float T1[1]; T1[0] - = __half2float(T8[ki683]); + = __half2float(T8[i124]); float T5[1]; T5[0] = T1[0] @@ -18517,11 +18556,11 @@ __global__ void CUDAGeneratedKernel(Tensor<__half, 4> T0, Tensor<__half, 4> T2, float T6[1]; T6[0] = relu(T5[0]); - T10[ki683] + T10[i124] = __float2half(T6[0]); - constexpr nvfuser_index_t ki676 = 0; - T7[(((((((((nvfuser_index_t)blockIdx.x) * 1) + ki674) * 1) + ki676) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)] - = T10[ki676]; + constexpr nvfuser_index_t i122 = 0; + 
T7[(((((((((nvfuser_index_t)blockIdx.x) * 1) + i120) * 1) + i122) * 128) + ((nvfuser_index_t)threadIdx.x)) * 1)] + = T10[i122]; } } )"; @@ -18558,7 +18597,7 @@ __global__ void CUDAGeneratedKernel(Tensor<__half, 4> T0, Tensor<__half, 4> T2, // TORCH_CHECK(output_ref.equal(outputs[0])); } -TEST(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { +TEST_F(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18566,8 +18605,8 @@ TEST(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { fusion.addInput(tv0); auto tv1 = sum(tv0, {1}); - auto tv2 = add(tv1, new Double(1)); - auto tv3 = add(tv2, new Double(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); @@ -18575,12 +18614,12 @@ TEST(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { tv2->computeAt(tv3, -1); tv3->axis(0)->parallelize(ParallelType::Unswitch); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({10, 1024}, options); std::vector aten_inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref = sum(t0, {1}) + 2; @@ -18588,24 +18627,24 @@ TEST(NVFuserTest, FusionThreadPredicateUnswitch_CUDA) { testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionNonContigOutputs_CUDA) { +TEST_F(NVFuserTest, FusionNonContigOutputs_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); fusion.addOutput(tv1); tv1->setContiguity(false); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor at_input = at::randn({10}, options); at::Tensor at_output = at::empty_strided({10}, {2}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {at_input}); auto returned_outputs = fe.runFusion({at_input}, {at_output}); // Returned outputs should only contain one tensor that is the same @@ -18619,7 +18658,7 @@ TEST(NVFuserTest, FusionNonContigOutputs_CUDA) { testValidate(&fusion, {at_output}, {at_input}, {at_ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionTestWarpSoftMax_CUDA) { +TEST_F(NVFuserTest, FusionTestWarpSoftMax_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18654,14 +18693,15 @@ TEST(NVFuserTest, FusionTestWarpSoftMax_CUDA) { // Test result FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref_output = at::_softmax(aten_input, 1, false); testValidate(&fusion, outputs, aten_inputs, {ref_output}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue1133_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { +TEST_F(NVFuserTest, FusionIssue1133_CUDA) { + if (!deviceMajorMinorCheck(7)) { + GTEST_SKIP() << "skipping tests on pre-Volta GPUs"; return; } Fusion fusion; @@ -18670,9 +18710,9 @@ TEST(NVFuserTest, FusionIssue1133_CUDA) { auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); - auto tv3 = add(tv2, new Double(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); @@ -18702,20 +18742,20 @@ TEST(NVFuserTest, FusionIssue1133_CUDA) { // There should be no allocation other than those for tv1 and 
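
Another pattern repeated across these hunks: the FusionExecutor is now constructed after the ATen inputs exist, and compileFusion receives those inputs alongside the fusion before runFusion is called. The sketch below restates that call sequence; the rationale (letting compilation see concrete shapes and dtypes) is an inference, and the <Double> / <c10::IValue> template arguments are assumptions where the patch's angle brackets were stripped.

// Hedged sketch of the updated compile-then-run sequence; the fusion body and
// tensor sizes are placeholders.
using namespace torch::jit::fuser::cuda;

static void compileWithInputsSketch() {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));  // <Double> assumed
  fusion.addOutput(tv1);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({17}, options);
  std::vector<c10::IValue> aten_inputs = {t0};

  // Old order (removed lines): compileFusion(&fusion) ran before the inputs
  // were created. New order (added lines): inputs are built first and passed
  // to compileFusion, then reused for runFusion.
  FusionExecutor fe;
  fe.compileFusion(&fusion, aten_inputs);
  auto outputs = fe.runFusion(aten_inputs);
  // outputs[0] now holds t0 + 1.
}
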
tv2 TORCH_CHECK(false, "Invalid allocation detected"); } - TORCH_CHECK(size->isA(), "Invalid allocation size"); - TORCH_CHECK(size->as()->isConst(), "Allocation not constant"); - auto size_int = size->as()->value().value(); + TORCH_CHECK(size->isA(), "Invalid allocation size"); + TORCH_CHECK(size->as()->isConst(), "Allocation not constant"); + auto size_int = size->as()->value().value(); if (alloc->buffer()->name() == 1) { TORCH_CHECK( size_int == split_factor, "Invalid allocation size: ", - size->as()->value().value()); + size->as()->value().value()); tv1_validated = true; } else { TORCH_CHECK( size_int == 1, "Invalid allocation size: ", - size->as()->value().value()); + size->as()->value().value()); tv2_validated = true; } } @@ -18724,12 +18764,12 @@ TEST(NVFuserTest, FusionIssue1133_CUDA) { TORCH_CHECK(tv1_validated, "Failed to validate tv1 allocation"); TORCH_CHECK(tv2_validated, "Failed to validate tv2 allocation"); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({99, 101}, options); std::vector aten_inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref = (t0 + 1).sum({1}) + 1; @@ -18737,7 +18777,7 @@ TEST(NVFuserTest, FusionIssue1133_CUDA) { testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionRfactorContigIDs_CUDA) { +TEST_F(NVFuserTest, FusionRfactorContigIDs_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18756,12 +18796,12 @@ TEST(NVFuserTest, FusionRfactorContigIDs_CUDA) { tv2->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({99, 101}, options); std::vector aten_inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); auto outputs = fe.runFusion(aten_inputs); auto ref = t0.sum({1}); @@ -18769,7 +18809,7 @@ TEST(NVFuserTest, FusionRfactorContigIDs_CUDA) { testValidate(&fusion, outputs, aten_inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionPersistentBufferCalculation1_CUDA) { +TEST_F(NVFuserTest, FusionPersistentBufferCalculation1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18831,7 +18871,7 @@ TEST(NVFuserTest, FusionPersistentBufferCalculation1_CUDA) { aten_t0.size(1) * dataTypeSize(DataType::Float)); } -TEST(NVFuserTest, FusionPersistentBufferCalculation2_CUDA) { +TEST_F(NVFuserTest, FusionPersistentBufferCalculation2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18894,7 +18934,7 @@ TEST(NVFuserTest, FusionPersistentBufferCalculation2_CUDA) { aten_t0.size(1) * dataTypeSize(DataType::Half)); } -TEST(NVFuserTest, FusionPersistentBufferCalculation3_CUDA) { +TEST_F(NVFuserTest, FusionPersistentBufferCalculation3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -18979,7 +19019,7 @@ TEST(NVFuserTest, FusionPersistentBufferCalculation3_CUDA) { (dataTypeSize(DataType::Half) + dataTypeSize(DataType::Float))); } -TEST(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) { +TEST_F(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -19056,10 +19096,7 @@ TEST(NVFuserTest, FusionPersistentBufferCalculation4_CUDA) { aten_t0.size(1) * dataTypeSize(DataType::Half)); } -TEST(NVFuserTest, PersistentBufferProjection_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, 
FusionPersistentBufferProjection_CUDA) { std::unique_ptr fusion_ptr = std::make_unique(); Fusion& fusion = *fusion_ptr.get(); FusionGuard fg(&fusion); @@ -19107,8 +19144,9 @@ TEST(NVFuserTest, PersistentBufferProjection_CUDA) { testValidate(&fusion, cg_outputs, {aten_t0}, {aten_t7}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionIssue1223_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 7) { +TEST_F(NVFuserTest, FusionIssue1223_CUDA) { + if (!deviceMajorMinorCheck(7)) { + GTEST_SKIP() << "skipping tests on pre-Volta GPUs"; return; } Fusion fusion; @@ -19117,11 +19155,11 @@ TEST(NVFuserTest, FusionIssue1223_CUDA) { auto tv0 = makeContigTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {0, 1}); fusion.addOutput(tv2); - auto tv3 = add(tv0, new Double(0)); + auto tv3 = add(tv0, IrBuilder::create(0)); fusion.addOutput(tv3); tv2->split(0, 4); @@ -19153,7 +19191,7 @@ TEST(NVFuserTest, FusionIssue1223_CUDA) { at::Tensor at_t0 = at::ones({11, 10}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {at_t0}); auto cg_outputs = fe.runFusion({at_t0}); auto at_t1 = (at_t0 + 1).sum(); @@ -19163,14 +19201,14 @@ TEST(NVFuserTest, FusionIssue1223_CUDA) { } // See #1247 and #1250 -TEST(NVFuserTest, FusionRfactorPredication1_CUDA) { +TEST_F(NVFuserTest, FusionRfactorPredication1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeContigTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = min(tv1, {0}); fusion.addOutput(tv2); @@ -19179,7 +19217,7 @@ TEST(NVFuserTest, FusionRfactorPredication1_CUDA) { auto tv3 = makeContigTensor(1); fusion.addInput(tv3); - auto tv4 = add(tv3, new Double(1)); + auto tv4 = add(tv3, IrBuilder::create(1)); fusion.addOutput(tv4); tv2->split(0, 4); @@ -19197,7 +19235,7 @@ TEST(NVFuserTest, FusionRfactorPredication1_CUDA) { at::Tensor at_t3 = at::randn({128}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {at_t0, at_t3}); auto cg_outputs = fe.runFusion({at_t0, at_t3}); auto at_t2 = (at_t0 + 1).min(); @@ -19207,7 +19245,7 @@ TEST(NVFuserTest, FusionRfactorPredication1_CUDA) { &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionRfactorPredication2_CUDA) { +TEST_F(NVFuserTest, FusionRfactorPredication2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -19221,7 +19259,7 @@ TEST(NVFuserTest, FusionRfactorPredication2_CUDA) { auto tv2 = makeContigTensor(1); fusion.addInput(tv2); - auto tv3 = add(tv2, new Double(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); tv1->split(0, 4); @@ -19250,7 +19288,7 @@ TEST(NVFuserTest, FusionRfactorPredication2_CUDA) { at::Tensor at_t3 = at::randn({128}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {at_t0, at_t3}); auto cg_outputs = fe.runFusion({at_t0, at_t3}); auto at_t2 = std::get<0>(at_t0.min(0)); @@ -19260,7 +19298,7 @@ TEST(NVFuserTest, FusionRfactorPredication2_CUDA) { &fusion, cg_outputs, {at_t0, at_t3}, {at_t2, at_t4}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { +TEST_F(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -19292,7 +19330,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { TORCH_CHECK( gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1, "Only tv1 should have a non-divisible predicate."); - 
for (auto tv : {tv1}) { + for (auto tv : {loweredTv(tv1, gpulw)}) { auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); TORCH_CHECK( it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), @@ -19309,7 +19347,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { at::Tensor t0 = at::randn({24}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); auto ref = t0.sum(); @@ -19318,14 +19356,14 @@ TEST(NVFuserTest, FusionNonDivisibleSplit1_CUDA) { } // Repro of issue #1074 -TEST(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { +TEST_F(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); tv2->split(0, 2); @@ -19346,7 +19384,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { TORCH_CHECK( gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 1, "Only tv2 should have a non-divisible predicate."); - for (auto tv : {tv2}) { + for (auto tv : {loweredTv(tv2, gpulw)}) { auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); TORCH_CHECK( it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), @@ -19363,7 +19401,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { at::Tensor t0 = at::randn({13, 17}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); auto ref = t0 + 2; @@ -19372,14 +19410,14 @@ TEST(NVFuserTest, FusionNonDivisibleSplit2_CUDA) { } // Similar to FusionNonDivisibleSplit1 but with unswitch -TEST(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { +TEST_F(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {0}); fusion.addOutput(tv2); @@ -19397,7 +19435,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { TORCH_CHECK( gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, "Both tv1 and tv2 should have a non-divisible predicate."); - for (auto tv : {tv1, tv2}) { + for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) { auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); TORCH_CHECK( it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), @@ -19414,7 +19452,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { at::Tensor t0 = at::randn({24}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); auto ref = (t0 + 1).sum(); @@ -19423,14 +19461,14 @@ TEST(NVFuserTest, FusionNonDivisibleSplit3_CUDA) { } // Non-divisible split through merge -TEST(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { +TEST_F(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {0, 1}); fusion.addOutput(tv2); @@ -19447,7 +19485,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { TORCH_CHECK( gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, "Both tv1 and tv2 should have a non-divisible predicate."); - for (auto tv : {tv1, tv2}) { + 
for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) { auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); TORCH_CHECK( it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), @@ -19464,7 +19502,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { at::Tensor t0 = at::randn({24, 2}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); auto ref = (t0 + 1).sum(); @@ -19473,14 +19511,14 @@ TEST(NVFuserTest, FusionNonDivisibleSplit4_CUDA) { } // Nested splits -TEST(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { +TEST_F(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {0}); fusion.addOutput(tv2); @@ -19501,7 +19539,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { TORCH_CHECK( gpulw.nonDivisibleSplitInfo().splitsToPredicate().size() == 2, "Both tv1 and tv2 should have a non-divisible predicate."); - for (auto tv : {tv1, tv2}) { + for (auto tv : {loweredTv(tv1, gpulw), loweredTv(tv2, gpulw)}) { auto it = gpulw.nonDivisibleSplitInfo().splitsToPredicate().find(tv); TORCH_CHECK( it != gpulw.nonDivisibleSplitInfo().splitsToPredicate().end(), @@ -19518,7 +19556,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { at::Tensor t0 = at::randn({24}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); auto ref = (t0 + 1).sum(); @@ -19527,7 +19565,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplit5_CUDA) { } // Vectorized non-divisible split. Must be validated at run time -TEST(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { +TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -19556,13 +19594,12 @@ TEST(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { splits_to_predicate); } - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); - auto t0 = at::randn({32}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); auto ref = t0; @@ -19576,7 +19613,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplitVectorize1_CUDA) { } // If a split is validated at run time, it's not necessary to predicate. 
-TEST(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { +TEST_F(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -19584,7 +19621,7 @@ TEST(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { fusion.addInput(tv0); auto tv1 = set(tv0); - auto tv2 = add(tv1, new Double(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); auto tv3 = sum(tv2, {0}); fusion.addOutput(tv3); @@ -19611,13 +19648,13 @@ TEST(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { splits_to_predicate); } - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); auto t0 = at::randn({1024}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); auto ref = (t0 + 1).sum(); @@ -19625,6 +19662,784 @@ TEST(NVFuserTest, FusionNonDivisibleSplitVectorize2_CUDA) { testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } +TEST_F(NVFuserTest, FusionIssue1284Repro_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + std::vector input_shape_0 = {10, 20}; + std::vector input_shape_1 = {15}; + + TensorView* in_0 = makeSymbolicTensor(input_shape_0.size()); + TensorView* in_1 = makeSymbolicTensor(input_shape_1.size()); + fusion.addInput(in_0); + fusion.addInput(in_1); + + TensorView* out_0 = add(in_0, IrBuilder::create(0.f)); + TensorView* out_1 = add(in_1, IrBuilder::create(2.f)); + + fusion.addOutput(out_0); + fusion.addOutput(out_1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_in_0 = at::randn(input_shape_0, options); + at::Tensor at_in_1 = at::randn(input_shape_1, options); + std::vector aten_inputs = {at_in_0, at_in_1}; + + FusionExecutorCache fec(std::move(fusion_ptr)); + auto outputs = fec.runFusionWithInputs(aten_inputs); + + auto t1 = at_in_1 + 2; + + auto runtime = fec.getMostRecentKernelRuntime(); + TORCH_INTERNAL_ASSERT(runtime->isSegmented()); + TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2); + + testValidate( + &fusion, outputs, {at_in_0, at_in_1}, {at_in_0, t1}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionIssue1284Repro2_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + std::vector input_shape_0 = {4, 4}; + std::vector input_shape_1 = {3, 4, 4}; + std::vector input_shape_2 = {2, 8, 4, 4}; + + TensorView* in_0 = makeSymbolicTensor(input_shape_0.size()); + TensorView* in_1 = makeSymbolicTensor(input_shape_1.size()); + TensorView* in_2 = makeSymbolicTensor(input_shape_2.size()); + + fusion.addInput(in_0); + fusion.addInput(in_1); + fusion.addInput(in_2); + + TensorView* out_0 = add(in_0, in_1); + TensorView* out_1 = add(in_0, in_2); + + fusion.addOutput(out_0); + fusion.addOutput(out_1); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor at_in_0 = at::randn(input_shape_0, options); + at::Tensor at_in_1 = at::randn(input_shape_1, options); + at::Tensor at_in_2 = at::randn(input_shape_2, options); + + std::vector aten_inputs = {at_in_0, at_in_1, at_in_2}; + + FusionExecutorCache fec(std::move(fusion_ptr)); + auto outputs = fec.runFusionWithInputs(aten_inputs); + + auto t0 = at_in_0 + at_in_1; + auto t1 = at_in_0 + at_in_2; + + auto runtime = fec.getMostRecentKernelRuntime(); + TORCH_INTERNAL_ASSERT(runtime->isSegmented()); + 
TORCH_INTERNAL_ASSERT(runtime->fusionSegments()->groups().size() == 2); + + testValidate( + &fusion, + outputs, + {at_in_0, at_in_1, at_in_2}, + {t0, t1}, + __LINE__, + __FILE__); +} + +TEST_F(NVFuserTest, FusionIssue1305Repro_CUDA) { + std::unique_ptr fusion_ptr = std::make_unique(); + Fusion& fusion = *fusion_ptr.get(); + FusionGuard fg(&fusion); + + auto t0 = makeContigTensor(1); + auto t1 = makeContigTensor(2); + + fusion.addInput(t0); + fusion.addInput(t1); + + auto t2 = broadcast(t0, {true, false}); + auto t3 = add(t1, t2); + auto t4 = add(t3, t2); + auto t5 = sum(t4, {1}); + auto t6 = broadcast(t5, {false, true}); + auto t7 = add(t3, t6); + + fusion.addOutput(t7); + + t3->computeAt(t7, -1, ComputeAtMode::MostInlined); + + TORCH_INTERNAL_ASSERT(t3->getComputeAtPosition() == 1); +} + +TEST_F(NVFuserTest, FusionDoubleBuffering1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + auto tv3 = set(tv2); + fusion.addOutput(tv3); + + tv1->setMemoryType(MemoryType::Shared); + + tv3->split(-1, 128); + tv3->split(-1, 32); + TransformPropagator::from(tv3); + + tv0->computeAt(tv3, 1); + + tv3->axis(-2)->parallelize(ParallelType::BIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + scheduler_utils::parallelizeAllLike(tv3, ir_utils::allTvs(&fusion)); + + tv1->doubleBuffer(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({1000}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto cg_outputs = fe.runFusion({t0}); + + auto ref = t0 + 1; + + testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionDoubleBuffering2_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + auto tv3 = set(tv2); + fusion.addOutput(tv3); + + tv3->split(-1, 128); + tv3->split(-1, 32); + TransformPropagator::from(tv3); + + tv0->computeAt(tv3, -1); + + tv3->axis(-2)->parallelize(ParallelType::BIDx); + tv3->axis(-1)->parallelize(ParallelType::TIDx); + scheduler_utils::parallelizeAllLike(tv3, ir_utils::allTvs(&fusion)); + + tv1->doubleBuffer(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({1000}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto cg_outputs = fe.runFusion({t0}); + + auto ref = t0 + 1; + + testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionDoubleBuffering3_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + auto tv2 = set(tv1); + auto tv3 = add(tv2, IrBuilder::create(1.0)); + fusion.addOutput(tv3); + + tv1->setMemoryType(MemoryType::Shared); + + tv3->split(-1, 128); + tv3->split(-1, 32); + TransformPropagator::from(tv3); + + tv0->computeAt(tv3, 1); + + // tv2 is invalid to double-buffer as its producer, tv1, is + // computed inside the double-buffering loop. 
+ ASSERT_ANY_THROW(tv2->doubleBuffer()); + + // Moving tv2 inner makes tv1 large enough to double-buffer tv2 + tv2->computeAt(tv3, 2); + + tv2->doubleBuffer(); + + tv3->axis(-1)->parallelize(ParallelType::TIDx); + scheduler_utils::parallelizeAllLike(tv3, ir_utils::allTvs(&fusion)); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({1000}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto cg_outputs = fe.runFusion({t0}); + + auto ref = t0 + 2; + + testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); +} + +// Double buffering smem to local and unswitch +TEST_F(NVFuserTest, FusionDoubleBuffering4_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + auto tv2 = set(tv1); + auto tv3 = add(tv2, IrBuilder::create(1.0)); + fusion.addOutput(tv3); + + tv1->setMemoryType(MemoryType::Shared); + + tv3->split(-1, 128); + tv3->split(-1, 32); + tv3->split(-1, 8); + TransformPropagator::from(tv3); + + tv0->computeAt(tv3, 2); + tv2->computeAt(tv3, -1); + + tv3->axis(-1)->parallelize(ParallelType::TIDx); + tv3->axis(1)->parallelize(ParallelType::Unswitch); + scheduler_utils::parallelizeAllLike(tv3, ir_utils::allTvs(&fusion)); + + tv2->doubleBuffer(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({1000}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto cg_outputs = fe.runFusion({t0}); + + auto ref = t0 + 2; + + testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); +} + +// Double buffering gmem to shared and unswitch +TEST_F(NVFuserTest, FusionDoubleBuffering5_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + tv1->setMemoryType(MemoryType::Shared); + + tv2->split(-1, 128); + tv2->split(-1, 32); + tv2->split(-1, 8); + TransformPropagator::from(tv2); + + tv0->computeAt(tv2, 2); + tv1->computeAt(tv2, -1); + + tv2->axis(-1)->parallelize(ParallelType::TIDx); + tv2->axis(1)->parallelize(ParallelType::Unswitch); + scheduler_utils::parallelizeAllLike(tv2, ir_utils::allTvs(&fusion)); + + tv1->doubleBuffer(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({1000}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto cg_outputs = fe.runFusion({t0}); + + auto ref = t0 + 1; + + testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); +} + +// Double buffering smem to local and unroll +TEST_F(NVFuserTest, FusionDoubleBuffering6_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1.0)); + auto tv2 = set(tv1); + auto tv3 = add(tv2, IrBuilder::create(1.0)); + fusion.addOutput(tv3); + + tv1->setMemoryType(MemoryType::Shared); + + tv3->split(-1, 128); + tv3->split(-1, 16); + tv3->split(-2, 4); + tv3->split(-2, 2); + TransformPropagator::from(tv3); + + tv0->computeAt(tv3, 1); + tv2->computeAt(tv3, -1); + + tv3->axis(2)->parallelize(ParallelType::Unroll); + tv3->axis(4)->parallelize(ParallelType::TIDx); + + tv2->doubleBuffer(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + 
auto t0 = at::randn({199}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto cg_outputs = fe.runFusion({t0}); + + auto ref = t0 + 2; + + testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); +} + +// Double buffering and vectorize +TEST_F(NVFuserTest, FusionDoubleBuffering7_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + auto tv2 = add(tv1, IrBuilder::create(1.0)); + fusion.addOutput(tv2); + + tv2->split(-1, 128); + tv2->split(-1, 4); + TransformPropagator::from(tv2); + + tv1->computeAt(tv2, 2); + + tv2->axis(-2)->parallelize(ParallelType::TIDx); + + tv1->axis(-1)->parallelize(ParallelType::Vectorize); + + tv1->doubleBuffer(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({200}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto cg_outputs = fe.runFusion({t0}); + + auto ref = t0 + 1; + + testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); +} + +// Multiple tensors to double-buffer +TEST_F(NVFuserTest, FusionDoubleBuffering8_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + auto tv1 = makeContigTensor(1); + fusion.addInput(tv1); + + auto tv2 = set(tv0); + auto tv3 = set(tv1); + auto tv4 = add(tv2, tv3); + fusion.addOutput(tv4); + + tv4->split(0, 32); + tv4->split(0, 4); + TransformPropagator::from(tv4); + + tv0->computeAt(tv4, 1); + tv1->computeAt(tv4, 1); + + tv4->axis(-1)->parallelize(ParallelType::TIDx); + scheduler_utils::parallelizeAllLike(tv4, ir_utils::allTvs(&fusion)); + + tv2->doubleBuffer(); + tv3->doubleBuffer(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({100}, options); + auto t1 = at::randn({100}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}); + auto cg_outputs = fe.runFusion({t0, t1}); + + auto ref = t0 + t1; + + testValidate(&fusion, cg_outputs, {t0, t1}, {ref}, __LINE__, __FILE__); +} + +// Nested double buffering from gmem to smem and smem to register +TEST_F(NVFuserTest, FusionDoubleBuffering9_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto out = tv1; + fusion.addOutput(out); + + auto tv2 = tv0->cache_after(); + auto tv3 = tv2->cache_after(); + + out->split(0, 32); + out->split(0, 4); + TransformPropagator::from(out); + + tv2->setMemoryType(MemoryType::Shared); + + tv2->computeAt(out, 1); + tv3->computeAt(out, -1); + + out->axis(-1)->parallelize(ParallelType::TIDx); + scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion)); + + tv2->doubleBuffer(); + tv3->doubleBuffer(); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({1001}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto cg_outputs = fe.runFusion({t0}); + + auto ref = t0 + 1; + + testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); +} + +// FusionSmemBlockGemmCache + double buffering at both smem and local +TEST_F(NVFuserTest, FusionSmemBlockGemmCacheDoubleBuffer_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Algorithm + TensorView* tv0 = makeSymbolicTensor(2); // (M, K) + TensorView* tv1 = makeSymbolicTensor(2); // (K, N) + TensorView* tv2 = broadcast(tv0, 
{false, false, true}); // (M, K, B) + TensorView* tv3 = broadcast(tv1, {true, false, false}); // (B, K, N) + TensorView* tv4 = mul(tv2, tv3); // M, K, N + TensorView* tv5 = sum(tv4, {1}); // M, R, N + fusion.addInput(tv0); + fusion.addInput(tv1); + fusion.addOutput(tv5); + + TensorView* tv6 = tv5->cache_before(); + + // For smem double buffering + auto tv0_cache_local = tv0->cache_after(); + auto tv1_cache_local = tv1->cache_after(); + + // For register double buffering + auto tv0_cache_smem = tv0->cache_after(); + auto tv1_cache_smem = tv1->cache_after(); + + const int BSX = 32; + const int TSX = 8; + + // [M, K, N] + tv6->split(-1, BSX); + tv6->split(-1, TSX); + tv6->split(1, BSX); + tv6->split(0, BSX); + tv6->split(1, TSX); + // [M/BSX, BSX/TSX, TSX, K/BSX, BSX, N/BSX, BSX/TSX, TSX] + tv6->reorder( + {{4, 7}, {7, 6}, {6, 5}, {2, 4}, {1, 3}, {3, 2}, {5, 1}, {0, 0}}); + // [M/BSX, N/BSX, K/BSX, BSX/TSX, BSX/TSX, TSX, TSX, BSX] + + auto tv6_rf = tv6->rFactor({-1}); + + TransformPropagator::from(tv6_rf); + + tv0->computeAt(tv6, 3); + tv1->computeAt(tv6, 3); + + tv6_rf->computeAt(tv6, -1); + tv0_cache_local->computeAt(tv6_rf, -1); + tv1_cache_local->computeAt(tv6_rf, -1); + + tv0_cache_smem->setMemoryType(MemoryType::Shared); + tv1_cache_smem->setMemoryType(MemoryType::Shared); + + tv5->axis(0)->parallelize(ParallelType::BIDx); + tv5->axis(1)->parallelize(ParallelType::BIDy); + tv5->axis(-3)->parallelize(ParallelType::TIDy); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + + scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion)); + + tv0_cache_local->doubleBuffer(); + tv1_cache_local->doubleBuffer(); + + tv0_cache_smem->doubleBuffer(); + tv1_cache_smem->doubleBuffer(); + + constexpr int M = 154, K = 45, N = 1524; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({M, K}, options); + at::Tensor t1 = at::randn({K, N}, options); + at::Tensor aten_output = matmul(t0.to(at::kDouble), t1.to(at::kDouble)); + + std::vector aten_inputs = {t0, t1}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); + auto cg_outputs = fe.runFusion(aten_inputs); + + testValidate( + &fusion, cg_outputs, aten_inputs, {aten_output}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionIntermediateTensorVectorize_CUDA) { + auto mem_types = {MemoryType::Shared, MemoryType::Local}; + + for (auto mem_type : mem_types) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + fusion.addInput(tv0); + + auto tv1 = set(tv0); + auto tv2 = set(tv1); + auto tv3 = set(tv2); + fusion.addOutput(tv3); + + tv1->setMemoryType(mem_type); + + tv3->split(-1, 4); + TransformPropagator::from(tv3); + + tv1->computeAt(tv3, -2); + + tv2->axis(-1)->parallelize(ParallelType::Vectorize); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({15}, options); + FusionExecutor fe; + fe.compileFusion(&fusion); + + // This should throw an exception as the extent of t0 is not + // divisible by the vector width + ASSERT_ANY_THROW(fe.runFusion({t0})); + + auto t1 = at::randn({16}, options); + auto cg_outputs = fe.runFusion({t1}); + + auto ref = t1; + + testValidate(&fusion, cg_outputs, {t1}, {ref}, __LINE__, __FILE__); + } +} + +TEST_F(NVFuserTest, FusionBroadcastConcretization1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeConcreteTensor({10, 1}); + fusion.addInput(tv0); + auto tv1 = makeConcreteTensor({10, 20}); + fusion.addInput(tv1); + auto tv2 = 
makeConcreteTensor({10, 10}); + fusion.addInput(tv2); + + // Not concretized + auto tv3 = sum(tv2, {1}); + auto tv4 = broadcast(tv3, {false, true}); + auto tv5 = add(tv0, tv4); + fusion.addOutput(tv5); + + // Concretized + auto tv6 = sum(tv2, {1}); + auto tv7 = broadcast(tv6, {false, true}); + auto tv8 = add(tv1, tv7); + fusion.addOutput(tv8); + + for (auto tv : {tv3, tv4, tv5, tv6, tv7, tv8}) { + tv->axis(1)->parallelize(ParallelType::TIDx); + } + + GpuLower gpulw(&fusion); + TORCH_CHECK(!gpulw.concretizedBroadcastDomains().isConcretized( + loweredTv(tv4, gpulw)->axis(1))); + TORCH_CHECK(gpulw.concretizedBroadcastDomains().isConcretized( + loweredTv(tv7, gpulw)->axis(1))); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({10, 1}, options); + auto t1 = at::randn({10, 20}, options); + auto t2 = at::randn({10, 10}, options); + std::vector aten_inputs = {t0, t1, t2}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); + auto outputs = fe.runFusion(aten_inputs); + + auto t5 = t0 + t2.sum({1}).unsqueeze(-1); + auto t8 = t1 + t2.sum({1}).unsqueeze(-1); + + testValidate(&fusion, outputs, aten_inputs, {t5, t8}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionBroadcastConcretization2_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + auto tv1 = sum(tv0, {0, 1}); + auto tv2 = broadcast(tv1, {true}); + auto tv3 = broadcast(tv2, {false, true}); + fusion.addOutput(tv3); + + // tv1 is thread-predicated with TIDx and TIDy + tv1->axis(0)->parallelize(ParallelType::TIDx); + tv1->axis(1)->parallelize(ParallelType::TIDy); + // tv2 broadcasts along TIDx + tv2->axis(0)->parallelize(ParallelType::TIDx); + // tv3 broadcasts along TIDy + tv3->axis(0)->parallelize(ParallelType::TIDx); + tv3->axis(1)->parallelize(ParallelType::TIDy); + + // Both tv2 and tv3 broadcast along predicated TID dimensions, but + // since the broadcast domains are not concretized, there should be + // no actual parallel broadcast + + GpuLower gpulw(&fusion); + TORCH_CHECK( + !gpulw.kernel()->summary().has_block_broadcasts && + !gpulw.kernel()->summary().has_grid_broadcasts, + "There must be no parallel broadcast in this fusion"); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn({10, 11}, options); + std::vector aten_inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); + auto outputs = fe.runFusion(aten_inputs); + + auto t3 = t0.sum().unsqueeze(-1).unsqueeze(-1); + + testValidate(&fusion, outputs, aten_inputs, {t3}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionBroadcastConcretization3_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + std::vector input_shape({10, 4, 8}); + std::vector output_shape({8, 4, 1}); + + auto tv0 = makeConcreteTensor(input_shape); + fusion.addInput(tv0); + + auto tv2 = sum(tv0, {0}); + auto tv3 = set(tv2); + auto tv4 = + view(tv3, {input_shape.begin() + 1, input_shape.end()}, output_shape); + auto tv5 = add(tv4, IrBuilder::create(1)); + fusion.addOutput(tv5); + + tv2->axis(0)->parallelize(ParallelType::TIDx); + tv4->axis(-1)->parallelize(ParallelType::TIDx); + tv5->axis(-1)->parallelize(ParallelType::TIDx); + + // The view op adds a broadcast domain in tv4, which is + // parallelized. Howver, it is never materialized, so there should + // be no parallel broadcast. 
+ + GpuLower gpulw(&fusion); + TORCH_CHECK( + !gpulw.kernel()->summary().has_block_broadcasts && + !gpulw.kernel()->summary().has_grid_broadcasts, + "There must be no parallel broadcast in this fusion"); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + auto t0 = at::randn(input_shape, options); + std::vector aten_inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, aten_inputs); + auto outputs = fe.runFusion(aten_inputs); + + auto t5 = at::native::view(t0.sum(0), output_shape) + 1; + + testValidate(&fusion, outputs, aten_inputs, {t5}, __LINE__, __FILE__); +} + +// Merging non-broadcast and broadcast domains +// TODO: Fix use case see issue https://github.com/csarofeen/pytorch/issues/1418 +// validateParallelize does not pass. Even if it's skipped, +// generated code is invalid as blockBroadcast is not used. +#if 0 +TEST_F(NVFuserTest, FusionBroadcastConcretization4_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + auto tv1 = sum(tv0, {1}); + auto tv2 = broadcast(tv1, {false, true}); + auto tv3 = add(tv2, tv0); + fusion.addOutput(tv3); + + tv1->axis(1)->parallelize(ParallelType::TIDx); + + tv2->merge(0, 1); + tv2->axis(0)->parallelize(ParallelType::TIDx); + // TODO: When set to shared memory, this kernel should be correct, but fails + // validation and when skipped produces incorrect code + tv2->setMemoryType(MemoryType::Shared); + + tv3->merge(0, 1); + tv3->axis(0)->parallelize(ParallelType::TIDx); + + fusion.printMath(); + fusion.printKernel(); +} +#endif + } // namespace jit } // namespace torch #endif // #if defined(USE_CUDA) diff --git a/test/cpp/jit/test_gpu_shift.cpp b/test/cpp/jit/test_gpu_shift.cpp index 71fa156c2d2..2665f16563b 100644 --- a/test/cpp/jit/test_gpu_shift.cpp +++ b/test/cpp/jit/test_gpu_shift.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -18,8 +19,6 @@ #include #include #include -#include -#include #include #include #include @@ -82,33 +81,38 @@ void checkIntValue( void checkIntValue( kir::ExpressionEvaluator& evaluator, - const kir::Val* val, - kir::Int::ScalarType expected_value) { + const Val* val, + Int::ScalarType expected_value) { const auto actual_value = evaluator.evaluate(val); TORCH_CHECK(actual_value.has_value()); TORCH_CHECK(actual_value.value() == expected_value); } +// Used to signify invalid ranges, i.e., values at offset 0 to +// start_offset, and values at offset stop_offset to the end of the +// domain. 
+static constexpr int invalid_marker = 1; + // ATen version of tensor shifting auto shift( at::Tensor tensor, const std::vector& offsets, - std::vector strides = {}) { + std::vector padding = {}) { TORCH_INTERNAL_ASSERT(tensor.ndimension() == offsets.size()); - if (strides.empty()) { - strides = std::vector(tensor.ndimension(), 1); + if (padding.empty()) { + padding = offsets; + for (auto& p : padding) { + p = std::abs(p); + } } at::Tensor t = tensor; - std::vector stride_indices; for (size_t i = 0; i < offsets.size(); ++i) { - auto stride = strides[i]; - stride_indices.push_back( - at::indexing::Slice(0, at::indexing::None, stride)); - const auto offset = offsets[i]; + auto offset = offsets[i]; + t = t.roll(offsets[i], i); if (offset == 0) { continue; } - t = t.roll(offsets[i], i); + // Zero padding std::vector indices( tensor.ndimension(), at::indexing::Slice(0, at::indexing::None)); if (offset > 0) { @@ -117,8 +121,20 @@ auto shift( indices[i] = at::indexing::Slice(offset, at::indexing::None); } t.index(indices) = 0; + // Fill the outside range by the special marker value. + const auto pad = padding[i]; + if (offset > 0) { + indices[i] = at::indexing::Slice(0, offset - pad); + } else { + offset += pad; + TORCH_INTERNAL_ASSERT(offset <= 0); + if (offset == 0) { + continue; + } + indices[i] = at::indexing::Slice(offset, at::indexing::None); + } + t.index(indices) = invalid_marker; } - t = t.index(stride_indices); return t; } @@ -153,13 +169,28 @@ auto gather( TORCH_CHECK(w_size != 0); const auto& pad = pad_width[i]; TORCH_CHECK(pad.size() == 2); + const auto out_extent_adj = -w_size + 1 + pad[0] + pad[1]; + TORCH_INTERNAL_ASSERT(out_extent_adj <= 0); + const auto stride = strides[i]; + TORCH_CHECK(stride >= 1); + at::Tensor concat_tensor; + for (int w = 0; w < w_size; ++w) { std::vector shift_offsets(t.ndimension(), 0); shift_offsets[i] = pad[0] - w; - std::vector shift_strides(t.ndimension(), 1); - shift_strides[i] = strides[i]; - auto shifted = shift(t, shift_offsets, shift_strides); + auto shifted = shift(t, shift_offsets); + // Apply stride + if (stride != 1) { + std::vector indices( + shifted.ndimension(), at::indexing::Slice(0, at::indexing::None)); + if (out_extent_adj == 0) { + indices[i] = at::indexing::Slice(0, at::indexing::None, strides[i]); + } else { + indices[i] = at::indexing::Slice(0, out_extent_adj, strides[i]); + } + shifted = shifted.index(indices); + } shifted = shifted.unsqueeze(-1); if (w == 0) { concat_tensor = shifted; @@ -169,13 +200,32 @@ auto gather( } t = concat_tensor; } + + // Fill invalid regions with the marker. Note that when non-unit + // stride is used, it trims invalid regions, so no marking is + // necessary. 
+ for (size_t i = 0; i < window_shape.size(); ++i) { + if (strides[i] != 1) { + continue; + } + + const auto out_extent_adj = + -window_shape[i] + 1 + pad_width[i][0] + pad_width[i][1]; + if (out_extent_adj < 0) { + std::vector indices( + t.ndimension(), at::indexing::Slice(0, at::indexing::None)); + indices[i] = at::indexing::Slice(out_extent_adj, at::indexing::None); + t.index(indices) = invalid_marker; + } + } + return t; } } // namespace // Shift an input tensor -TEST(NVFuserTest, FusionShift1_CUDA) { +TEST_F(NVFuserTest, FusionShift1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -202,7 +252,7 @@ TEST(NVFuserTest, FusionShift1_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = shift(t0, {-1, 0}); @@ -219,19 +269,19 @@ TEST(NVFuserTest, FusionShift1_CUDA) { } // Shifts an intermediate tensor -TEST(NVFuserTest, FusionShift2_CUDA) { +TEST_F(NVFuserTest, FusionShift2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {-1, 0}); fusion.addOutput(tv2); // make it a little more complex - auto tv3 = add(tv0, new Double(3)); - auto tv4 = add(tv3, new Double(4)); + auto tv3 = add(tv0, IrBuilder::create(3)); + auto tv4 = add(tv3, IrBuilder::create(4)); auto tv5 = shift(tv4, {-1, 0}); auto tv6 = shift(tv4, {0, -1}); auto tv7 = shift(tv4, {1, 0}); @@ -250,21 +300,22 @@ TEST(NVFuserTest, FusionShift2_CUDA) { // t4 allocation: (t3.size[0] + 2) * (t3.size[1] + 1) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1 || tensor_name == 3 || tensor_name == 4) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { if (tensor_name == 1 && i == 1) { - TORCH_CHECK(alloc->shape().at(i)->isA()); + TORCH_CHECK(alloc->shape().at(i)->isA()); continue; } auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - TORCH_CHECK(def != nullptr && def->operation() == BinaryOpType::Add); - TORCH_CHECK(def->as()->lhs()->isA()); - auto rhs = dynamic_cast(def->as()->rhs()); + dynamic_cast(alloc->shape().at(i)->definition()); + TORCH_CHECK( + def != nullptr && def->getBinaryOpType() == BinaryOpType::Add); + TORCH_CHECK(def->as()->lhs()->isA()); + auto rhs = dynamic_cast(def->as()->rhs()); TORCH_CHECK(rhs != nullptr && rhs->isConst()); int rhs_value = *rhs->value(); if (tensor_name == 1) { @@ -290,7 +341,7 @@ TEST(NVFuserTest, FusionShift2_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -309,14 +360,14 @@ TEST(NVFuserTest, FusionShift2_CUDA) { testValidate(&fusion, outputs, inputs, {t2, t11}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftRightOfCA_CUDA) { +TEST_F(NVFuserTest, FusionShiftRightOfCA_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {0, 1}); fusion.addOutput(tv2); @@ -324,15 +375,15 @@ TEST(NVFuserTest, FusionShiftRightOfCA_CUDA) { tv1->setMemoryType(MemoryType::Global); - 
FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 100; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -341,16 +392,16 @@ TEST(NVFuserTest, FusionShiftRightOfCA_CUDA) { TORCH_CHECK(t2.allclose(outputs[0])); } -TEST(NVFuserTest, FusionShiftLeftOfCA_CUDA) { +TEST_F(NVFuserTest, FusionShiftLeftOfCA_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); auto tv3 = shift(tv2, {-1, 0}); - auto tv4 = add(tv3, new Double(1)); + auto tv4 = add(tv3, IrBuilder::create(1)); fusion.addOutput(tv4); tv0->computeAt(tv4, -1); @@ -360,13 +411,13 @@ TEST(NVFuserTest, FusionShiftLeftOfCA_CUDA) { ASSERT_ANY_THROW(fusion.printKernel()); } -TEST(NVFuserTest, FusionShiftSplit1_CUDA) { +TEST_F(NVFuserTest, FusionShiftSplit1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {0, 1}); auto tv3 = shift(tv1, {0, -2}); fusion.addOutput(tv2); @@ -379,35 +430,29 @@ TEST(NVFuserTest, FusionShiftSplit1_CUDA) { tv0->computeAt(tv2, -2); tv0->computeAt(tv3, -2); - // t1 allocation: (4 + 3) + // t1 allocation: 7 GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1) { TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 3); + auto size = dynamic_cast(alloc->shape().at(0)); + TORCH_CHECK( + size != nullptr && size->isConst() && size->value().value() == 7); } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 9; int numel_y = 11; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -417,23 +462,23 @@ TEST(NVFuserTest, FusionShiftSplit1_CUDA) { testValidate(&fusion, outputs, inputs, {t2, t3}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftSplit2_CUDA) { +TEST_F(NVFuserTest, FusionShiftSplit2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); auto tv3 = shift(tv2, {0, -1}); auto tv4 = shift(tv2, {0, 1}); auto tv5 = add(tv3, tv4); fusion.addOutput(tv5); - auto tv6 = add(tv0, new Double(1)); + auto tv6 = add(tv0, IrBuilder::create(1)); auto tv7 = 
shift(tv6, {0, 0}); - auto tv8 = add(tv7, new Double(1)); + auto tv8 = add(tv7, IrBuilder::create(1)); fusion.addOutput(tv8); int split_factor = 4; @@ -444,26 +489,20 @@ TEST(NVFuserTest, FusionShiftSplit2_CUDA) { tv0->computeAt(tv5, -2); tv0->computeAt(tv8, -2); - // t1 and t2 allocation: (4 + 2) - // t4 allocation: (4) + // t1 and t2 allocation: 6 + // t4 allocation: 4 GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1 || tensor_name == 2) { TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 2); + auto size = dynamic_cast(alloc->shape().at(0)); + TORCH_CHECK( + size != nullptr && size->isConst() && size->value().value() == 6); } else if (tensor_name == 4) { TORCH_CHECK(alloc->shape().size() == 1); - auto size = dynamic_cast(alloc->shape().at(0)); + auto size = dynamic_cast(alloc->shape().at(0)); TORCH_CHECK(size != nullptr && size->isConst()); int size_value = *size->value(); TORCH_CHECK(size_value == split_factor); @@ -471,15 +510,15 @@ TEST(NVFuserTest, FusionShiftSplit2_CUDA) { } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 9; int numel_y = 11; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 2; @@ -494,14 +533,14 @@ TEST(NVFuserTest, FusionShiftSplit2_CUDA) { testValidate(&fusion, outputs, inputs, {t5, t8}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftDoubleSplit_CUDA) { +TEST_F(NVFuserTest, FusionShiftDoubleSplit_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(2)); auto tv3 = shift(tv2, {0, 1}); fusion.addOutput(tv3); @@ -518,35 +557,29 @@ TEST(NVFuserTest, FusionShiftDoubleSplit_CUDA) { // t2: [i1, i2/8, 8] // t3: [i1, i2/8, 8] - // t1 and t2 allocation: (split_factor1 + 1) + // t1 and t2 allocation: (split_factor1 + 1) = 9 GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1 || tensor_name == 2) { TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor1 && rhs_value == 1); + auto size = dynamic_cast(alloc->shape().at(0)); + 
TORCH_CHECK( + size != nullptr && size->isConst() && size->value().value() == 9); } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 3; @@ -555,7 +588,7 @@ TEST(NVFuserTest, FusionShiftDoubleSplit_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShift3ptStencil_CUDA) { +TEST_F(NVFuserTest, FusionShift3ptStencil_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -576,7 +609,7 @@ TEST(NVFuserTest, FusionShift3ptStencil_CUDA) { tv_out = add(tv_out, tv); } - tv_out = div(tv_out, new Double(tvs.size() + 1)); + tv_out = div(tv_out, IrBuilder::create(tvs.size() + 1)); fusion.addOutput(tv_out); @@ -598,32 +631,29 @@ TEST(NVFuserTest, FusionShift3ptStencil_CUDA) { // cache allocation: (split_factor + 2) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == cache->name()) { TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 2); + auto size = dynamic_cast(alloc->shape().at(0)); + TORCH_CHECK( + size != nullptr && size->isConst() && + size->value().value() == split_factor + 2); } } } - FusionExecutor fe; - fe.compileFusion(&fusion); + cache->doubleBuffer(); int numel_x = 99; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3; @@ -631,7 +661,7 @@ TEST(NVFuserTest, FusionShift3ptStencil_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShift5ptStencil_CUDA) { +TEST_F(NVFuserTest, FusionShift5ptStencil_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -651,7 +681,7 @@ TEST(NVFuserTest, FusionShift5ptStencil_CUDA) { tv_out = add(tv_out, tv); } - tv_out = div(tv_out, new Double(tvs.size() + 1)); + tv_out = div(tv_out, IrBuilder::create(tvs.size() + 1)); fusion.addOutput(tv_out); @@ -672,28 +702,22 @@ TEST(NVFuserTest, FusionShift5ptStencil_CUDA) { // cache allocation: (split_factor + 2) * (split_factor + 2) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == cache->name()) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - 
int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i] && rhs_value == 2); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK( + size != nullptr && size->isConst() && + size->value().value() == split_factor[i] + 2); } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); + cache->doubleBuffer(); int numel_x = 99; int numel_y = 101; @@ -701,6 +725,9 @@ TEST(NVFuserTest, FusionShift5ptStencil_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = t0; @@ -712,7 +739,7 @@ TEST(NVFuserTest, FusionShift5ptStencil_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShift9ptStencil_CUDA) { +TEST_F(NVFuserTest, FusionShift9ptStencil_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -740,7 +767,7 @@ TEST(NVFuserTest, FusionShift9ptStencil_CUDA) { tv_out = add(tv_out, tv); } - tv_out = div(tv_out, new Double(tvs.size() + 1)); + tv_out = div(tv_out, IrBuilder::create(tvs.size() + 1)); fusion.addOutput(tv_out); @@ -763,28 +790,22 @@ TEST(NVFuserTest, FusionShift9ptStencil_CUDA) { // cache allocation: (split_factor + 2) * (split_factor + 2) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == cache->name()) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i] && rhs_value == 2); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK( + size != nullptr && size->isConst() && + size->value().value() == split_factor[i] + 2); } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); + cache->doubleBuffer(); int numel_x = 99; int numel_y = 101; @@ -792,6 +813,9 @@ TEST(NVFuserTest, FusionShift9ptStencil_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = t0; @@ -803,13 +827,13 @@ TEST(NVFuserTest, FusionShift9ptStencil_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftSmemBlocking_CUDA) { +TEST_F(NVFuserTest, FusionShiftSmemBlocking_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {0, 1}); fusion.addOutput(tv2); @@ -826,35 +850,30 @@ TEST(NVFuserTest, FusionShiftSmemBlocking_CUDA) { // tv1 allocation: (split_factor + 1) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = 
dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == tv1->name()) { TORCH_CHECK(alloc->shape().size() == 1); for (int i = 0; i < 1; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == smem_block_factor && rhs_value == 1); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK( + size != nullptr && size->isConst() && + size->value().value() == smem_block_factor + 1); } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 100; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -864,7 +883,7 @@ TEST(NVFuserTest, FusionShiftSmemBlocking_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShift3ptStencilParallel_CUDA) { +TEST_F(NVFuserTest, FusionShift3ptStencilParallel_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -881,7 +900,7 @@ TEST(NVFuserTest, FusionShift3ptStencilParallel_CUDA) { tv_out = add(tv_out, tv); } - tv_out = div(tv_out, new Double(tvs.size() + 1)); + tv_out = div(tv_out, IrBuilder::create(tvs.size() + 1)); fusion.addOutput(tv_out); @@ -902,14 +921,16 @@ TEST(NVFuserTest, FusionShift3ptStencilParallel_CUDA) { tv_out->axis(-1)->parallelize(ParallelType::TIDx); tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); + tv0_cache->doubleBuffer(); int numel_x = 99; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3; @@ -917,7 +938,7 @@ TEST(NVFuserTest, FusionShift3ptStencilParallel_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShift5ptStencilParallel_CUDA) { +TEST_F(NVFuserTest, FusionShift5ptStencilParallel_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -937,7 +958,7 @@ TEST(NVFuserTest, FusionShift5ptStencilParallel_CUDA) { tv_out = add(tv_out, tv); } - tv_out = div(tv_out, new Double(tvs.size() + 1)); + tv_out = div(tv_out, IrBuilder::create(tvs.size() + 1)); fusion.addOutput(tv_out); @@ -965,15 +986,15 @@ TEST(NVFuserTest, FusionShift5ptStencilParallel_CUDA) { tv0_cache->axis(-1)->parallelize(ParallelType::TIDx); tv0_cache->axis(-2)->parallelize(ParallelType::TIDy); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = t0; @@ -985,13 +1006,13 @@ TEST(NVFuserTest, FusionShift5ptStencilParallel_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } 
-TEST(NVFuserTest, FusionShiftMerge1_CUDA) { +TEST_F(NVFuserTest, FusionShiftMerge1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {-1, 1}); fusion.addOutput(tv2); @@ -1006,35 +1027,30 @@ TEST(NVFuserTest, FusionShiftMerge1_CUDA) { // t1 allocation: (split_factor + 1) * (split_factor + 1) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 1); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK( + size != nullptr && size->isConst() && + size->value().value() == split_factor + 1); } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -1044,13 +1060,13 @@ TEST(NVFuserTest, FusionShiftMerge1_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftMerge2_CUDA) { +TEST_F(NVFuserTest, FusionShiftMerge2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {1, -1}); auto tv3 = shift(tv1, {-1, 1}); auto tv4 = add(tv2, tv3); @@ -1067,35 +1083,30 @@ TEST(NVFuserTest, FusionShiftMerge2_CUDA) { // t1 allocation: (split_factor + 2) * (split_factor + 2) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor && rhs_value == 2); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK( + size != nullptr && size->isConst() && + size->value().value() == split_factor + 2); } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + 
fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -1106,14 +1117,14 @@ TEST(NVFuserTest, FusionShiftMerge2_CUDA) { TORCH_CHECK(t4.allclose(outputs[0])); } -TEST(NVFuserTest, FusionShiftGlobal_CUDA) { +TEST_F(NVFuserTest, FusionShiftGlobal_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {0, 1}); auto tv3 = shift(tv1, {-1, 0}); auto tv4 = add(tv2, tv3); @@ -1132,17 +1143,18 @@ TEST(NVFuserTest, FusionShiftGlobal_CUDA) { // t1 allocation: (t1.size[0] + 1) * (t1.size[1] + 1) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - TORCH_CHECK(def != nullptr && def->operation() == BinaryOpType::Add); - TORCH_CHECK(def->as()->lhs()->isA()); - auto rhs = dynamic_cast(def->as()->rhs()); + dynamic_cast(alloc->shape().at(i)->definition()); + TORCH_CHECK( + def != nullptr && def->getBinaryOpType() == BinaryOpType::Add); + TORCH_CHECK(def->as()->lhs()->isA()); + auto rhs = dynamic_cast(def->as()->rhs()); TORCH_CHECK(rhs != nullptr && rhs->isConst()); int rhs_value = *rhs->value(); TORCH_CHECK(rhs_value == 1); @@ -1159,7 +1171,7 @@ TEST(NVFuserTest, FusionShiftGlobal_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -1171,14 +1183,14 @@ TEST(NVFuserTest, FusionShiftGlobal_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftDoubleSplitMerge1_CUDA) { +TEST_F(NVFuserTest, FusionShiftDoubleSplitMerge1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(2)); auto tv3 = shift(tv2, {0, 1}); fusion.addOutput(tv3); @@ -1194,33 +1206,27 @@ TEST(NVFuserTest, FusionShiftDoubleSplitMerge1_CUDA) { // t1 and t2 allocation: (split_factor1 + 1) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1 || tensor_name == 2) { - TORCH_CHECK(alloc->shape().size() == 1); - auto def = - dynamic_cast(alloc->shape().at(0)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor1 && rhs_value == 1); + auto size = dynamic_cast(alloc->shape().at(0)); + TORCH_CHECK( + size != nullptr && size->isConst() && + size->value().value() == split_factor1 + 1); } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 3; @@ -1229,15 +1235,15 @@ TEST(NVFuserTest, FusionShiftDoubleSplitMerge1_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftDoubleSplitMerge2_CUDA) { +TEST_F(NVFuserTest, FusionShiftDoubleSplitMerge2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(2)); auto tv3 = shift(tv2, {1, 1}); fusion.addOutput(tv3); @@ -1271,35 +1277,30 @@ TEST(NVFuserTest, FusionShiftDoubleSplitMerge2_CUDA) { // t1 and t2 allocation: (split_factor1 + 1) * (split_factor1 + 1) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1 || tensor_name == 2) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor1 && rhs_value == 1); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK( + size != nullptr && size->isConst() && + size->value().value() == split_factor1 + 1); } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = shift(t0 + 1 + 2, {1, 1}); @@ -1307,7 +1308,7 @@ TEST(NVFuserTest, FusionShiftDoubleSplitMerge2_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShift5ptStencilParallel1DThreadBlock_CUDA) { +TEST_F(NVFuserTest, FusionShift5ptStencilParallel1DThreadBlock_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1327,7 +1328,7 @@ TEST(NVFuserTest, FusionShift5ptStencilParallel1DThreadBlock_CUDA) { tv_out = add(tv_out, tv); } - tv_out = div(tv_out, new Double(tvs.size() + 1)); + tv_out = div(tv_out, IrBuilder::create(tvs.size() + 1)); fusion.addOutput(tv_out); @@ -1361,35 +1362,30 @@ TEST(NVFuserTest, FusionShift5ptStencilParallel1DThreadBlock_CUDA) { // cache allocation: (split_factor1 + 2) * (split_factor2 + 2) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == tv0_cache->name()) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - 
TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i] && rhs_value == 2); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK( + size != nullptr && size->isConst() && + size->value().value() == split_factor[i] + 2); } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = t0; @@ -1401,7 +1397,7 @@ TEST(NVFuserTest, FusionShift5ptStencilParallel1DThreadBlock_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftChain1_CUDA) { +TEST_F(NVFuserTest, FusionShiftChain1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1416,15 +1412,15 @@ TEST(NVFuserTest, FusionShiftChain1_CUDA) { tv0->computeAt(tv2, -2); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = shift(shift(t0, {0, 1}), {0, 1}); @@ -1432,7 +1428,7 @@ TEST(NVFuserTest, FusionShiftChain1_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftChain2_CUDA) { +TEST_F(NVFuserTest, FusionShiftChain2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1446,15 +1442,15 @@ TEST(NVFuserTest, FusionShiftChain2_CUDA) { tv0->computeAt(tv2, -2); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = shift(shift(t0, {0, 1}), {0, -1}); @@ -1462,13 +1458,13 @@ TEST(NVFuserTest, FusionShiftChain2_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftChain3_CUDA) { +TEST_F(NVFuserTest, FusionShiftChain3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {0, 1}); auto tv3 = shift(tv2, {0, 1}); fusion.addOutput(tv3); @@ -1484,40 +1480,33 @@ TEST(NVFuserTest, FusionShiftChain3_CUDA) { // tv1: (split_factor + 2) // tv2: (split_factor + 1) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1 || tensor_name == 2) { TORCH_CHECK(alloc->shape().size() == 1); for (int i = 0; i < 1; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = 
dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK(size != nullptr && size->isConst()); if (tensor_name == 1) { - TORCH_CHECK(rhs_value == 2); + TORCH_CHECK(size->value().value() == split_factor + 2); } else if (tensor_name == 2) { - TORCH_CHECK(rhs_value == 1); + TORCH_CHECK(size->value().value() == split_factor + 1); } } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -1528,7 +1517,7 @@ TEST(NVFuserTest, FusionShiftChain3_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftChain4_CUDA) { +TEST_F(NVFuserTest, FusionShiftChain4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1558,42 +1547,36 @@ TEST(NVFuserTest, FusionShiftChain4_CUDA) { // tv2: (split_factor + 7) * (split_factor + 7) // tv3: (split_factor + 4) * (split_factor + 4) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == 1 || tensor_name == 2) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK(size != nullptr && size->isConst()); + auto size_val = size->value().value(); if (tensor_name == 1) { - TORCH_CHECK(rhs_value == 9); + TORCH_CHECK(size_val == split_factor + 9); } else if (tensor_name == 2) { - TORCH_CHECK(rhs_value == 7); + TORCH_CHECK(size_val == split_factor + 7); } else if (tensor_name == 3) { - TORCH_CHECK(rhs_value == 4); + TORCH_CHECK(size_val == split_factor + 4); } } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = shift(t0, {1, -1}); @@ -1605,7 +1588,7 @@ TEST(NVFuserTest, FusionShiftChain4_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShift5ptStencilChain_CUDA) { +TEST_F(NVFuserTest, FusionShift5ptStencilChain_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1625,7 +1608,8 @@ TEST(NVFuserTest, FusionShift5ptStencilChain_CUDA) { tv_stencil1 = add(tv_stencil1, tv); } - tv_stencil1 = div(tv_stencil1, new Double(tv_stencil1_shifts.size() + 1)); + tv_stencil1 = div( + tv_stencil1, IrBuilder::create(tv_stencil1_shifts.size() + 1)); // Second stencil: Same 5pt stencil std::vector tv_stencil2_shifts; @@ -1638,7 +1622,8 @@ 
TEST(NVFuserTest, FusionShift5ptStencilChain_CUDA) { tv_stencil2 = add(tv_stencil2, tv); } - tv_stencil2 = div(tv_stencil2, new Double(tv_stencil2_shifts.size() + 1)); + tv_stencil2 = div( + tv_stencil2, IrBuilder::create(tv_stencil2_shifts.size() + 1)); auto tv_out = tv_stencil2; @@ -1682,41 +1667,34 @@ TEST(NVFuserTest, FusionShift5ptStencilChain_CUDA) { // tv0_cache: (split_factor + 4) * (split_factor + 4) // tv_stencil1: (split_factor + 2) * (split_factor + 2) GpuLower gpulw(&fusion); - for (const auto& kir_node : gpulw.kernel()->irNodes()) { - if (auto alloc = dynamic_cast(kir_node.get())) { + for (const auto expr : gpulw.kernel()->unordered_exprs()) { + if (auto alloc = dynamic_cast(expr)) { auto tensor_name = alloc->buffer()->name(); if (tensor_name == tv0_cache->name() || tensor_name == tv_stencil1->name()) { TORCH_CHECK(alloc->shape().size() == 2); for (int i = 0; i < 2; ++i) { - auto def = - dynamic_cast(alloc->shape().at(i)->definition()); - auto lhs = dynamic_cast(def->as()->lhs()); - TORCH_CHECK(lhs != nullptr && lhs->isConst()); - int lhs_value = *lhs->value(); - auto rhs = dynamic_cast(def->as()->rhs()); - TORCH_CHECK(rhs != nullptr && rhs->isConst()); - int rhs_value = *rhs->value(); - TORCH_CHECK(lhs_value == split_factor[i]); + auto size = dynamic_cast(alloc->shape().at(i)); + TORCH_CHECK(size != nullptr && size->isConst()); if (tensor_name == tv0_cache->name()) { - TORCH_CHECK(rhs_value == 4); + TORCH_CHECK(size->value().value() == split_factor[i] + 4); } else if (tensor_name == tv_stencil1->name()) { - TORCH_CHECK(rhs_value == 2); + TORCH_CHECK(size->value().value() == split_factor[i] + 2); } } } } } - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto stencil1 = t0; @@ -1735,13 +1713,13 @@ TEST(NVFuserTest, FusionShift5ptStencilChain_CUDA) { } // Shift a reduced tensor -TEST(NVFuserTest, FusionShiftReduction1_CUDA) { +TEST_F(NVFuserTest, FusionShiftReduction1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); auto tv3 = shift(tv2, {1}); fusion.addOutput(tv3); @@ -1758,7 +1736,7 @@ TEST(NVFuserTest, FusionShiftReduction1_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -1770,13 +1748,13 @@ TEST(NVFuserTest, FusionShiftReduction1_CUDA) { } // Parallelized version of FusionShiftReduction1 -TEST(NVFuserTest, FusionShiftReduction2_CUDA) { +TEST_F(NVFuserTest, FusionShiftReduction2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); auto tv3 = shift(tv2, {1}); fusion.addOutput(tv3); @@ -1799,7 +1777,7 @@ TEST(NVFuserTest, FusionShiftReduction2_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -1810,13 +1788,13 @@ TEST(NVFuserTest, FusionShiftReduction2_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, 
__FILE__); } -TEST(NVFuserTest, FusionShiftRfactor1_CUDA) { +TEST_F(NVFuserTest, FusionShiftRfactor1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = sum(tv1, {1}); auto tv3 = shift(tv2, {1}); fusion.addOutput(tv3); @@ -1841,7 +1819,7 @@ TEST(NVFuserTest, FusionShiftRfactor1_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -1852,7 +1830,7 @@ TEST(NVFuserTest, FusionShiftRfactor1_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftBcast1_CUDA) { +TEST_F(NVFuserTest, FusionShiftBcast1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1877,7 +1855,7 @@ TEST(NVFuserTest, FusionShiftBcast1_CUDA) { std::vector inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t4 = t0.unsqueeze(-1).expand({numel_x, numel_y}) + t1; @@ -1886,7 +1864,7 @@ TEST(NVFuserTest, FusionShiftBcast1_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftBcast2_CUDA) { +TEST_F(NVFuserTest, FusionShiftBcast2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1911,7 +1889,7 @@ TEST(NVFuserTest, FusionShiftBcast2_CUDA) { std::vector inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t2 = t0.unsqueeze(-1).expand({numel_x, numel_y}); @@ -1922,7 +1900,7 @@ TEST(NVFuserTest, FusionShiftBcast2_CUDA) { } // Combine ShiftBcast1 and ShiftBcast2 with parallelization -TEST(NVFuserTest, FusionShiftBcast3_CUDA) { +TEST_F(NVFuserTest, FusionShiftBcast3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -1959,7 +1937,7 @@ TEST(NVFuserTest, FusionShiftBcast3_CUDA) { std::vector inputs = {t0, t1}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t2 = t0.unsqueeze(-1).expand({numel_x, numel_y}); @@ -1972,14 +1950,14 @@ TEST(NVFuserTest, FusionShiftBcast3_CUDA) { } // See issue #893 -TEST(NVFuserTest, FusionShiftSyncPlacement1_CUDA) { +TEST_F(NVFuserTest, FusionShiftSyncPlacement1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(2)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv0, IrBuilder::create(2)); auto tv3 = add(tv1, tv2); auto tv4 = shift(tv3, {0, 1}); fusion.addOutput(tv4); @@ -1996,15 +1974,15 @@ TEST(NVFuserTest, FusionShiftSyncPlacement1_CUDA) { tv3->axis(-1)->parallelize(ParallelType::TIDx); tv4->axis(-1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -2016,14 +1994,14 @@ TEST(NVFuserTest, FusionShiftSyncPlacement1_CUDA) { } // See issue #893. Top-level placement. 
-TEST(NVFuserTest, FusionShiftSyncPlacement2_CUDA) { +TEST_F(NVFuserTest, FusionShiftSyncPlacement2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv0, new Double(2)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv0, IrBuilder::create(2)); auto tv3 = add(tv1, tv2); auto tv4 = shift(tv3, {1}); fusion.addOutput(tv4); @@ -2037,14 +2015,14 @@ TEST(NVFuserTest, FusionShiftSyncPlacement2_CUDA) { tv3->axis(-1)->parallelize(ParallelType::TIDx); tv4->axis(-1)->parallelize(ParallelType::TIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -2055,14 +2033,14 @@ TEST(NVFuserTest, FusionShiftSyncPlacement2_CUDA) { testValidate(&fusion, outputs, inputs, {t4}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftSyncPlacement3_CUDA) { +TEST_F(NVFuserTest, FusionShiftSyncPlacement3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); - auto tv2 = add(tv1, new Double(2)); + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = add(tv1, IrBuilder::create(2)); auto tv3 = shift(tv2, {1}); fusion.addOutput(tv3); @@ -2093,7 +2071,7 @@ TEST(NVFuserTest, FusionShiftSyncPlacement3_CUDA) { // along the Y dimension. The other 10 warps are used to load a 32x10 // tile, and all warps will do coalesced loads. No such optimization // is done in the fuser version. -TEST(NVFuserTest, FusionHdiff_CUDA) { +TEST_F(NVFuserTest, FusionHdiff_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2123,7 +2101,7 @@ TEST(NVFuserTest, FusionHdiff_CUDA) { // T9 = T0 * 4 // T10 = T9 - T8 - auto lap = sub(mul(inp, new Double(4)), sum_of_neighbors); + auto lap = sub(mul(inp, IrBuilder::create(4)), sum_of_neighbors); // T11 = shift(T10) // T12 = T11 - T10 @@ -2133,8 +2111,9 @@ TEST(NVFuserTest, FusionHdiff_CUDA) { // T16 = T15 > 0 // T17 = T16 ? 0 : T12 auto flx_cond = - gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)), new Double(0)); - auto flx0 = where(flx_cond, new Double(0), flx); + gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)), + IrBuilder::create(0)); + auto flx0 = where(flx_cond, IrBuilder::create(0), flx); // T18 = shift(T10) // T19 = T18 - T10 @@ -2144,9 +2123,10 @@ TEST(NVFuserTest, FusionHdiff_CUDA) { // T22 = T19 * T21 // T23 = T22 > 0 auto fly_cond = - gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)), new Double(0)); + gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)), + IrBuilder::create(0)); // T24 = T23 ? 
0 : T19 - auto fly0 = where(fly_cond, new Double(0), fly); + auto fly0 = where(fly_cond, IrBuilder::create(0), fly); // T25 = shift(flx0) // T26 = T17 - T25 @@ -2233,9 +2213,6 @@ TEST(NVFuserTest, FusionHdiff_CUDA) { } ///////////////////////////////// - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 101; int numel_y = 99; int numel_z = 10; @@ -2244,7 +2221,11 @@ TEST(NVFuserTest, FusionHdiff_CUDA) { at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options); at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options); std::vector inputs = {inp_at, coeff_at}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto fuser_output = fe.runFusion(inputs)[0]; + // Trim the outer rim std::vector indices{ at::indexing::Slice(0, at::indexing::None), @@ -2273,7 +2254,7 @@ TEST(NVFuserTest, FusionHdiff_CUDA) { } } -TEST(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { +TEST_F(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2303,7 +2284,7 @@ TEST(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { // T9 = T0 * 4 // T10 = T9 - T8 - auto lap = sub(mul(inp, new Double(4)), sum_of_neighbors); + auto lap = sub(mul(inp, IrBuilder::create(4)), sum_of_neighbors); // T11 = shift(T10) // T12 = T11 - T10 @@ -2313,8 +2294,9 @@ TEST(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { // T16 = T15 > 0 // T17 = T16 ? 0 : T12 auto flx_cond = - gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)), new Double(0)); - auto flx0 = where(flx_cond, new Double(0), flx); + gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)), + IrBuilder::create(0)); + auto flx0 = where(flx_cond, IrBuilder::create(0), flx); // T18 = shift(T10) // T19 = T18 - T10 @@ -2324,9 +2306,10 @@ TEST(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { // T22 = T19 * T21 // T23 = T22 > 0 auto fly_cond = - gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)), new Double(0)); + gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)), + IrBuilder::create(0)); // T24 = T23 ? 
0 : T19 - auto fly0 = where(fly_cond, new Double(0), fly); + auto fly0 = where(fly_cond, IrBuilder::create(0), fly); // T25 = shift(flx0) // T26 = T17 - T25 @@ -2428,9 +2411,6 @@ TEST(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { } ///////////////////////////////// - FusionExecutor fe; - fe.compileFusion(&fusion); - const int halo_extent = 2; const int numel_x = 64 + halo_extent * 2; const int numel_y = 64 + halo_extent * 2; @@ -2440,7 +2420,11 @@ TEST(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options); at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options); std::vector inputs = {inp_at, coeff_at}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto fuser_output = fe.runFusion(inputs)[0]; + // Trim the outer rim std::vector indices{ at::indexing::Slice(0, at::indexing::None), @@ -2470,7 +2454,7 @@ TEST(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) { } // 3x3 max pooling -TEST(NVFuserTest, FusionMaxPooling_CUDA) { +TEST_F(NVFuserTest, FusionMaxPooling_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2537,9 +2521,6 @@ TEST(NVFuserTest, FusionMaxPooling_CUDA) { max_tensor->axis(0)->parallelize(ParallelType::BIDx); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int hw = 50; const int num_channels = 20; const int pooling_window = 3; @@ -2555,6 +2536,8 @@ TEST(NVFuserTest, FusionMaxPooling_CUDA) { aten_inp = at::abs(aten_inp); std::vector inputs = {aten_inp}; + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = at::max_pool2d( @@ -2563,7 +2546,7 @@ TEST(NVFuserTest, FusionMaxPooling_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGatherPadding1_CUDA) { +TEST_F(NVFuserTest, FusionGather1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2586,13 +2569,13 @@ TEST(NVFuserTest, FusionGatherPadding1_CUDA) { auto ref = gather(t0, window_shape, padding_width); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto outputs = fe.runFusion({t0}); TORCH_CHECK(ref.equal(outputs[0])); } -TEST(NVFuserTest, FusionGatherPadding2_CUDA) { +TEST_F(NVFuserTest, FusionGather2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2602,7 +2585,7 @@ TEST(NVFuserTest, FusionGatherPadding2_CUDA) { auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = gather(tv1, window_shape, padding_width); @@ -2629,7 +2612,7 @@ TEST(NVFuserTest, FusionGatherPadding2_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -2639,7 +2622,332 @@ TEST(NVFuserTest, FusionGatherPadding2_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionConv2DStatic_CUDA) { +TEST_F(NVFuserTest, FusionGather3_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + const std::vector window_shape = {1, 3}; + const std::vector> padding_width = {{0, 0}, {0, 0}}; + + auto tv1 = gather(tv0, window_shape, padding_width); + + fusion.addOutput(tv1); + + const int s1 = 11; + const int s2 = 13; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + std::vector size({s1, s2}); + at::Tensor t0 = at::randn(size, options); + size.insert(size.end(), 
window_shape.begin(), window_shape.end()); + // Use a pre-allocated output tensor filled with 1 so that invalid + // writes to outside valid ranges can be detected + at::Tensor output = at::ones(size, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto outputs = fe.runFusion({t0}, {output}); + + auto ref = gather(t0, window_shape, padding_width); + TORCH_CHECK(ref.equal(outputs[0])); +} + +TEST_F(NVFuserTest, FusionGather4_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + const std::vector window_shape = {3, 3}; + const std::vector> padding_width = {{0, 0}, {0, 0}}; + + auto tv1 = gather(tv0, window_shape, padding_width); + + fusion.addOutput(tv1); + + const int s1 = 11; + const int s2 = 13; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + std::vector size({s1, s2}); + at::Tensor t0 = at::randn(size, options); + size.insert(size.end(), window_shape.begin(), window_shape.end()); + // Use a pre-allocated output tensor filled with 1 so that invalid + // writes to outside valid ranges can be detected + at::Tensor output = at::ones(size, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto outputs = fe.runFusion({t0}, {output}); + + auto ref = gather(t0, window_shape, padding_width); + + TORCH_CHECK(ref.equal(outputs[0])); +} + +TEST_F(NVFuserTest, FusionGather5_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + const std::vector window_shape = {3, 3}; + const std::vector> padding_width = {{1, 0}, {0, 1}}; + + auto tv1 = gather(tv0, window_shape, padding_width); + + fusion.addOutput(tv1); + + const int s1 = 11; + const int s2 = 13; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + std::vector size({s1, s2}); + at::Tensor t0 = at::randn(size, options); + size.insert(size.end(), window_shape.begin(), window_shape.end()); + // Use a pre-allocated output tensor filled with 1 so that invalid + // writes to outside valid ranges can be detected + at::Tensor output = at::ones(size, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto outputs = fe.runFusion({t0}, {output}); + + auto ref = gather(t0, window_shape, padding_width); + + TORCH_CHECK(ref.equal(outputs[0])); +} + +// Conv-like pattern with no padding +TEST_F(NVFuserTest, FusionGather6_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + const std::vector window_shape = {3, 4}; + const std::vector> padding_width = {{0, 0}, {0, 0}}; + + auto tv1 = gather(tv0, window_shape, padding_width); + + fusion.addOutput(tv1); + + // Blocking the spatial dimensions + const int block_x = 16; + const int block_y = 8; + + auto tv0_cache = tv0->cache_after(); + auto out = tv1; + auto out_cache = out->cache_before(); + + out->split(1, block_x); + out->split(0, block_y); + out->reorder({{1, 2}, {2, 1}}); + + TransformPropagator::from(out); + + tv0->computeAt(out, 2); + + tv0_cache->setMemoryType(MemoryType::Shared); + + out->axis(0)->parallelize(ParallelType::BIDy); + out->axis(1)->parallelize(ParallelType::BIDx); + out->axis(2)->parallelize(ParallelType::TIDy); + out->axis(3)->parallelize(ParallelType::TIDx); + scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion)); + + const int s1 = 101; + const int s2 = 99; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + std::vector size({s1, s2}); + 
at::Tensor t0 = at::randn(size, options); + size.insert(size.end(), window_shape.begin(), window_shape.end()); + // Use a pre-allocated output tensor filled with 1 so that invalid + // writes to outside valid ranges can be detected + at::Tensor output = at::ones(size, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto outputs = fe.runFusion({t0}, {output}); + + auto ref = gather(t0, window_shape, padding_width); + + TORCH_CHECK(ref.equal(outputs[0])); +} + +// Conv-like pattern with irregular padding +TEST_F(NVFuserTest, FusionGather7_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + const std::vector window_shape = {3, 4}; + const std::vector> padding_width = {{0, 2}, {2, 1}}; + + auto tv1 = gather(tv0, window_shape, padding_width); + + fusion.addOutput(tv1); + + // Blocking the spatial dimensions + const int block_x = 16; + const int block_y = 8; + + auto tv0_cache = tv0->cache_after(); + auto out = tv1; + auto out_cache = out->cache_before(); + + out->split(1, block_x); + out->split(0, block_y); + out->reorder({{1, 2}, {2, 1}}); + + TransformPropagator::from(out); + + tv0->computeAt(out, 2); + + tv0_cache->setMemoryType(MemoryType::Shared); + + out->axis(0)->parallelize(ParallelType::BIDy); + out->axis(1)->parallelize(ParallelType::BIDx); + out->axis(2)->parallelize(ParallelType::TIDy); + out->axis(3)->parallelize(ParallelType::TIDx); + scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion)); + + const int s1 = 101; + const int s2 = 99; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + std::vector size({s1, s2}); + at::Tensor t0 = at::randn(size, options); + size.insert(size.end(), window_shape.begin(), window_shape.end()); + at::Tensor output = at::ones(size, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto outputs = fe.runFusion({t0}, {output}); + + auto ref = gather(t0, window_shape, padding_width); + + TORCH_CHECK(ref.equal(outputs[0])); +} + +// With no padding but with striding +TEST_F(NVFuserTest, FusionGather8_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + const std::vector window_shape = {2, 3}; + const std::vector> padding_width = {{0, 0}, {0, 0}}; + const std::vector strides = {3, 3}; + + auto tv1 = gather(tv0, window_shape, padding_width, strides); + + fusion.addOutput(tv1); + + const int s1 = 11; + const int s2 = 13; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + std::vector size({s1, s2}); + at::Tensor t0 = at::randn(size, options); + for (const auto i : c10::irange(size.size())) { + size[i] = ceilDiv( + size[i] - window_shape[i] + 1 + padding_width[i][0] + + padding_width[i][1], + strides[i]); + } + size.insert(size.end(), window_shape.begin(), window_shape.end()); + // Use a pre-allocated output tensor filled with 1 so that invalid + // writes to outside valid ranges can be detected + at::Tensor output = at::ones(size, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto outputs = fe.runFusion({t0}, {output}); + + auto ref = gather(t0, window_shape, padding_width, strides); + + TORCH_CHECK(ref.equal(outputs[0])); +} + +// Similar to Gather8 but with splitting and parallelization +TEST_F(NVFuserTest, FusionGather9_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + const std::vector window_shape = {3, 4}; + const 
std::vector> padding_width = {{0, 0}, {0, 0}}; + const std::vector strides = {2, 2}; + + auto tv1 = gather(tv0, window_shape, padding_width, strides); + + fusion.addOutput(tv1); + + // Blocking the spatial dimensions + const int block_x = 16; + const int block_y = 8; + + auto tv0_cache = tv0->cache_after(); + auto out = tv1; + auto out_cache = out->cache_before(); + + out->split(1, block_x); + out->split(0, block_y); + out->reorder({{1, 2}, {2, 1}}); + + TransformPropagator::from(out); + + tv0->computeAt(out, 2); + + tv0_cache->setMemoryType(MemoryType::Shared); + + out->axis(0)->parallelize(ParallelType::BIDy); + out->axis(1)->parallelize(ParallelType::BIDx); + out->axis(2)->parallelize(ParallelType::TIDy); + out->axis(3)->parallelize(ParallelType::TIDx); + scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion)); + + const int s1 = 101; + const int s2 = 99; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + std::vector size({s1, s2}); + at::Tensor t0 = at::randn(size, options); + for (const auto i : c10::irange(size.size())) { + size[i] = ceilDiv( + size[i] - window_shape[i] + 1 + padding_width[i][0] + + padding_width[i][1], + strides[i]); + } + size.insert(size.end(), window_shape.begin(), window_shape.end()); + // Use a pre-allocated output tensor filled with 1 so that invalid + // writes to outside valid ranges can be detected + at::Tensor output = at::ones(size, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0}); + auto outputs = fe.runFusion({t0}, {output}); + + auto ref = gather(t0, window_shape, padding_width, strides); + + TORCH_CHECK(ref.equal(outputs[0])); +} + +TEST_F(NVFuserTest, FusionConv2D_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2709,9 +3017,6 @@ TEST(NVFuserTest, FusionConv2DStatic_CUDA) { scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int dim_h = 99; const int dim_w = 101; const int dim_c = 10; @@ -2723,6 +3028,8 @@ TEST(NVFuserTest, FusionConv2DStatic_CUDA) { at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options); std::vector inputs = {at_inp, at_w}; + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto cg_outputs = fe.runFusion(inputs); at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis @@ -2732,9 +3039,7 @@ TEST(NVFuserTest, FusionConv2DStatic_CUDA) { testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); } -// Mostly the same as the static conv test, but the shape of the weights, -// 3x3 in this case, is given dynamically -TEST(NVFuserTest, FusionConv2DDynamic_CUDA) { +TEST_F(NVFuserTest, FusionConv2DNoPadding_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -2742,26 +3047,14 @@ TEST(NVFuserTest, FusionConv2DDynamic_CUDA) { auto inp = makeSymbolicTensor(3); fusion.addInput(inp); - // Weights: [K, C, S, T] + // Weights: [K, C, 3, 3] auto w = makeSymbolicTensor(4); fusion.addInput(w); - auto w_h = new Int(); - fusion.addInput(w_h); - auto w_w = new Int(); - fusion.addInput(w_w); - - auto pad_h = new Int(); - fusion.addInput(pad_h); - auto pad_w = new Int(); - fusion.addInput(pad_w); - - // Gather a neighbor tile of [w_dim_h, w_dim_w] with padding - auto inp_tile = gather( - inp, - {new Int(1), w_h, w_w}, - {{new Int(0), new Int(0)}, {pad_h, pad_h}, {pad_w, pad_w}}); - // inp_tile: [C, 1, H - w_h + 1, W - w_w + 1, w_h, w_w] + // Gather a neighbor tile of [3, 3] with no padding + auto inp_tile = + gather(inp, {1, 3, 3}, {{0, 0}, {0, 0}, {0, 0}}, {1, 1, 1}, true); + // inp_tile: [C, H-2, 
W-2, 1, 3, 3] auto inp_bc = broadcast(inp_tile, {true, false, false, false, false, false, false}); @@ -2775,6 +3068,7 @@ TEST(NVFuserTest, FusionConv2DDynamic_CUDA) { fusion.addOutput(out); //////////////////////////////////// + // Cache the input and weight tensors auto inp_cache = inp->cache_after(); @@ -2815,36 +3109,135 @@ TEST(NVFuserTest, FusionConv2DDynamic_CUDA) { scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); + const int dim_h = 99; + const int dim_w = 101; + const int dim_c = 10; + const int dim_f = 20; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); + at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options); + std::vector inputs = {at_inp, at_w}; + FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); + auto cg_outputs = fe.runFusion(inputs); + + at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis + std::vector stride = {1, 1}; + std::vector padding = {0, 0}; + auto at_out = at::conv2d(at_inp, at_w, {}, stride, padding); + at_out = at_out.squeeze(0); // drop the N axis + + testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionConv2DNoPaddingStrided_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Input: [C, H, W] + auto inp = makeSymbolicTensor(3); + fusion.addInput(inp); + + // Weights: [K, C, 3, 3] + auto w = makeSymbolicTensor(4); + fusion.addInput(w); + + // Gather a neighbor tile of [2, 2] with no padding and strides of + // [2, 2] + auto inp_tile = gather(inp, {1, 2, 2}, {{0, 0}, {0, 0}, {0, 0}}, {1, 2, 2}); + // inp_tile: [C, H/2, W/2, 1, 2, 2] + + auto inp_bc = + broadcast(inp_tile, {true, false, false, false, false, false, false}); + auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); + + auto inp_times_w = mul(inp_bc, w_bc); + + // Reduce the channel and neighbor tile dimensions + auto out = sum(inp_times_w, {1, 4, 5, 6}); + + fusion.addOutput(out); + + //////////////////////////////////// + + // Cache the input and weight tensors + auto inp_cache = inp->cache_after(); + + // Blocking the spatial dimensions + const int block_w = 16; + const int block_h = 4; + // Blocking the channel dimension + const int block_c = 8; + + out->split(2, block_h); + out->split(4, block_w); + out->reorder({{3, 4}}); + // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3] + + out->split(1, block_c); + // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] + + auto out_rf = out->rFactor({1, -3, -2, -1}); + // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3] + // out_rf: [K, Ci, Ho, Wo, Hi, Wi] + + // Create a [block_x, block_y] tile on smem + inp_cache->computeAt(out, 4); + // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] + inp_cache->setMemoryType(MemoryType::Shared); + + // Move Ci forward + out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}}); + inp_cache->computeAt(out_rf, 5); + + inp_tile->computeAt(out_rf, -1); + w->computeAt(out_rf, -1); + + out->axis(0)->parallelize(ParallelType::BIDx); + out->axis(1)->parallelize(ParallelType::TIDz); + out->axis(4)->parallelize(ParallelType::TIDy); + out->axis(5)->parallelize(ParallelType::TIDx); + + scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); const int dim_h = 99; const int dim_w = 101; const int dim_c = 10; const int dim_f = 20; - const int dim_w_h = 3; - const int dim_w_w = 3; - const int dim_pad_h = (dim_w_h - 1) / 2; - const int dim_pad_w = (dim_w_w - 1) / 2; auto options = 
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); - at::Tensor at_w = at::randn({dim_f, dim_c, dim_w_h, dim_w_w}, options); - std::vector inputs = { - at_inp, at_w, dim_w_h, dim_w_w, dim_pad_h, dim_pad_w}; + at::Tensor at_w = at::randn({dim_f, dim_c, 2, 2}, options); + std::vector inputs = {at_inp, at_w}; + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto cg_outputs = fe.runFusion(inputs); at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis - auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1); + std::vector stride = {2, 2}; + std::vector padding = {0, 0}; + auto at_out = at::conv2d(at_inp, at_w, {}, stride, padding); at_out = at_out.squeeze(0); // drop the N axis testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); } // 5x5 followed by 3x3 -TEST(NVFuserTest, FusionConv2DDynamicChain_CUDA) { +TEST_F(NVFuserTest, FusionConv2DChain_CUDA) { + const int dim_w1_h = 5; + const int dim_w1_w = 5; + const int dim_pad1_h = (dim_w1_h - 1) / 2; + const int dim_pad1_w = (dim_w1_w - 1) / 2; + const int dim_w2_h = 3; + const int dim_w2_w = 3; + const int dim_pad2_h = (dim_w2_h - 1) / 2; + const int dim_pad2_w = (dim_w2_w - 1) / 2; + Fusion fusion; FusionGuard fg(&fusion); @@ -2860,31 +3253,11 @@ TEST(NVFuserTest, FusionConv2DDynamicChain_CUDA) { auto w2 = makeSymbolicTensor(4); fusion.addInput(w2); - auto w1_h = new Int(); - fusion.addInput(w1_h); - auto w1_w = new Int(); - fusion.addInput(w1_w); - - auto w2_h = new Int(); - fusion.addInput(w2_h); - auto w2_w = new Int(); - fusion.addInput(w2_w); - - auto pad_h1 = new Int(); - fusion.addInput(pad_h1); - auto pad_w1 = new Int(); - fusion.addInput(pad_w1); - - auto pad_h2 = new Int(); - fusion.addInput(pad_h2); - auto pad_w2 = new Int(); - fusion.addInput(pad_w2); - // Gather a neighbor tile of [w1_h, w1_w] with padding auto inp_tile = gather( inp, - {new Int(1), w1_h, w1_w}, - {{new Int(0), new Int(0)}, {pad_h1, pad_h1}, {pad_w1, pad_w1}}); + {1, dim_w1_h, dim_w1_w}, + {{0, 0}, {dim_pad1_h, dim_pad1_h}, {dim_pad1_w, dim_pad1_w}}); // inp_tile: [C, 1, H - w1_h + 1, W - w1_w + 1, w1_h, w1_w] auto inp_bc = @@ -2899,8 +3272,8 @@ TEST(NVFuserTest, FusionConv2DDynamicChain_CUDA) { // Second conv auto out1_tile = gather( out1, - {new Int(1), w2_h, w2_w}, - {{new Int(0), new Int(0)}, {pad_h2, pad_h2}, {pad_w2, pad_w2}}); + {1, dim_w2_h, dim_w2_w}, + {{0, 0}, {dim_pad2_h, dim_pad2_h}, {dim_pad2_w, dim_pad2_w}}); auto out1_bc = broadcast(out1_tile, {true, false, false, false, false, false, false}); @@ -2948,41 +3321,21 @@ TEST(NVFuserTest, FusionConv2DDynamicChain_CUDA) { scheduler_utils::parallelizeAllLike(out2, {inp_cache, out1}); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int dim_h = 99; const int dim_w = 101; const int dim_k1 = 3; const int dim_k2 = 5; const int dim_k3 = 7; - const int dim_w1_h = 5; - const int dim_w1_w = 5; - const int dim_pad1_h = (dim_w1_h - 1) / 2; - const int dim_pad1_w = (dim_w1_w - 1) / 2; - const int dim_w2_h = 3; - const int dim_w2_w = 3; - const int dim_pad2_h = (dim_w2_h - 1) / 2; - const int dim_pad2_w = (dim_w2_w - 1) / 2; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::manual_seed(0); at::Tensor at_inp = at::randn({dim_k1, dim_h, dim_w}, options); at::Tensor at_w1 = at::randn({dim_k2, dim_k1, dim_w1_h, dim_w1_w}, options); at::Tensor at_w2 = at::randn({dim_k3, dim_k2, dim_w2_h, dim_w2_w}, options); - std::vector inputs = { - at_inp, - at_w1, - at_w2, - 
dim_w1_h, - dim_w1_w, - dim_w2_h, - dim_w2_w, - dim_pad1_h, - dim_pad1_w, - dim_pad2_h, - dim_pad2_w}; + std::vector inputs = {at_inp, at_w1, at_w2}; + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto cg_outputs = fe.runFusion(inputs); at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis @@ -2993,7 +3346,7 @@ TEST(NVFuserTest, FusionConv2DDynamicChain_CUDA) { testValidate(&fusion, cg_outputs, inputs, {at_out2}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionConv2DStaticEvenSizedWindow_CUDA) { +TEST_F(NVFuserTest, FusionConv2DStaticEvenSizedWindow_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3064,9 +3417,6 @@ TEST(NVFuserTest, FusionConv2DStaticEvenSizedWindow_CUDA) { scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int dim_h = 99; const int dim_w = 101; const int dim_c = 10; @@ -3078,6 +3428,8 @@ TEST(NVFuserTest, FusionConv2DStaticEvenSizedWindow_CUDA) { at::Tensor at_w = at::randn({dim_f, dim_c, 2, 2}, options); std::vector inputs = {at_inp, at_w}; + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto cg_outputs = fe.runFusion(inputs); at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis @@ -3095,8 +3447,299 @@ TEST(NVFuserTest, FusionConv2DStaticEvenSizedWindow_CUDA) { testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); } +TEST_F(NVFuserTest, FusionConv4x4Pad1x1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Input: [C, H, W] + auto inp = makeSymbolicTensor(3); + fusion.addInput(inp); + + // Weights: [K, C, 4, 4] + auto w = makeSymbolicTensor(4); + fusion.addInput(w); + + // Gather a neighbor tile of [4, 4] with padding size of 1 for both + // sides of the spatial dimensions. The resulting extent is + // decreased by one. 
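// Worked extent arithmetic for this case: with a window of 4 and pad (1, 1),
// each spatial extent becomes H + 1 + 1 - (4 - 1) = H - 1, which matches the
// inp_tile shape noted below.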
+ auto inp_tile = + gather(inp, {1, 4, 4}, {{0, 0}, {1, 1}, {1, 1}}, {1, 1, 1}, true); + // inp_tile: [C, H-1, W-1, 1, 4, 4] + + auto inp_bc = + broadcast(inp_tile, {true, false, false, false, false, false, false}); + auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); + + auto inp_times_w = mul(inp_bc, w_bc); + + // Reduce the channel and neighbor tile dimensions + auto out = sum(inp_times_w, {1, 4, 5, 6}); + + fusion.addOutput(out); + + //////////////////////////////////// + + // Cache the input and weight tensors + auto inp_cache = inp->cache_after(); + + // Blocking the spatial dimensions + const int block_w = 16; + const int block_h = 4; + // Blocking the channel dimension + const int block_c = 8; + + out->split(2, block_h); + out->split(4, block_w); + out->reorder({{3, 4}}); + // out: [K, C, Ho, Wo, Hi, Wi, 1, 4, 4] + + out->split(1, block_c); + // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 4, 4] + + auto out_rf = out->rFactor({1, -3, -2, -1}); + // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 4, 4] + // out_rf: [K, Ci, Ho, Wo, Hi, Wi] + + // Create a [block_x, block_y] tile on smem + inp_cache->computeAt(out, 4); + // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] + inp_cache->setMemoryType(MemoryType::Shared); + + // Move Ci forward + out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}}); + inp_cache->computeAt(out_rf, 5); + + inp_tile->computeAt(out_rf, -1); + w->computeAt(out_rf, -1); + + out->axis(0)->parallelize(ParallelType::BIDx); + out->axis(1)->parallelize(ParallelType::TIDz); + out->axis(4)->parallelize(ParallelType::TIDy); + out->axis(5)->parallelize(ParallelType::TIDx); + + scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); + + const int dim_h = 99; + const int dim_w = 101; + const int dim_c = 10; + const int dim_f = 20; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); + at::Tensor at_w = at::randn({dim_f, dim_c, 4, 4}, options); + std::vector inputs = {at_inp, at_w}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); + auto cg_outputs = fe.runFusion(inputs); + + at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis + auto at_out = + at::conv2d(at_inp.to(at::kDouble), at_w.to(at::kDouble), {}, 1, 1); + at_out = at_out.squeeze(0); // drop the N axis + + testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionConv4x5Pad1x2_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Input: [C, H, W] + auto inp = makeSymbolicTensor(3); + fusion.addInput(inp); + + // Weights: [K, C, 4, 4] + auto w = makeSymbolicTensor(4); + fusion.addInput(w); + + // Gather a neighbor tile of [4, 5] with padding size of 1 and 2 for + // each side of the spatial dimensions. 
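// Worked extent arithmetic: the height dimension uses window 4 with pad
// (1, 1), giving H + 2 - 3 = H - 1; the width dimension uses window 5 with
// pad (2, 2), giving W + 4 - 4 = W, so the full width extent is kept below.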
+ auto inp_tile = + gather(inp, {1, 4, 5}, {{0, 0}, {1, 1}, {2, 2}}, {1, 1, 1}, true); + // inp_tile: [C, H-1, W, 1, 4, 5] + + auto inp_bc = + broadcast(inp_tile, {true, false, false, false, false, false, false}); + auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); + + auto inp_times_w = mul(inp_bc, w_bc); + + // Reduce the channel and neighbor tile dimensions + auto out = sum(inp_times_w, {1, 4, 5, 6}); + + fusion.addOutput(out); + + //////////////////////////////////// + + // Cache the input and weight tensors + auto inp_cache = inp->cache_after(); + + // Blocking the spatial dimensions + const int block_w = 16; + const int block_h = 4; + // Blocking the channel dimension + const int block_c = 8; + + out->split(2, block_h); + out->split(4, block_w); + out->reorder({{3, 4}}); + // out: [K, C, Ho, Wo, Hi, Wi, 1, 4, 5] + + out->split(1, block_c); + // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 4, 5] + + auto out_rf = out->rFactor({1, -3, -2, -1}); + // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 4, 5] + // out_rf: [K, Ci, Ho, Wo, Hi, Wi] + + // Create a [block_x, block_y] tile on smem + inp_cache->computeAt(out, 4); + // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi] + inp_cache->setMemoryType(MemoryType::Shared); + + // Move Ci forward + out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}}); + inp_cache->computeAt(out_rf, 5); + + inp_tile->computeAt(out_rf, -1); + w->computeAt(out_rf, -1); + + out->axis(0)->parallelize(ParallelType::BIDx); + out->axis(1)->parallelize(ParallelType::TIDz); + out->axis(4)->parallelize(ParallelType::TIDy); + out->axis(5)->parallelize(ParallelType::TIDx); + + scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); + + const int dim_h = 99; + const int dim_w = 101; + const int dim_c = 10; + const int dim_f = 20; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); + at::Tensor at_w = at::randn({dim_f, dim_c, 4, 5}, options); + std::vector inputs = {at_inp, at_w}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); + auto cg_outputs = fe.runFusion(inputs); + + at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis + auto at_out = + at::conv2d(at_inp.to(at::kDouble), at_w.to(at::kDouble), {}, 1, {1, 2}); + at_out = at_out.squeeze(0); // drop the N axis + + testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionConv4x4Pad1x1Stride4_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + // Input: [C, H, W] + auto inp = makeSymbolicTensor(3); + fusion.addInput(inp); + + // Weights: [K, C, 3, 3] + auto w = makeSymbolicTensor(4); + fusion.addInput(w); + + // Gather a neighbor tile of [4, 4] with padding size of 1 for both + // sides of the spatial dimensions. Set the stride width as 4. 
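// Worked extent arithmetic with striding: per spatial dimension the gathered
// extent is ceilDiv(H + 1 + 1 - (4 - 1), 4), i.e. roughly H / 4, following
// the same formula used by the strided gather tests above.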
+ auto inp_tile = gather(inp, {1, 4, 4}, {{0, 0}, {1, 1}, {1, 1}}, {1, 4, 4}); + // inp_tile: [C, H/4, s4, W/4, s4, 1, 4, 4] + + auto inp_bc = + broadcast(inp_tile, {true, false, false, false, false, false, false}); + auto w_bc = broadcast(w, {false, false, true, true, true, false, false}); + + auto inp_times_w = mul(inp_bc, w_bc); + + // Reduce the channel and neighbor tile dimensions + auto out = sum(inp_times_w, {1, 4, 5, 6}); + + fusion.addOutput(out); + + //////////////////////////////////// + + // Cache the input and weight tensors + auto inp_cache = inp->cache_after(); + + // Blocking the spatial dimensions + const int block_w = 16; + const int block_h = 4; + const int block_c = 2; + + // [K, C, H/s, W/s, 1, 4, 4] + out->split(2, block_h); + // [K, C, H/s/block_h, block_h, W/s, 1, 4, 4] + out->split(4, block_w); + // [K, C, H/s/block_h, block_h, W/s/block_w, block_w, 1, 4, 4] + out->reorder({{3, 4}}); + // [K, C, H/s/block_h, W/s/block_w, block_h, block_w, 1, 4, 4] + out->split(1, block_c); + // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, block_h, block_w, 1, 4, + // 4] + out->split(4, 1); + // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1, + // 4, 4] + + auto out_rf = out->rFactor({1, -3, -2, -1}); + // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1, + // 4, 4] + + // out: [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w] + + inp_cache->computeAt(out, 5); + inp_cache->setMemoryType(MemoryType::Shared); + // [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, C/block_c, 1, + // 4, 4] + + // Move C/block_c before block_h/2 and share the domain from + // inp_cache to out_rf + out_rf->reorder({{7, 5}, {5, 6}, {6, 7}}); + inp_cache->computeAt(out_rf, 6); + + inp_tile->computeAt(out_rf, -1); + w->computeAt(out_rf, -1); + + out->axis(0)->parallelize(ParallelType::BIDx); + out->axis(1)->parallelize(ParallelType::TIDz); + out->axis(4)->parallelize(ParallelType::Unswitch); + out->axis(5)->parallelize(ParallelType::TIDy); + out->axis(6)->parallelize(ParallelType::TIDx); + + scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); + + const int dim_h = 99; + const int dim_w = 101; + const int dim_c = 10; + const int dim_f = 20; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options); + at::Tensor at_w = at::randn({dim_f, dim_c, 4, 4}, options); + std::vector inputs = {at_inp, at_w}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); + auto cg_outputs = fe.runFusion(inputs); + + at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis + auto at_out = + at::conv2d(at_inp.to(at::kDouble), at_w.to(at::kDouble), {}, 4, {1, 1}); + at_out = at_out.squeeze(0); // drop the N axis + + testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); +} + // POC implementation of im2col for 3-by-3 kernels -TEST(NVFuserTest, FusionIm2Col_CUDA) { +TEST_F(NVFuserTest, FusionIm2Col_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3147,9 +3790,6 @@ TEST(NVFuserTest, FusionIm2Col_CUDA) { scheduler_utils::parallelizeAllLike(out, {inp_cache, inp_tile}); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int dim_h = 31; const int dim_w = 33; const int dim_c = 5; @@ -3160,6 +3800,8 @@ TEST(NVFuserTest, FusionIm2Col_CUDA) { at::Tensor at_inp = at::randn({dim_n, dim_c, dim_h, dim_w}, options); std::vector inputs = {at_inp}; + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto cg_outputs = 
fe.runFusion(inputs); auto at_out = at::im2col(at_inp, {3, 3}, {1, 1}, {1, 1}, {1, 1}); @@ -3171,14 +3813,14 @@ TEST(NVFuserTest, FusionIm2Col_CUDA) { testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftNoPadding1_CUDA) { +TEST_F(NVFuserTest, FusionShiftNoPadding1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {1, -1}, false); auto tv3 = shift(tv1, {-1, 1}, false); auto tv4 = add(tv2, tv3); @@ -3201,9 +3843,6 @@ TEST(NVFuserTest, FusionShiftNoPadding1_CUDA) { tv5->axis(-2)->parallelize(ParallelType::TIDy); scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion)); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; @@ -3211,6 +3850,9 @@ TEST(NVFuserTest, FusionShiftNoPadding1_CUDA) { at::manual_seed(0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -3226,14 +3868,14 @@ TEST(NVFuserTest, FusionShiftNoPadding1_CUDA) { } // Split and merge -TEST(NVFuserTest, FusionShiftNoPadding2_CUDA) { +TEST_F(NVFuserTest, FusionShiftNoPadding2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {1, -1}, false); auto tv3 = shift(tv1, {-1, 1}, false); auto tv4 = add(tv2, tv3); @@ -3256,9 +3898,6 @@ TEST(NVFuserTest, FusionShiftNoPadding2_CUDA) { tv5->axis(-1)->parallelize(ParallelType::TIDx); scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion)); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; @@ -3266,6 +3905,9 @@ TEST(NVFuserTest, FusionShiftNoPadding2_CUDA) { at::manual_seed(0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -3281,14 +3923,14 @@ TEST(NVFuserTest, FusionShiftNoPadding2_CUDA) { } // Split and merge, then welford -TEST(NVFuserTest, FusionShiftNoPadding3_CUDA) { +TEST_F(NVFuserTest, FusionShiftNoPadding3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {1, -1}, false); auto tv3 = shift(tv1, {-1, 1}, false); auto tv4 = add(tv2, tv3); @@ -3316,9 +3958,6 @@ TEST(NVFuserTest, FusionShiftNoPadding3_CUDA) { tv_avg->axis(-1)->parallelize(ParallelType::TIDx); scheduler_utils::parallelizeAllLike(tv_avg, ir_utils::allTvs(&fusion)); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; @@ -3327,7 +3966,11 @@ TEST(NVFuserTest, FusionShiftNoPadding3_CUDA) { at::manual_seed(0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); + outputs[1] /= (numel_x - 2) * (numel_y - 2); auto t1 = t0 + 1; @@ -3346,13 +3989,13 @@ TEST(NVFuserTest, FusionShiftNoPadding3_CUDA) { } // Shift indexing and predication with contiguous merge -TEST(NVFuserTest, FusionShiftNoPaddingContigMerge_CUDA) { +TEST_F(NVFuserTest, 
FusionShiftNoPaddingContigMerge_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {1, -1}, true); auto tv3 = shift(tv1, {-1, 1}, false); auto tv4 = add(tv2, tv3); @@ -3366,15 +4009,15 @@ TEST(NVFuserTest, FusionShiftNoPaddingContigMerge_CUDA) { tv2->setMemoryType(MemoryType::Global); tv3->setMemoryType(MemoryType::Global); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 9; int numel_y = 11; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); std::vector indices{ @@ -3392,14 +4035,14 @@ TEST(NVFuserTest, FusionShiftNoPaddingContigMerge_CUDA) { testValidate(&fusion, {fuser_out}, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftNoPaddingChain_CUDA) { +TEST_F(NVFuserTest, FusionShiftNoPaddingChain_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {1, -1}, false); auto tv3 = shift(tv2, {1, -1}, false); auto tv4 = sum(tv3, {0, 1}); @@ -3422,9 +4065,6 @@ TEST(NVFuserTest, FusionShiftNoPaddingChain_CUDA) { scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3}); - FusionExecutor fe; - fe.compileFusion(&fusion); - int numel_x = 99; int numel_y = 101; @@ -3433,6 +4073,9 @@ TEST(NVFuserTest, FusionShiftNoPaddingChain_CUDA) { at::manual_seed(0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -3447,14 +4090,14 @@ TEST(NVFuserTest, FusionShiftNoPaddingChain_CUDA) { } // Rfactor is not allowed with partial domains -TEST(NVFuserTest, FusionShiftNoPaddingRfactor_CUDA) { +TEST_F(NVFuserTest, FusionShiftNoPaddingRfactor_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {1, -1}, false); auto tv3 = sum(tv2, {0, 1}); fusion.addOutput(tv3); @@ -3466,7 +4109,61 @@ TEST(NVFuserTest, FusionShiftNoPaddingRfactor_CUDA) { ASSERT_ANY_THROW(tv3->rFactor({-2})); } -TEST(NVFuserTest, FusionPartialSplit1_CUDA) { +TEST_F(NVFuserTest, FusionShiftPadding1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + auto tv1 = add(tv0, IrBuilder::create(1)); + auto tv2 = shift(tv1, {2, -2}, {1, 1}); + auto tv3 = shift(tv1, {-3, 2}, {2, 2}); + auto tv4 = add(tv2, tv3); + auto tv5 = sum(tv4, {0, 1}); + + fusion.addOutput(tv5); + + tv1->setMemoryType(MemoryType::Shared); + + tv5->split(0, 4); + tv5->split(-1, 8); + tv5->reorder({{1, 2}}); + + TransformPropagator::from(tv5); + + tv2->computeAt(tv5, -1); + tv3->computeAt(tv5, -1); + + tv5->axis(-1)->parallelize(ParallelType::TIDx); + tv5->axis(-2)->parallelize(ParallelType::TIDy); + scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion)); + + int numel_x = 99; + int numel_y = 101; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::manual_seed(0); + at::Tensor t0 = at::randn({numel_x, numel_y}, options); + std::vector inputs 
= {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); + auto outputs = fe.runFusion(inputs); + + auto t1 = t0 + 1; + auto t2 = shift(t1, {2, -2}); + auto t3 = shift(t1, {-3, 2}); + auto t4 = t2 + t3; + std::vector indices{ + at::indexing::Slice(1, -1), at::indexing::Slice(0, -1)}; + t4 = t4.index(indices); + auto ref = t4.sum(at::ArrayRef{0, 1}); + + testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionPartialSplit1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3474,7 +4171,7 @@ TEST(NVFuserTest, FusionPartialSplit1_CUDA) { // [I] fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(0)); + auto tv1 = add(tv0, IrBuilder::create(0)); // [I] auto tv2 = shift(tv1, {1}, false); // [1:I] @@ -3504,9 +4201,6 @@ TEST(NVFuserTest, FusionPartialSplit1_CUDA) { tv1->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - // gridDim.x is ceilDiv(numel_x - 2, 8), not ceilDiv(numel_x, 8), // so it's going to be just 2 rather than 3. const int numel_x = 18; @@ -3527,6 +4221,9 @@ TEST(NVFuserTest, FusionPartialSplit1_CUDA) { at::manual_seed(0); at::Tensor t0 = at::randn({numel_x}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); std::vector indices{at::indexing::Slice(1, -1)}; @@ -3538,21 +4235,21 @@ TEST(NVFuserTest, FusionPartialSplit1_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionPartialSplit2_CUDA) { +TEST_F(NVFuserTest, FusionPartialSplit2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(0)); + auto tv1 = add(tv0, IrBuilder::create(0)); auto tv2 = shift(tv1, {1}, false); auto tv3 = shift(tv1, {-1}, false); auto tv4 = add(tv2, tv3); fusion.addOutput(tv4); - auto tv5 = add(tv1, new Double(1)); - auto tv6 = add(tv5, new Double(1)); + auto tv5 = add(tv1, IrBuilder::create(1)); + auto tv6 = add(tv5, IrBuilder::create(1)); fusion.addOutput(tv6); tv4->split(0, 4, true, true); @@ -3568,14 +4265,14 @@ TEST(NVFuserTest, FusionPartialSplit2_CUDA) { } // 2D version of PartialSplit1 -TEST(NVFuserTest, FusionPartialSplit3_CUDA) { +TEST_F(NVFuserTest, FusionPartialSplit3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(2); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(0)); + auto tv1 = add(tv0, IrBuilder::create(0)); auto tv2 = shift(tv1, {1, 2}, false); auto tv3 = shift(tv1, {-2, -1}, false); auto tv4 = add(tv2, tv3); @@ -3595,9 +4292,6 @@ TEST(NVFuserTest, FusionPartialSplit3_CUDA) { tv1->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int numel_x = 32 + 3; const int numel_y = 32 + 3; @@ -3606,6 +4300,9 @@ TEST(NVFuserTest, FusionPartialSplit3_CUDA) { at::manual_seed(0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); std::vector indices{ @@ -3620,7 +4317,7 @@ TEST(NVFuserTest, FusionPartialSplit3_CUDA) { // Almost same fusion with Shift5ptStencilChain but non-padded shift // and partial split. 
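+// The splits below are partial: only the interior region that survives the
+// non-padded shifts is covered, so the launch grid is computed from the
+// shrunken extents (see the ceilDiv comments inside the test).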
-TEST(NVFuserTest, FusionPartialSplit4_CUDA) { +TEST_F(NVFuserTest, FusionPartialSplit4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3641,7 +4338,8 @@ TEST(NVFuserTest, FusionPartialSplit4_CUDA) { tv_stencil1 = add(tv_stencil1, tv); } - tv_stencil1 = div(tv_stencil1, new Double(tv_stencil1_shifts.size() + 1)); + tv_stencil1 = div( + tv_stencil1, IrBuilder::create(tv_stencil1_shifts.size() + 1)); // Second stencil: Same 5pt stencil std::vector tv_stencil2_shifts; @@ -3654,7 +4352,8 @@ TEST(NVFuserTest, FusionPartialSplit4_CUDA) { tv_stencil2 = add(tv_stencil2, tv); } - tv_stencil2 = div(tv_stencil2, new Double(tv_stencil2_shifts.size() + 1)); + tv_stencil2 = div( + tv_stencil2, IrBuilder::create(tv_stencil2_shifts.size() + 1)); auto tv_out = tv_stencil2; @@ -3696,9 +4395,6 @@ TEST(NVFuserTest, FusionPartialSplit4_CUDA) { tv0_cache->setMemoryType(MemoryType::Shared); tv_stencil1->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - // Input matrix size is 68x68, and the output is 64x64. Both // gridDim.x and gridim.y should be ceilDiv(numel - 4, // split_factor), which is 4. If full split is used, the grid @@ -3709,6 +4405,9 @@ TEST(NVFuserTest, FusionPartialSplit4_CUDA) { auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); std::vector indices{ @@ -3731,7 +4430,7 @@ TEST(NVFuserTest, FusionPartialSplit4_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionPartialSplit5_CUDA) { +TEST_F(NVFuserTest, FusionPartialSplit5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3743,7 +4442,7 @@ TEST(NVFuserTest, FusionPartialSplit5_CUDA) { fusion.addInput(tv0); auto tv1 = shift(tv0, {0, 1}, false); - auto tv2 = add(tv1, new Double(1)); + auto tv2 = add(tv1, IrBuilder::create(1)); fusion.addOutput(tv2); @@ -3760,12 +4459,12 @@ TEST(NVFuserTest, FusionPartialSplit5_CUDA) { tv1->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x, numel_y}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); std::vector indices{ @@ -3779,7 +4478,7 @@ TEST(NVFuserTest, FusionPartialSplit5_CUDA) { testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionPartialSplit6_CUDA) { +TEST_F(NVFuserTest, FusionPartialSplit6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3788,9 +4487,9 @@ TEST(NVFuserTest, FusionPartialSplit6_CUDA) { auto tv0 = makeConcreteTensor({numel_x}); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {1}, false); - auto tv3 = add(tv2, new Double(1)); + auto tv3 = add(tv2, IrBuilder::create(1)); fusion.addOutput(tv3); @@ -3803,12 +4502,12 @@ TEST(NVFuserTest, FusionPartialSplit6_CUDA) { tv1->setMemoryType(MemoryType::Shared); tv2->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x}, options); std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); std::vector indices{ @@ 
-3821,7 +4520,7 @@ TEST(NVFuserTest, FusionPartialSplit6_CUDA) { testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionShiftUnswitch1_CUDA) { +TEST_F(NVFuserTest, FusionShiftUnswitch1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3840,7 +4539,7 @@ TEST(NVFuserTest, FusionShiftUnswitch1_CUDA) { auto tv4 = shift(tv0, {-2, -2}); fusion.addOutput(tv4); - auto tv5 = add(tv0, new Double(1)); + auto tv5 = add(tv0, IrBuilder::create(1)); auto tv6 = shift(tv5, {0, -1}); fusion.addOutput(tv6); @@ -3862,7 +4561,7 @@ TEST(NVFuserTest, FusionShiftUnswitch1_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = shift(t0, {-1, 0}); @@ -3881,27 +4580,22 @@ TEST(NVFuserTest, FusionShiftUnswitch1_CUDA) { TORCH_CHECK(t6.equal(outputs[4])); } -TEST(NVFuserTest, FusionGatherUnswitch1_CUDA) { +TEST_F(NVFuserTest, FusionGatherUnswitch1_CUDA) { + const int tv1_gather = 3; + const int tv1_gather_pad = 1; + const int tv2_gather = 5; + const int tv2_gather_pad = 2; + Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1_gather_param = new Int(); - fusion.addInput(tv1_gather_param); - auto tv1_gather_pad_param = new Int(); - fusion.addInput(tv1_gather_pad_param); - auto tv1 = gather( - tv0, {tv1_gather_param}, {{tv1_gather_pad_param, tv1_gather_pad_param}}); + auto tv1 = gather(tv0, {tv1_gather}, {{tv1_gather_pad, tv1_gather_pad}}); fusion.addOutput(tv1); - auto tv2_gather_param = new Int(); - fusion.addInput(tv2_gather_param); - auto tv2_gather_pad_param = new Int(); - fusion.addInput(tv2_gather_pad_param); - auto tv2 = gather( - tv0, {tv2_gather_param}, {{tv2_gather_pad_param, tv2_gather_pad_param}}); + auto tv2 = gather(tv0, {tv2_gather}, {{tv2_gather_pad, tv2_gather_pad}}); fusion.addOutput(tv2); // Static gather @@ -3923,18 +4617,13 @@ TEST(NVFuserTest, FusionGatherUnswitch1_CUDA) { tv4->axis(1)->parallelize(ParallelType::TIDx); const int numel_x = 100; - const int tv1_gather = 3; - const int tv1_gather_pad = 1; - const int tv2_gather = 5; - const int tv2_gather_pad = 2; auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); at::Tensor t0 = at::randn({numel_x}, options); - std::vector inputs = { - t0, tv1_gather, tv1_gather_pad, tv2_gather, tv2_gather_pad}; + std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = gather(t0, {tv1_gather}, {{tv1_gather_pad, tv1_gather_pad}}); @@ -3950,7 +4639,7 @@ TEST(NVFuserTest, FusionGatherUnswitch1_CUDA) { TORCH_CHECK(t4.equal(outputs[3])); } -TEST(NVFuserTest, FusionGatherStrided1_CUDA) { +TEST_F(NVFuserTest, FusionGatherStrided1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -3973,7 +4662,7 @@ TEST(NVFuserTest, FusionGatherStrided1_CUDA) { at::Tensor t0 = at::randn({s1, s2}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto outputs = fe.runFusion({t0}); // tv1 has a stride dimension, so its number of dimensions should be @@ -4013,7 +4702,7 @@ TEST(NVFuserTest, FusionGatherStrided1_CUDA) { } // Split strided domain -TEST(NVFuserTest, FusionGatherStrided2_CUDA) { +TEST_F(NVFuserTest, FusionGatherStrided2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4024,7 +4713,7 @@ TEST(NVFuserTest, FusionGatherStrided2_CUDA) { auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, 
new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = gather(tv1, window_shape, padding_width, strides); @@ -4054,7 +4743,7 @@ TEST(NVFuserTest, FusionGatherStrided2_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -4065,7 +4754,7 @@ TEST(NVFuserTest, FusionGatherStrided2_CUDA) { } // Outer split -TEST(NVFuserTest, FusionGatherStrided3_CUDA) { +TEST_F(NVFuserTest, FusionGatherStrided3_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4076,7 +4765,7 @@ TEST(NVFuserTest, FusionGatherStrided3_CUDA) { auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = gather(tv1, window_shape, padding_width, strides); @@ -4102,7 +4791,7 @@ TEST(NVFuserTest, FusionGatherStrided3_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -4112,7 +4801,7 @@ TEST(NVFuserTest, FusionGatherStrided3_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionGatherStrided4_CUDA) { +TEST_F(NVFuserTest, FusionGatherStrided4_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4123,7 +4812,7 @@ TEST(NVFuserTest, FusionGatherStrided4_CUDA) { auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); // Test propagation of split from one gather output to another auto tv2 = gather(tv1, window_shape, padding_width, strides); @@ -4147,7 +4836,7 @@ TEST(NVFuserTest, FusionGatherStrided4_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -4158,7 +4847,7 @@ TEST(NVFuserTest, FusionGatherStrided4_CUDA) { } // Same as GatherStrided1 but with stride != window -TEST(NVFuserTest, FusionGatherStrided5_CUDA) { +TEST_F(NVFuserTest, FusionGatherStrided5_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4181,7 +4870,7 @@ TEST(NVFuserTest, FusionGatherStrided5_CUDA) { at::Tensor t0 = at::randn({s1, s2}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto outputs = fe.runFusion({t0}); auto ref = gather(t0, window_shape, padding_width, strides); @@ -4190,7 +4879,7 @@ TEST(NVFuserTest, FusionGatherStrided5_CUDA) { } // Same as GatherStrided2 but with stride != window -TEST(NVFuserTest, FusionGatherStrided6_CUDA) { +TEST_F(NVFuserTest, FusionGatherStrided6_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4201,7 +4890,7 @@ TEST(NVFuserTest, FusionGatherStrided6_CUDA) { auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = gather(tv1, window_shape, padding_width, strides); @@ -4231,7 +4920,7 @@ TEST(NVFuserTest, FusionGatherStrided6_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -4242,7 +4931,7 @@ TEST(NVFuserTest, FusionGatherStrided6_CUDA) { } // Same as GatherStrided4 but different strides -TEST(NVFuserTest, FusionGatherStrided7_CUDA) { +TEST_F(NVFuserTest, FusionGatherStrided7_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4252,7 +4941,7 @@ TEST(NVFuserTest, 
FusionGatherStrided7_CUDA) { auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); // Use different strides auto tv2 = gather(tv1, window_shape, padding_width, {3}); @@ -4271,7 +4960,7 @@ TEST(NVFuserTest, FusionGatherStrided7_CUDA) { } // Same as GatherStrided2 but with unswitch -TEST(NVFuserTest, FusionGatherStrided8_CUDA) { +TEST_F(NVFuserTest, FusionGatherStrided8_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4282,7 +4971,7 @@ TEST(NVFuserTest, FusionGatherStrided8_CUDA) { auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = gather(tv1, window_shape, padding_width, strides); @@ -4316,7 +5005,7 @@ TEST(NVFuserTest, FusionGatherStrided8_CUDA) { std::vector inputs = {t0}; FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto t1 = t0 + 1; @@ -4327,7 +5016,7 @@ TEST(NVFuserTest, FusionGatherStrided8_CUDA) { } // Chained strided gather. Not supported yet. -TEST(NVFuserTest, FusionGatherStridedChain_CUDA) { +TEST_F(NVFuserTest, FusionGatherStridedChain_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4339,7 +5028,7 @@ TEST(NVFuserTest, FusionGatherStridedChain_CUDA) { auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = gather(tv1, window_shape, padding_width, strides); // Reduce gathered window @@ -4356,10 +5045,7 @@ TEST(NVFuserTest, FusionGatherStridedChain_CUDA) { ASSERT_ANY_THROW(GpuLower gpulw(&fusion)); } -TEST(NVFuserTest, FusionMaxPoolingStrided_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionMaxPoolingStrided_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4379,7 +5065,7 @@ TEST(NVFuserTest, FusionMaxPoolingStrided_CUDA) { auto max_tensor = reductionOp( BinaryOpType::Max, {-3, -2, -1}, - new Double(std::numeric_limits::lowest()), + IrBuilder::create(std::numeric_limits::lowest()), inp_tile); fusion.addOutput(max_tensor); @@ -4410,9 +5096,6 @@ TEST(NVFuserTest, FusionMaxPoolingStrided_CUDA) { inp_cache->setMemoryType(MemoryType::Shared); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int hw = 50; const int num_channels = 20; const int pooling_window = 3; @@ -4428,6 +5111,8 @@ TEST(NVFuserTest, FusionMaxPoolingStrided_CUDA) { aten_inp = at::abs(aten_inp); std::vector inputs = {aten_inp}; + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto outputs = fe.runFusion(inputs); auto ref = at::max_pool2d( @@ -4436,10 +5121,7 @@ TEST(NVFuserTest, FusionMaxPoolingStrided_CUDA) { testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionConv2DStaticStrided_CUDA) { - if (at::cuda::getDeviceProperties(0)->major < 6) { - return; - } +TEST_F(NVFuserTest, FusionConv2DStaticStrided_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4518,9 +5200,6 @@ TEST(NVFuserTest, FusionConv2DStaticStrided_CUDA) { scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf}); - FusionExecutor fe; - fe.compileFusion(&fusion); - const int dim_h = 99; const int dim_w = 101; const int dim_c = 10; @@ -4532,6 +5211,8 @@ TEST(NVFuserTest, FusionConv2DStaticStrided_CUDA) { at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options); std::vector inputs = {at_inp, at_w}; + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); auto cg_outputs = 
fe.runFusion(inputs); at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis @@ -4541,14 +5222,14 @@ TEST(NVFuserTest, FusionConv2DStaticStrided_CUDA) { testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionNonDivisibleHalo1_CUDA) { +TEST_F(NVFuserTest, FusionNonDivisibleHalo1_CUDA) { Fusion fusion; FusionGuard fg(&fusion); auto tv0 = makeSymbolicTensor(1); fusion.addInput(tv0); - auto tv1 = add(tv0, new Double(1)); + auto tv1 = add(tv0, IrBuilder::create(1)); auto tv2 = shift(tv1, {-1}); fusion.addOutput(tv2); @@ -4564,7 +5245,7 @@ TEST(NVFuserTest, FusionNonDivisibleHalo1_CUDA) { at::Tensor t0 = at::randn({24}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); auto ref = shift((t0 + 1), {-1}); @@ -4572,7 +5253,7 @@ TEST(NVFuserTest, FusionNonDivisibleHalo1_CUDA) { testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__); } -TEST(NVFuserTest, FusionNonDivisibleHalo2_CUDA) { +TEST_F(NVFuserTest, FusionNonDivisibleHalo2_CUDA) { Fusion fusion; FusionGuard fg(&fusion); @@ -4621,7 +5302,7 @@ TEST(NVFuserTest, FusionNonDivisibleHalo2_CUDA) { at::Tensor t0 = at::randn({111, 222}, options); FusionExecutor fe; - fe.compileFusion(&fusion); + fe.compileFusion(&fusion, {t0}); auto cg_outputs = fe.runFusion({t0}); auto t1 = gather(t0, {3, 3}, {{1, 1}, {1, 1}}); @@ -4632,6 +5313,59 @@ TEST(NVFuserTest, FusionNonDivisibleHalo2_CUDA) { testValidate(&fusion, cg_outputs, {t0}, {t4}, __LINE__, __FILE__); } +TEST_F(NVFuserTest, FusionGather9ptStencilDoubleBuffering_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(2); + fusion.addInput(tv0); + + auto tv1 = gather(tv0, {3, 3}, {{1, 1}, {1, 1}}); + auto tv2 = sum(tv1, {-2, -1}); + auto tv3 = div(tv2, IrBuilder::create(9)); + + auto out = tv3; + + fusion.addOutput(out); + + auto tv0_cache = tv0->cache_after(); + + tv0_cache->setMemoryType(MemoryType::Shared); + + out->split(-2, 4); + out->split(-1, 32); + out->reorder({{1, 2}, {2, 1}}); + TransformPropagator::from(out); + + tv0->computeAt(out, 2); + + out->axis(3)->parallelize(ParallelType::TIDx); + out->axis(2)->parallelize(ParallelType::TIDy); + out->axis(0)->parallelize(ParallelType::BIDx); + + scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion)); + + tv0_cache->doubleBuffer(); + + int numel_x = 99; + int numel_y = 101; + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({numel_x, numel_y}, options); + std::vector inputs = {t0}; + + FusionExecutor fe; + fe.compileFusion(&fusion, inputs); + auto outputs = fe.runFusion(inputs); + + auto t1 = gather(t0, {3, 3}, {{1, 1}, {1, 1}}); + auto t2 = sum(t1, {-2, -1}); + auto t3 = t2 / 9; + auto ref = t3; + + testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__); +} + } // namespace jit } // namespace torch #endif // #if defined(USE_CUDA) diff --git a/test/cpp/jit/test_gpu_validator.h b/test/cpp/jit/test_gpu_validator.h index 5923e384e39..4b01f361cfc 100644 --- a/test/cpp/jit/test_gpu_validator.h +++ b/test/cpp/jit/test_gpu_validator.h @@ -4,6 +4,7 @@ #include #include +#include #include namespace torch { @@ -11,6 +12,25 @@ namespace jit { namespace fuser { namespace cuda { +inline bool deviceMajorMinorCheck(int major, int minor = 0) { + auto dev_prop = at::cuda::getDeviceProperties(0); + if (dev_prop->major < major || + (dev_prop->major == major && dev_prop->minor < minor)) { + return false; + } + return true; +} 
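+
+// Illustrative usage only (not part of this change): a test that requires an
+// even newer architecture could guard itself the same way, e.g.
+//   if (!deviceMajorMinorCheck(7)) {
+//     GTEST_SKIP() << "skipping test on pre-VOLTA GPUs";
+//   }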
+ +class NVFuserTest : public ::testing::Test { + protected: + void SetUp() override { + // requires PASCAL or newer + if (!deviceMajorMinorCheck(6)) { + GTEST_SKIP() << "skipping tests on pre-PASCAL GPUs"; + } + } +}; + struct ValidationConstants { // Tolerances generated from randn + add + sum fusion // compared against double precision @@ -66,8 +86,8 @@ std::pair getTolerance( } else { // Reduction case size_t entry = 0; - while (sum_tolerance_entry[entry][0] < reduction_size && - entry < sum_tolerance_entry.size()) { + while (entry < sum_tolerance_entry.size() && + sum_tolerance_entry[entry][0] < reduction_size) { entry++; } double abs_tol = 0.0; @@ -221,7 +241,7 @@ class ReductionSizeMapper : private IterVisitor { } void handle(Expr* expr) override { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { return; } diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py index 1d78a19c5ad..299c738c570 100644 --- a/test/test_jit_cuda_fuser.py +++ b/test/test_jit_cuda_fuser.py @@ -3,6 +3,10 @@ import unittest import os import random +import enum +import copy +from functools import reduce +import operator import torch from torch.nn import functional @@ -20,6 +24,8 @@ import itertools import numpy as np import math +from torch.autograd.gradcheck import gradcheck + from typing import List CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')) @@ -465,21 +471,25 @@ class TestCudaFuser(JitTestCase): self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) def _unary_test_helper(self, operation, dtype, random_data): - shape = (4, 8, 32, 32) + gradient_check = (dtype == torch.float64) and random_data + shape = (8, 7) + torch.cuda.manual_seed_all(211) # need additional def of t for boolean ops def t(x: torch.Tensor, y: torch.Tensor): o = x * y + o = o + 5e-3 o = operation(o) return o - y = torch.tensor([1], device="cuda").to(dtype) + y = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check) + y = y.to(dtype=dtype) if random_data: - x = torch.randn(shape, dtype=torch.float32, device="cuda") + x = torch.rand(shape, dtype=torch.float32, device="cuda", requires_grad=gradient_check) if dtype in self.int_types: # prefer a larger variance for integer types - x *= 5 + x = x * 5 x = x.to(dtype=dtype) else: x = self.special_values.to(dtype=dtype) @@ -491,14 +501,14 @@ class TestCudaFuser(JitTestCase): t_jit = torch.jit.script(t) jit_o = t_jit(x, y) jit_o = t_jit(x, y) - if dtype in self.support_tensor_dtypes: + jit_o = t_jit(x, y) + if gradient_check: + gradcheck(t_jit, [x, y], nondet_tol=1e-5) + elif dtype in self.support_tensor_dtypes: self.assertGraphContains(t_jit.graph_for(x, y), FUSION_GUARD) o = t(x, y) self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o, msg=f""" - failing case: - {dtype} {operation} {x} - """) + self.assertTrue(self._compare("failing case {}\n{}\n{}\n{}".format(dtype, operation, x, y), o, jit_o, 1e-2)) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, @@ -651,6 +661,16 @@ class TestCudaFuser(JitTestCase): o = o + z return o + def t_int(x: torch.Tensor, y: torch.Tensor): + o = operation(x, y) + o = 2 + o + return o + + def t_float(x: torch.Tensor, y: torch.Tensor): + o = operation(x, y) + o = 2. 
+ o + return o + shape = (4, 32, 32) if random_data: x = (torch.randn(shape, dtype=torch.float, device="cuda") * 5).to(dtype_arg1) @@ -665,14 +685,16 @@ class TestCudaFuser(JitTestCase): if operation in div_like and (dtype_arg2 == torch.int32 or dtype_arg2 == torch.int64): y[y == 0] = 1 - o = t(x, y, z) - t_jit = torch.jit.script(t) - jit_o = t_jit(x, y, z) - jit_o = t_jit(x, y, z) + for test_fn in [t, t_int, t_float]: + o = t(x, y, z) + t_jit = torch.jit.script(t) + jit_o = t_jit(x, y, z) + jit_o = t_jit(x, y, z) + jit_o = t_jit(x, y, z) - self.assertEqual(o.dtype, jit_o.dtype) - self.assertEqual(o, jit_o) - self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) + self.assertEqual(o.dtype, jit_o.dtype) + self.assertEqual(o, jit_o) + self.assertGraphContains(t_jit.graph_for(x, y, z), FUSION_GUARD) @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, @@ -887,6 +909,21 @@ class TestCudaFuser(JitTestCase): self._ternary_test_helper(op, dtypes, True) # random data self._ternary_test_helper(op, dtypes, False) # special numbers + # We can't test the scalar version of rsub from python + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") + def test_rsub(self): + x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda") + y = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda") + + def rsub(x: torch.Tensor, y: torch.Tensor): + o = torch.rsub(x, y) + o = o * 2. + return o + + rsub_jit = torch.jit.script(rsub) + self._run_helper(rsub_jit, rsub, x, y) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") # legacy fuser does not work for rand_like, see issue #34361 @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @@ -1008,6 +1045,8 @@ class TestCudaFuser(JitTestCase): torch._C._jit_set_nvfuser_guard_mode(old_guard) @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") def test_random_topo(self): os.environ["PYTORCH_NVFUSER_DISABLE_FALLBACK"] = "1" self.assertTrue(runDefaultTestWithSeed(28449)) @@ -1272,7 +1311,6 @@ class TestCudaFuser(JitTestCase): self.assertTrue(self._compare("comparing rstd failed", rstd, jit_rstd, error)) self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) - @unittest.skipIf(True, "codegen failure awaiting fix") @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, @@ -1287,7 +1325,6 @@ class TestCudaFuser(JitTestCase): norm_shape = [input_shape[idx] for idx in range(dims - offset, dims)] self._native_layer_norm_helper(input_shape, norm_shape, torch.float32, "cuda", 1e-4, affine) - @unittest.skipIf(True, "codegen failure awaiting fix") @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, @@ -2306,7 +2343,7 @@ class TestCudaFuser(JitTestCase): o = x * 2.0 o = torch.softmax(o, dim=-1) o = o * 3.0 - o = torch.matmul(o, y) + o = torch._C._nn.linear(o, y) return o x = torch.randn(8, 4, dtype=torch.half, device='cuda', requires_grad=True) @@ -2380,7 +2417,7 @@ class TestCudaFuser(JitTestCase): o = x * 2.0 o = torch.softmax(o, dim=-1) o = o * 3.0 - o = 
torch.matmul(o, y) + o = torch._C._nn.linear(o, y) return o x = torch.randn(8, 4, dtype=torch.bfloat16, device='cuda', requires_grad=True) @@ -2731,8 +2768,8 @@ class TestCudaFuser(JitTestCase): track_running_stats=track_running_stats).to(dtype=dtype) def forward(self, x): - o = x * 2.0 - o = self.bn(o) + o = self.bn(x) + o = o * 2.0 return o x = torch.randn(batch, c, hw, hw, dtype=torch.float, device="cuda").to(dtype=dtype).requires_grad_() @@ -3055,6 +3092,7 @@ class TestCudaFuser(JitTestCase): for op in ops: self.assertGraphContainsExactly(graph, op, 0) + @unittest.skipIf(is_pre_volta(), "reduction not supported in pre volta device") @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @@ -3068,7 +3106,7 @@ class TestCudaFuser(JitTestCase): o1 = x + 1.0 o2 = x * 0.5 return o1, o2 - self._run_fwd_helper(t, ['aten::add'], x) + self._run_fwd_helper(t, ['aten::add', 'aten::mul'], x) def t2(x: torch.Tensor, y: torch.Tensor): o1 = x.sum(0) @@ -3076,7 +3114,6 @@ class TestCudaFuser(JitTestCase): return o1, o2 self._run_fwd_helper(t2, ['aten::sum', 'aten::mul'], x, y) - @unittest.skipIf(True, "Fixed in PR #68804") @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @@ -3120,6 +3157,343 @@ class TestCudaFuser(JitTestCase): graph = jitted.graph_for(x, y) self.assertGraphContainsExactly(graph, FUSION_GROUP, 0) + def _bias_view_relu_helper(self, shape, output_shape, dtype, device, error): + class BiasViewRelu(torch.nn.Module): + def __init__(self): + super(BiasViewRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs : torch.Tensor, view_shape : List[int]): + o = inputs + self.bias + o = o.view(view_shape) + return torch.relu(o) + + t = BiasViewRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + # profiling + jit_o = t_jit(x, output_shape) + # optimization + jit_o = t_jit(x, output_shape) + # final + jit_o = t_jit(x, output_shape) + # eager - baseline + o = t(x, output_shape) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, output_shape) + + has_inferred_dimension = any([dim == -1 for dim in output_shape]) + if has_inferred_dimension: + # prohibit fusing when view_shape contains an inferred dimension + self.assertGraphContainsExactly(graph, FUSION_GROUP, 0) + self.assertGraphContainsExactly(graph, 'prim::view_copy', 0) + else: + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::view_copy', True) + + def _alias_bias_view_relu_helper(self, shape, output_shape, dtype, device, error): + class BiasViewRelu(torch.nn.Module): + def __init__(self): + super(BiasViewRelu, self).__init__() + self.bias = torch.nn.Parameter(torch.randn(shape, dtype=dtype, device=device), requires_grad=False) + with torch.no_grad(): + self.bias.fill_(10) + + def forward(self, inputs : torch.Tensor, view_shape : List[int]): + o = inputs.view(view_shape) + inputs = inputs * self.bias + return torch.relu(o) + + t = BiasViewRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + # profiling + jit_o = t_jit(x, output_shape) 
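+        # (the scripted module is run repeatedly on purpose: the profiling
+        # executor needs these warm-up runs before the fusion shows up in the
+        # graph checked below)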
+ # optimization + jit_o = t_jit(x, output_shape) + # final + jit_o = t_jit(x, output_shape) + # eager - baseline + o = t(x, output_shape) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, output_shape) + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::view_copy', 0) + + # generate random view given original view + def _random_view(self, original_view, max_len=8, max_views=10000): + class Moves(enum.Enum): + Merge = 0 + Split = 1 + Broadcast = 2 + ImplicitBroadcast = 3 + Keep = 4 + + def valid(old_view, new_view): + old_view_size = reduce(operator.mul, old_view) + new_view_size = reduce(operator.mul, new_view) + return old_view_size == new_view_size + + # given a random starting number, find the nearest divisor + def find_nearest_divisor(N): + if 2 >= (N - 1): + return -1 + result = random.randint(2, N - 1) + while (N % result) != 0: + result += 1 + return result + + complete_views = set([tuple(original_view)]) + + to_visit = [] + # empty new view, curent originaal view, start pos=0, move count = 0, last_move + to_visit.append(([], original_view, 0, [], Moves.Keep)) + + # depth-first search of view shapes, starting from the original view + while len(to_visit) > 0 and len(complete_views) < max_views: + new_view, old_view, odx, move_list, last_move = to_visit[-1] + to_visit.pop() + + # iterate over each move type + for idx in range(len(Moves)): + state = Moves(idx) + new_view_clone = copy.deepcopy(new_view) + old_view_clone = copy.deepcopy(old_view) + new_move_list = move_list + [state] + new_odx = odx + + # Update state using Move state + if state == Moves.Keep: + new_size = old_view_clone[odx] + new_view_clone.append(new_size) + new_odx += 1 + + elif state == Moves.Merge: + if odx + 1 < len(old_view_clone): + new_size = old_view_clone[odx] * old_view_clone[odx + 1] + new_view_clone.append(new_size) + new_odx += 2 + else: + continue + + elif state == Moves.Broadcast and last_move != Moves.Broadcast: + new_view_clone.append(1) + + elif state == Moves.Split: + new_size = find_nearest_divisor(old_view_clone[odx]) + if new_size == -1: + continue + new_view_clone.append(new_size) + old_view_clone[odx] = int(old_view[odx] / new_size) + + if old_view_clone[odx] == 1: + new_odx += 1 + + elif state == Moves.ImplicitBroadcast: + old_view_clone.insert(odx + 1, 1) + new_size = old_view[odx] * 1 + new_view_clone.append(new_size) + new_odx += 2 + + if new_odx < len(old_view_clone) and len(new_move_list) < max_len: + to_visit.append((new_view_clone, old_view_clone, new_odx, new_move_list, state)) + elif (valid(original_view, new_view_clone)): + final_new_view = tuple(new_view_clone) + complete_views.add(final_new_view) + return list(complete_views) + + # ndims - number of dimensions + # test_fn - view test function + def _view_test_generator(self, ndims, test_fn): + # create random tensor + # max value for each dimension + max_size = 10e7 + max_value = max(int(pow(max_size, 1. 
/ ndims)), 1) + sizes = [random.randint(1, max_value) for idx in range(ndims)] + x = torch.randn(sizes) + + original_sizes = list(x.size()) + all_views = self._random_view(original_sizes) + random.shuffle(all_views) + + max_samples = 20 + max_views = min(len(all_views), max_samples) + total = 0 + correct = 0 + # test random combinations of compatible views + for idx in range(max_views): + for jdx in range(idx + 1, max_views): + total += 1 + test_fn(all_views[idx], all_views[jdx], torch.float, 'cuda', 1e-6) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_view(self): + torch._C._jit_set_nvfuser_guard_mode(True) + self._bias_view_relu_helper([2, 3, 4, 5], [-1, 4, 5], torch.float, 'cuda', 1e-6) + for ndims in range(1, 5): + self._view_test_generator(ndims, self._bias_view_relu_helper) + self._alias_bias_view_relu_helper([2, 3, 4, 5], [1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6) + + def _bias_squeeze_relu_helper(self, shape, dtype, device, error): + class BiasSqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasSqueezeRelu, self).__init__() + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor): + o = inputs + bias + o = torch.squeeze(o) + return torch.relu(o) + + t = BiasSqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + o = t(x, bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x) + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::squeeze_copy', True) + + def _alias_bias_squeeze_relu_helper(self, shape, dtype, device, error): + class BiasSqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasSqueezeRelu, self).__init__() + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor): + o = torch.squeeze(inputs) + inputs = inputs * bias + return torch.relu(o) + + t = BiasSqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + o = t(x, bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x, bias) + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::squeeze_copy', 0) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_squeeze(self): + self._bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6) + self._alias_bias_squeeze_relu_helper([1, 6, 1, 2, 2, 5, 1], torch.float, 'cuda', 1e-6) + + def _bias_unsqueeze_relu_helper(self, shape, dtype, device, error): + class BiasUnsqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasUnsqueezeRelu, self).__init__() + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor): + o = inputs + bias + o = torch.unsqueeze(o, 0) + return torch.relu(o) + + t = BiasUnsqueezeRelu() + x = torch.randn(shape, dtype=dtype, 
device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + o = t(x, bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x) + self.assertGraphContains(graph, FUSION_GUARD) + self.assertGraphContains(graph, 'prim::unsqueeze_copy', True) + + def _alias_bias_unsqueeze_relu_helper(self, shape, dtype, device, error): + class BiasUnsqueezeRelu(torch.nn.Module): + def __init__(self): + super(BiasUnsqueezeRelu, self).__init__() + + def forward(self, inputs : torch.Tensor, bias : torch.Tensor): + o = torch.squeeze(inputs) + o = torch.unsqueeze(inputs, 0) + inputs = inputs * bias + return torch.relu(o) + + t = BiasUnsqueezeRelu() + x = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + bias = torch.randn(shape, dtype=dtype, device=device, requires_grad=False) + t_jit = torch.jit.script(t) + + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + jit_o = t_jit(x, bias) + o = t(x, bias) + + self.assertEqual(o.dtype, jit_o.dtype) + self.assertTrue(self._compare("comparing output failed", o, jit_o, error)) + graph = t_jit.graph_for(x) + self.assertGraphContainsExactly(graph, FUSION_GUARD, 0) + self.assertGraphContainsExactly(graph, 'prim::unsqueeze_copy', 0) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_unsqueeze(self): + self._bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6) + self._alias_bias_unsqueeze_relu_helper([2, 3, 4, 5], torch.float, 'cuda', 1e-6) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_alias_pass_fix(self): + x = torch.randn(4, 24, 2, 2, dtype=torch.float, device="cuda") + w = torch.randn(24, 24, 1, 1, dtype=torch.float, device="cuda") + b = torch.randn(24, dtype=torch.float, device="cuda") + + def t(x, w, b): + b2 = b + 1.0 + o = torch.conv2d(x, w, b2) + return o + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, w, b) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_squeeze_negative_dim(self): + x = torch.randn(4, 24, 1, 2, dtype=torch.float, device="cuda") + + def t(x): + o = x + 1.0 + o = o.squeeze(-2) + o = o * 2.0 + return o + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires fusion optimization pass to be effective") @@ -3154,6 +3528,138 @@ class TestCudaFuser(JitTestCase): # sibling fusion should be disabled with the flag self.assertGraphContainsExactly(t_jit.graph_for(x, y, s), FUSION_GUARD, 0) + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_build_shape_expression_native_dropout(self): + x = torch.randn(4, 2, device="cuda") + + def t(x): + o, mask = torch.native_dropout(x, 0.0, True) + o1 = o.sigmoid() + o2 = mask.float().sigmoid() + return (o1, o2) + + t_jit = torch.jit.script(t) + + jit_o = t_jit(x) + jit_o = t_jit(x) + o = 
t(x) + for oo, jit_oo in zip(o, jit_o): + self.assertEqual(oo.dtype, jit_oo.dtype) + self.assertEqual(oo, jit_oo) + self.assertGraphContains(t_jit.graph_for(x), FUSION_GUARD) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_scalar_tensor_permuted(self): + x = torch.randn(4, 2, 3, device="cuda").permute([1, 2, 0]) + y = torch.tensor(1.0, device="cuda") + + with nvfuser_singleton_fusion(True): + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_cpu_scalar(self): + x = torch.randn(4, 2, 3, device="cuda") + y = torch.tensor(1.0, device="cpu") + z = torch.tensor(2.0, device="cpu") + + with nvfuser_singleton_fusion(True): + # testing cpu scalar tensor promotion + def t(x, y): + return x + y + + t_jit = torch.jit.script(t) + self._run_helper(t_jit, t, x, y) + + # scalar cpu tensor add should NOT be fused + @torch.jit.script + def t1(y, z): + return y * z + for _ in range(5): + t1(y, z) + self.assertGraphContainsExactly(t1.graph_for(y, z), FUSION_GUARD, 0) + + # everything, including scalar cpu tensor add should be fused + @torch.jit.script + def t2(x, y, z): + tmp = y + z + return tmp + x + for _ in range(5): + t2(x, y, z) + self.assertGraphContainsExactly(t2.graph_for(x, y, z), 'aten::add', 0) + self.assertGraphContainsExactly(t2.graph_for(x, y, z), FUSION_GUARD, 1) + + # 'cpu_tmp = y + z' shouldn't be fused. + @torch.jit.script + def t3(x, y, z): + cpu_tmp = y + z + out = x + y + return cpu_tmp, out + for _ in range(5): + t3(x, y, z) + self.assertGraphContainsExactly(t3.graph_for(x, y, z), FUSION_GUARD, 1) + self.assertGraphContainsExactly(t3.graph_for(x, y, z), 'aten::add', 1) + + @unittest.skipIf(not RUN_CUDA, "requires CUDA") + @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, + "Requires fusion optimization pass to be effective") + def test_shape_expression(self): + x = torch.randn(4, 2, 1, 3, device="cuda") + + def t_unsqueeze(x): + t0 = x.relu() + t1 = t0.unsqueeze(1) + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + def t_squeeze(x): + t0 = x.relu() + t1 = t0.squeeze() + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + def t_squeeze_dim(x): + t0 = x.relu() + t1 = t0.squeeze(-2) + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + # squeezing a non-size 1 dimension should be a no op + def t_squeeze_dim_no_op(x): + t0 = x.relu() + t1 = t0.squeeze(1) + t2 = t1 + 1.0 + t3 = t1.size() + return t2, t3 + + def run(fn): + jit_fn = torch.jit.script(fn) + jit_o = jit_fn(x) + jit_o = jit_fn(x) + jit_o = jit_fn(x) + o = fn(x) + # output 0 is a tensor, so we check dtype and value + self.assertEqual(o[0].dtype, jit_o[0].dtype) + self.assertEqual(o[0], jit_o[0]) + # output 1 is shape + self.assertEqual(o[1], jit_o[1]) + self.assertGraphContainsExactly(jit_fn.graph_for(x), FUSION_GUARD, 1) + + for t in [t_unsqueeze, t_squeeze, t_squeeze_dim, t_squeeze_dim_no_op]: + run(t) + class TestPassManagerCudaFuser(JitTestCase): @unittest.skipIf(not RUN_CUDA, "requires CUDA") diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index da78a7ceb4b..f63e4ea1668 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -635,7 +635,9 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/index_reference_replay.cpp", 
"torch/csrc/jit/codegen/cuda/instrumentation.cpp", "torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp", + "torch/csrc/jit/codegen/cuda/ir_builder.cpp", "torch/csrc/jit/codegen/cuda/ir_cloner.cpp", + "torch/csrc/jit/codegen/cuda/ir_container.cpp", "torch/csrc/jit/codegen/cuda/ir_graphviz.cpp", "torch/csrc/jit/codegen/cuda/ir_nodes.cpp", "torch/csrc/jit/codegen/cuda/ir_iostream.cpp", @@ -645,28 +647,32 @@ libtorch_cuda_core_sources = [ "torch/csrc/jit/codegen/cuda/kernel_cache.cpp", "torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp", "torch/csrc/jit/codegen/cuda/kernel_ir.cpp", - "torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp", - "torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp", + "torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp", "torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp", - "torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp", "torch/csrc/jit/codegen/cuda/lower_allocation.cpp", + "torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp", "torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp", + "torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp", "torch/csrc/jit/codegen/cuda/lower_index.cpp", "torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp", "torch/csrc/jit/codegen/cuda/lower_loops.cpp", "torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp", "torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp", "torch/csrc/jit/codegen/cuda/lower_predicate.cpp", + "torch/csrc/jit/codegen/cuda/lower_replace_size.cpp", "torch/csrc/jit/codegen/cuda/lower_shift.cpp", "torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp", + "torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp", "torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp", "torch/csrc/jit/codegen/cuda/lower_unroll.cpp", "torch/csrc/jit/codegen/cuda/lower_utils.cpp", "torch/csrc/jit/codegen/cuda/lower_validation.cpp", + "torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp", "torch/csrc/jit/codegen/cuda/lower2device.cpp", "torch/csrc/jit/codegen/cuda/manager.cpp", "torch/csrc/jit/codegen/cuda/mutator.cpp", "torch/csrc/jit/codegen/cuda/non_divisible_split.cpp", + "torch/csrc/jit/codegen/cuda/ops/alias.cpp", "torch/csrc/jit/codegen/cuda/ops/composite.cpp", "torch/csrc/jit/codegen/cuda/ops/normalization.cpp", "torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp", diff --git a/torch/csrc/jit/codegen/cuda/arith.cpp b/torch/csrc/jit/codegen/cuda/arith.cpp index 2c9925cf893..d9bf46b51c7 100644 --- a/torch/csrc/jit/codegen/cuda/arith.cpp +++ b/torch/csrc/jit/codegen/cuda/arith.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -23,14 +24,15 @@ Val* newScalar(ValType vtype, DataType dtype) { case (ValType::Scalar): switch (dtype) { case DataType::Bool: - return new Bool(); + return IrBuilder::create(); case DataType::Double: case DataType::Float: case DataType::Half: case DataType::BFloat16: - return new Double(); + return IrBuilder::create(); + case DataType::Int32: case DataType::Int: - return new Int(); + return IrBuilder::create(); default: break; } @@ -103,10 +105,10 @@ TensorView* newOutputTV(const std::vector& vals, DataType dtype) { } for (const auto dim_i : c10::irange(out_domain.size())) { if (extent_vals[dim_i] != nullptr) { - out_domain[dim_i] = new IterDomain( - new Int(start_offsets[dim_i]), + out_domain[dim_i] = IrBuilder::create( + IrBuilder::create(start_offsets[dim_i]), extent_vals[dim_i], - new Int(stop_offsets[dim_i]), + IrBuilder::create(stop_offsets[dim_i]), ParallelType::Serial, iter_types[dim_i]); } else { @@ -121,13 +123,17 @@ TensorView* newOutputTV(const 
std::vector& vals, DataType dtype) { break; } } - out_domain[dim_i] = - new IterDomain(new Int(0), new Int(1), ParallelType::Serial, itype); + out_domain[dim_i] = IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + FusionGuard::getCurFusion()->oneVal(), + ParallelType::Serial, + itype); } } - return new TensorView( - new TensorDomain(out_domain, std::vector(out_domain.size(), true)), + return IrBuilder::create( + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), dtype); } @@ -195,7 +201,7 @@ Val* castOp(DataType dtype, Val* v1) { } Val* out = newValLike(v1, dtype); - new UnaryOp(UnaryOpType::Cast, out, v1); + IrBuilder::create(UnaryOpType::Cast, out, v1); return out; } @@ -219,7 +225,7 @@ Val* unaryOp(UnaryOpType type, Val* v1) { // } Val* out = newValLike(v1, v1->getDataType().value()); - new UnaryOp(type, out, v1); + IrBuilder::create(type, out, v1); return out; } @@ -379,7 +385,7 @@ Val* binaryOp(BinaryOpType type, Val* v1, Val* v2, DataType common_dtype) { } else { out = newScalar(out_vtype, out_dtype); } - new BinaryOp(type, out, vals[0], vals[1]); + IrBuilder::create(type, out, vals[0], vals[1]); return out; } @@ -589,7 +595,7 @@ static TensorView* newForReduction( " of tensor ", tv); - new_domain.push_back(new IterDomain( + new_domain.push_back(IrBuilder::create( id->start(), id->extent(), id->stopOffset(), @@ -597,12 +603,12 @@ static TensorView* newForReduction( isReduction ? IterType::Reduction : id->getIterType())); } - TensorDomain* td = - new TensorDomain(new_domain, std::vector(new_domain.size(), true)); + TensorDomain* td = IrBuilder::create( + new_domain, std::vector(new_domain.size(), true)); data_type = data_type == DataType::Null ? tv->getDataType().value() : data_type; - return new TensorView(td, data_type); + return IrBuilder::create(td, data_type); } TensorView* reductionOp( @@ -652,7 +658,7 @@ TensorView* reductionOp( out_type, " and ", init_type); - new ReductionOp(reduction_op_type, init, out, tv); + IrBuilder::create(reduction_op_type, init, out, tv); if (keep_dim) { auto tv_root = TensorDomain::noReductions(tv->getRootDomain()); @@ -673,9 +679,9 @@ TensorView* sum( Val* init = nullptr; auto dtype = v1->getDataType().value(); if (isFloatingPointType(dtype)) { - init = new Double(0.0); + init = IrBuilder::create(0.0); } else if (isIntegralType(dtype)) { - init = new Int(0); + init = FusionGuard::getCurFusion()->zeroVal(); } else { TORCH_CHECK( false, @@ -693,13 +699,13 @@ TensorView* max( Val* init = nullptr; switch (v1->getDataType().value()) { case (DataType::Double): - init = new Double(std::numeric_limits::lowest()); + init = IrBuilder::create(std::numeric_limits::lowest()); break; case (DataType::Float): - init = new Double(std::numeric_limits::lowest()); + init = IrBuilder::create(std::numeric_limits::lowest()); break; case (DataType::Int): - init = new Int(INT_MIN); + init = IrBuilder::create(INT_MIN); break; default: TORCH_CHECK( @@ -718,13 +724,13 @@ TensorView* min( Val* init = nullptr; switch (v1->getDataType().value()) { case (DataType::Double): - init = new Double(DBL_MAX); + init = IrBuilder::create(DBL_MAX); break; case (DataType::Float): - init = new Double(FLT_MAX); + init = IrBuilder::create(FLT_MAX); break; case (DataType::Int): - init = new Int(INT_MAX); + init = IrBuilder::create(INT_MAX); break; default: TORCH_CHECK( @@ -767,9 +773,9 @@ TensorView* broadcast( size_t iinp = 0, ibdim = 0; while (ibdim < is_broadcast_dim.size()) { if (is_broadcast_dim[ibdim]) { - out_domain.push_back(new IterDomain( - new Int(0), 
- new Int(1), + out_domain.push_back(IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + FusionGuard::getCurFusion()->oneVal(), ParallelType::Serial, IterType::BroadcastWithoutStride)); } else { @@ -779,10 +785,11 @@ TensorView* broadcast( ibdim++; } - TensorView* out_tensor = new TensorView( - new TensorDomain(out_domain, std::vector(out_domain.size(), true)), + TensorView* out_tensor = IrBuilder::create( + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), inp->getDataType().value()); - new BroadcastOp(out_tensor, inp, is_broadcast_dim); + IrBuilder::create(out_tensor, inp, is_broadcast_dim); return out_tensor; } @@ -799,6 +806,10 @@ WelfordResult Welford( TORCH_CHECK(tv->nDims() > 0, "Tried to reduce a 0-dim tensor"); TORCH_CHECK(axes.size() > 0, "No reduction axis specified"); + if (init_N == nullptr) { + init_N = FusionGuard::getCurFusion()->zeroVal(); + } + // Initial values for welford op are tensors, so their dims have to match the // output dim, // i.e. original_dims - dims_to_be_reduced @@ -819,8 +830,8 @@ WelfordResult Welford( init_avg_val = init_avg; init_var_val = init_var; } else { - init_avg_val = new Double(0); - init_var_val = new Double(0); + init_avg_val = IrBuilder::create(0); + init_var_val = IrBuilder::create(0); } // Check and collect reduction axes @@ -847,7 +858,7 @@ WelfordResult Welford( TensorView* out_var = newForReduction(tv, uint_axes); TensorView* out_N = newForReduction(tv, uint_axes, DataType::Int); - new WelfordOp( + IrBuilder::create( out_avg, out_var, out_N, /*out var/avg/count */ @@ -856,7 +867,7 @@ WelfordResult Welford( init_N, /*init var/avg/count */ tv, nullptr, - new Int(1)); /*in var/avg/count */ + FusionGuard::getCurFusion()->oneVal()); /*in var/avg/count */ return WelfordResult(out_avg, out_var, out_N); } @@ -888,10 +899,11 @@ TensorView* transpose( out_domain[i] = in_id->clone(); } - TensorView* out_tensor = new TensorView( - new TensorDomain(out_domain, std::vector(out_domain.size(), true)), + TensorView* out_tensor = IrBuilder::create( + IrBuilder::create( + out_domain, std::vector(out_domain.size(), true)), inp->getDataType().value()); - new TransposeOp(out_tensor, inp, new2old); + IrBuilder::create(out_tensor, inp, new2old); return out_tensor; } @@ -938,7 +950,7 @@ TensorView* sub_alpha(TensorView* v1, TensorView* v2, Val* v3) { return arithOpOverloads(sub_alpha, v1, v2, v3); } // lerp -TORCH_CUDA_CU_API Val* lerp(Val* start, Val* end, Val* weight) { +Val* lerp(Val* start, Val* end, Val* weight) { auto vals = maybeBroadcast({start, end, weight}); Val* intrm1 = sub(vals[1], vals[0]); Val* intrm2 = mul(vals[2], intrm1); @@ -1024,7 +1036,8 @@ Val* where(Val* c, Val* v1, Val* v2) { } else { out = newScalar(out_vtype, out_dtype); } - new TernaryOp(TernaryOpType::Where, out, vals[0], vals[1], vals[2]); + IrBuilder::create( + TernaryOpType::Where, out, vals[0], vals[1], vals[2]); return out; } @@ -1064,7 +1077,8 @@ Val* threshold(Val* in, Val* thresh, Val* value) { value = optionalCast(in->getDataType().value(), value); Val* out = newValLike(in, in->getDataType().value()); - new TernaryOp(TernaryOpType::Threshold, out, in, thresh, value); + IrBuilder::create( + TernaryOpType::Threshold, out, in, thresh, value); return out; } @@ -1084,7 +1098,7 @@ Val* clamp(Val* in, Val* min_val, Val* max_val) { max_val = optionalCast(in->getDataType().value(), max_val); Val* out = newValLike(in, in->getDataType().value()); - new TernaryOp(TernaryOpType::Clamp, out, in, min_val, max_val); + IrBuilder::create(TernaryOpType::Clamp, 
out, in, min_val, max_val); return out; } @@ -1186,125 +1200,157 @@ TensorView* sum_to(TensorView* in, const std::vector& sum_to_size) { } TensorView* shift(TensorView* inp, const std::vector& offsets, bool pad) { + // When pad is false, no padding is given. When it is true, padding + // sizes are set so that output domains have the same extents as + // input domains. + std::vector pad_width(offsets.size(), 0); + if (pad) { + for (const auto i : c10::irange(offsets.size())) { + pad_width[i] = std::abs(offsets[i]); + } + } + return shift(inp, offsets, pad_width); +} + +TensorView* shift( + TensorView* inp, + const std::vector& offsets, + const std::vector& pad_width_param) { + auto inp_dom = TensorDomain::noReductions(inp->getRootDomain()); + const auto ndims = inp_dom.size(); + + auto pad_width = pad_width_param; + // Default padding is set so that the extent is kept unchanged + if (pad_width.empty()) { + pad_width = offsets; + for (auto& p : pad_width) { + p = std::abs(p); + } + } + TORCH_CHECK( - TensorDomain::noReductions(inp->getRootDomain()).size() == offsets.size(), + ndims == offsets.size(), "Invalid shift offsets, number of entries in offsets expected to be ", - TensorDomain::noReductions(inp->getRootDomain()).size(), + ndims, " but received ", offsets.size()); + TORCH_CHECK( + ndims == pad_width.size(), + "Invalid padding width list, number of entries in pad_width expected to be ", + ndims, + " but received ", + pad_width.size()); + + std::for_each(pad_width.begin(), pad_width.end(), [](const auto& pad) { + TORCH_CHECK(pad >= 0, "Padding width must be >= 0: ", pad); + }); + TensorView* out = nullptr; - if (pad) { - out = newValLike(inp, inp->getDataType().value())->as(); - } else { - auto inp_dom = TensorDomain::noReductions(inp->getRootDomain()); - const auto ndims = inp_dom.size(); - std::vector out_dom; - for (const auto i : c10::irange(ndims)) { - const auto inp_axis = inp_dom[i]; - const auto offset = offsets[i]; - if (offset == 0) { - out_dom.push_back(inp_axis->clone()); - continue; - } + std::vector out_dom; + for (const auto i : c10::irange(ndims)) { + const auto inp_axis = inp_dom[i]; + const auto offset = offsets[i]; + const auto pad = pad_width[i]; - Int* current_start_offset = dynamic_cast(inp_axis->start()); - TORCH_INTERNAL_ASSERT( - current_start_offset != nullptr && current_start_offset->isConst(), - "Invalid IterDomain start value:", - current_start_offset); - - Int* current_stop_offset = dynamic_cast(inp_axis->stopOffset()); - TORCH_INTERNAL_ASSERT( - current_stop_offset != nullptr && current_stop_offset->isConst(), - "Invalid IterDomain stop offset value:", - current_stop_offset); - - const auto cur_start_offset_value = current_start_offset->value().value(); - const auto cur_stop_offset_value = current_stop_offset->value().value(); - - Val* out_start_offset = nullptr; - Val* out_stop_offset = nullptr; - - if (offset > 0) { - // shift to right; extent remains the same, start and stop - // positions are moved right - out_start_offset = new Int(cur_start_offset_value + offset); - out_stop_offset = - new Int(std::max(cur_stop_offset_value - offset, int64_t(0))); - } else { - // shift to left; extent remains the same, start and stop - // positions are moved left - out_start_offset = - new Int(std::max(cur_start_offset_value + offset, int64_t(0))); - out_stop_offset = new Int(cur_stop_offset_value - offset); - } - - out_dom.push_back(new IterDomain( - out_start_offset, - inp_axis->extent(), - out_stop_offset, - ParallelType::Serial, - inp_axis->getIterType())); + if 
(offset == 0) { + out_dom.push_back(inp_axis->clone()); + continue; } - out = new TensorView( - new TensorDomain(out_dom, std::vector(out_dom.size(), true)), - inp->getDataType().value()); + Int* current_start_offset = dynamic_cast(inp_axis->start()); + TORCH_INTERNAL_ASSERT( + current_start_offset != nullptr && current_start_offset->isConst(), + "Invalid IterDomain start value:", + current_start_offset); + + Int* current_stop_offset = dynamic_cast(inp_axis->stopOffset()); + TORCH_INTERNAL_ASSERT( + current_stop_offset != nullptr && current_stop_offset->isConst(), + "Invalid IterDomain stop offset value:", + current_stop_offset); + + const auto cur_start_offset_value = current_start_offset->value().value(); + const auto cur_stop_offset_value = current_stop_offset->value().value(); + + int64_t out_start_offset = 0; + int64_t out_stop_offset = 0; + + if (offset > 0) { + // shift to right; extent remains the same, start and stop + // positions are moved right + out_start_offset = cur_start_offset_value + offset - pad; + out_stop_offset = std::max(cur_stop_offset_value - offset, int64_t(0)); + // If pad > offset, the extent of the output ID could be larger than the + // input, and the start offset of the output domain could become + // negative, which is not supported. + TORCH_CHECK( + out_start_offset >= 0, + "Invalid shift offset and padding. Padding must not be larger than the absolute extent of shift offset. Padding: ", + pad, + ". Shift: ", + offset, + "."); + } else { + // shift to left; extent remains the same, start and stop + // positions are moved left + out_start_offset = std::max(cur_start_offset_value + offset, int64_t(0)); + out_stop_offset = cur_stop_offset_value - offset - pad; + // Similar to the above case whwere offset is positive, if pad > + // -offset (note offset is negative), the extent of the output + // ID could be larger than the input, and the stop offset of the + // output domain could become negative. + TORCH_CHECK( + out_stop_offset >= 0, + "Invalid shift offset and padding. Padding must not be larger than the absolute extent of shift offset. Padding: ", + pad, + ". Shift: ", + offset, + "."); + } + + out_dom.push_back(IrBuilder::create( + IrBuilder::create(out_start_offset), + inp_axis->extent(), + IrBuilder::create(out_stop_offset), + ParallelType::Serial, + inp_axis->getIterType())); } - new ShiftOp(out, inp, offsets, pad); + out = IrBuilder::create( + IrBuilder::create( + out_dom, std::vector(out_dom.size(), true)), + inp->getDataType().value()); + + IrBuilder::create(out, inp, offsets, pad_width); return out; } namespace { -std::vector convertToIntVector(const std::vector& x) { - std::vector converted; - std::transform(x.begin(), x.end(), std::back_inserter(converted), [](int x) { - return new Int(x); - }); - return converted; -} -} // namespace -TensorView* gather( - TensorView* inp, - const std::vector& window_shape, - const std::vector>& pad_width, - const std::vector& strides) { - std::vector window_shape_int = convertToIntVector(window_shape); - std::vector> pad_width_int; - std::transform( - pad_width.begin(), - pad_width.end(), - std::back_inserter(pad_width_int), - [](const std::vector& x) { return convertToIntVector(x); }); - return gather(inp, window_shape_int, pad_width_int, strides); -} - -namespace { - -// Return a new TensorDomain with given root domains. Apply strides if -// necessary. With non-unit strides, strided domains become an rfactor -// domain. +// Return a new TensorDomain with given root domains. Apply +// strides if necessary. 
With non-unit strides, strided domains become an +// rfactor domain. TensorDomain* generateTensorDomainWithStrides( const std::vector& root_domains, - const std::vector& strides) { + const std::vector& strides, + bool skip_unit_stride) { std::vector strided_domains; // If strides are just unit strides, don't apply striding - if (strides.empty() || std::all_of(strides.begin(), strides.end(), [](int s) { - return s == 1; - })) { - return new TensorDomain( + if (strides.empty() || + (skip_unit_stride && + std::all_of( + strides.begin(), strides.end(), [](int s) { return s == 1; }))) { + return IrBuilder::create( root_domains, std::vector(root_domains.size(), true)); } for (const auto i : c10::irange(root_domains.size())) { auto root_dom = root_domains.at(i); - if (i >= strides.size() || strides[i] == 1) { + if (i >= strides.size() || (skip_unit_stride && strides[i] == 1)) { strided_domains.push_back(root_dom); continue; } @@ -1317,7 +1363,7 @@ TensorDomain* generateTensorDomainWithStrides( auto contig_vector_size = strided_domains.size(); - auto strided_td = new TensorDomain( + auto strided_td = IrBuilder::create( root_domains, strided_domains, strided_domains, @@ -1330,9 +1376,10 @@ TensorDomain* generateTensorDomainWithStrides( TensorView* gather( TensorView* inp, - const std::vector& window_shape, - const std::vector>& pad_width, - const std::vector& strides) { + const std::vector& window_shape, + const std::vector>& pad_width, + const std::vector& strides, + bool trim_out_of_bounds) { auto inp_dom = TensorDomain::noReductions(inp->getRootDomain()); const auto ndims = inp_dom.size(); @@ -1343,6 +1390,10 @@ TensorView* gather( " but received ", window_shape.size()); + std::for_each(window_shape.begin(), window_shape.end(), [](const auto& w) { + TORCH_CHECK(w > 0, "Window size must be > 0: ", w); + }); + TORCH_CHECK( ndims == pad_width.size(), "Invalid pad width: number of entries expected to be ", @@ -1354,6 +1405,10 @@ TensorView* gather( TORCH_CHECK( p.size() == 2, "Each entry of pad_width must have two non-negative integers."); + std::for_each(p.begin(), p.end(), [](const auto& p_left_or_right) { + TORCH_CHECK( + p_left_or_right >= 0, "Padding must be >= 0: ", p_left_or_right); + }); }); TORCH_CHECK( @@ -1363,6 +1418,10 @@ TensorView* gather( " but received ", strides.size()); + std::for_each(strides.begin(), strides.end(), [](const auto& s) { + TORCH_CHECK(s > 0, "Stride must be > 0: ", s); + }); + std::vector out_root_domains; std::vector out_gather_dom; @@ -1371,40 +1430,57 @@ TensorView* gather( const auto window_dim = window_shape[i]; const auto pad_left = pad_width[i][0]; const auto pad_right = pad_width[i][1]; + // This may be over-conservative TORCH_INTERNAL_ASSERT(inp_axis->start()->isZeroInt()); + const auto inp_stop_offset = inp_axis->stopOffset()->getInt(); + TORCH_INTERNAL_ASSERT( + inp_stop_offset.has_value(), + "Dynamic stop offset not supported: ", + inp_axis); + const auto extent_adjustment = window_dim - 1 - pad_left - pad_right; + TORCH_CHECK( + extent_adjustment >= 0, + "Invalid gather window and padding as output extent would be larger than input.", + " Window: ", + window_dim, + ". Padding left: ", + pad_left, + ". 
Padding right: ", + pad_right); + const auto out_stop_offset = inp_stop_offset.value() + extent_adjustment; Val* out_axis_dim = nullptr; - if (window_dim->isConst() && pad_left->isConst() && pad_right->isConst()) { - const int64_t extent_adjustment = - -(-window_dim->value().value() + 1 + pad_left->value().value() + - pad_right->value().value()); - out_axis_dim = extent_adjustment == 0 - ? inp_axis->extent() - : sub(inp_axis->extent(), new Int(extent_adjustment)); - } else { - out_axis_dim = - add(add(sub(inp_axis->extent(), window_dim), new Int(1)), - add(pad_left, pad_right)); - } - // TODO: out_axis_dim is assumed to be the same as the extent of - // the input domain. Throw an error if it isn't the case. - out_root_domains.push_back(new IterDomain( - new Int(0), - out_axis_dim, + out_root_domains.push_back(IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + inp_axis->extent(), + IrBuilder::create(out_stop_offset), ParallelType::Serial, inp_axis->getIterType())); // create a new axis for the gathered domain - out_gather_dom.push_back(new IterDomain( - new Int(0), window_dim, ParallelType::Serial, IterType::Gather)); + out_gather_dom.push_back(IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + IrBuilder::create(window_dim), + ParallelType::Serial, + IterType::Gather)); } out_root_domains.insert( out_root_domains.end(), out_gather_dom.begin(), out_gather_dom.end()); - auto out_td = generateTensorDomainWithStrides(out_root_domains, strides); + TensorDomain* out_td = nullptr; - auto out_tv = new TensorView(out_td, inp->getDataType().value()); + if (trim_out_of_bounds) { + // If no stride vector is given, just use stride 1. It does not do + // any striding effect, but out-of-bounds values are trimmed. + auto s = strides.empty() ? std::vector(ndims, 1) : strides; + out_td = generateTensorDomainWithStrides(out_root_domains, strides, false); + } else { + out_td = generateTensorDomainWithStrides(out_root_domains, strides, true); + } - new GatherOp(out_tv, inp, window_shape, pad_width); + auto out_tv = + IrBuilder::create(out_td, inp->getDataType().value()); + + IrBuilder::create(out_tv, inp, window_shape, pad_width); return out_tv; } diff --git a/torch/csrc/jit/codegen/cuda/arith.h b/torch/csrc/jit/codegen/cuda/arith.h index 5652d68eab8..1f18f65666a 100644 --- a/torch/csrc/jit/codegen/cuda/arith.h +++ b/torch/csrc/jit/codegen/cuda/arith.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -114,7 +114,9 @@ TORCH_CUDA_CU_API WelfordResult Welford( const std::vector& axes, TensorView* init_avg = nullptr, TensorView* init_var = nullptr, - Int* init_N = new Int(0)); + // Initializes to 0 in function definition, doing this so we don't have to + // import IrBuilder just for this one interface. + Int* init_N = nullptr); // UNARY OPERATIONS // abs @@ -484,19 +486,27 @@ TORCH_CUDA_CU_API TensorView* sum_to( //! t1[i, j] = 0, otherwise //! //! The pad option controls how out-of-boundary accesses are -//! handled. When pad is true, shifting works as if the source tensor -//! is padded by zero. Otherwise, it does not modify the output tensor -//! region whose source coordinates are out-of-boundry. In both cases, -//! the size of output tensor does not change. However, when pad is -//! false, the start or stop value of the shifted axis is adjusted -//! accordingly. For example, when a shift offset is one, the axis start -//! value would be incremented by one. +//! handled. It specifies how many zeros are logically padded. If no +//! 
pad option is given, it automatically pads the input tensor so +//! that the output tensor has the same extent for each axis. //! -//! \param pad If true, out-of-boundary access returns zero. +//! When a padding value is smaller than the absolute value of a shift +//! offset, the output axis still has the same extent but its start or +//! stop offset is moved inward to signify those outside of the offset +//! are invalid. +//! +//! It is not allowed to use padding values that are larger than shift +//! offsets, which would mean output extentes would be larger than +//! input extents TORCH_CUDA_CU_API TensorView* shift( TensorView* inp, const std::vector& offsets, - bool pad = true); + const std::vector& pad_width = {}); + +TORCH_CUDA_CU_API TensorView* shift( + TensorView* inp, + const std::vector& offsets, + bool pad); //! Gather a window of nearby elements for each element. //! @@ -508,8 +518,13 @@ TORCH_CUDA_CU_API TensorView* shift( //! implemented with strided split, whose outer output domain becomes //! the root domain for subsequent consumers. The inner output domain //! becomes a Stride domain, which is ignored by subsequent consumers. +//! Only valid input ranges are fed into strided splits. //! -//! Example: +//! When trim_out_of_bounds is true, the values at the first and last +//! ends that are outside of the start and stop offsets are +//! effetively trimmed by partial split by 1. +//! +//! Example 1: //! t0: 2D tensor of [N, M] //! t1 = gather(t0, {1, 3}, {{0, 0}, {1, 1}}); //! @@ -517,23 +532,34 @@ TORCH_CUDA_CU_API TensorView* shift( //! t1: [N, M, 1, 3] //! t1[i, j, k, l] = The value at the window position of [k, l] //! for t0[i, j] +//! +//! Example 2.1 (without trimming): +//! t0: 2D tensor of [N, M] +//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}}); +//! +//! then: +//! t1: [N (stop offset: 1), M (stop offset: 1, 2, 2)] +//! +//! Example 2.1 (with trimming) +//! t0: 2D tensor of [N, M] +//! t1 = gather(t0, {2, 2}, {{0, 0}, {0, 0}}, true); +//! +//! then: +//! t1: [ceilDiv(N - 1, 1), ceilDiv(M - 1, 1), 2, 2] +//! +//! Example 3: +//! t0: 2D tensor of [N, M] +//! t1 = gather(t0, {3, 3}, {{0, 0}, {0, 0}}, {3, 3}); +//! +//! then: +//! t1: [ceilDiv(N - 2, 3), ceilDiv(M - 2, 3), 2, 2] +//! TORCH_CUDA_CU_API TensorView* gather( TensorView* inp, const std::vector& window_shape, const std::vector>& pad_width, - const std::vector& strides = {}); - -//! Gather a window of nearby elements for each element. -//! -//! Same as the another gather interface but with Int* parameters. -//! -//! TODO: Remove this interface as we do not intend to support dynamic -//! window shapes at this moment. 
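//!
//! A minimal usage sketch (illustrative only, not taken from this header),
//! assuming an active Fusion/FusionGuard and the makeSymbolicTensor(...)
//! helper from the C++ tests for constructing the input:
//!
//!   Fusion fusion;
//!   FusionGuard fg(&fusion);
//!   TensorView* tv0 = makeSymbolicTensor(2);  // assumed test helper
//!   fusion.addInput(tv0);
//!
//!   // Shift by +1 / -1 with a padding of 1 on each axis; this keeps the
//!   // output extents equal to the input extents, which is also what the
//!   // default (empty) pad_width would do.
//!   TensorView* tv1 = shift(tv0, {1, -1}, std::vector<int>{1, 1});
//!
//!   // 3x3 window per element, zero padding of 1 on each side, unit
//!   // strides, no trimming of out-of-bounds windows.
//!   TensorView* tv2 = gather(tv0, {3, 3}, {{1, 1}, {1, 1}});
//!
//!   fusion.addOutput(tv1);
//!   fusion.addOutput(tv2);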
-TORCH_CUDA_CU_API TensorView* gather( - TensorView* inp, - const std::vector& window_shape, - const std::vector>& pad_width, - const std::vector& strides = {}); + const std::vector& strides = {}, + bool trim_out_of_bounds = false); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/codegen.cpp b/torch/csrc/jit/codegen/cuda/codegen.cpp index 709c810efe3..67926e92672 100644 --- a/torch/csrc/jit/codegen/cuda/codegen.cpp +++ b/torch/csrc/jit/codegen/cuda/codegen.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -19,7 +20,7 @@ namespace codegen { namespace { -class CudaKernelGenerator : private kir::IrVisitor { +class CudaKernelGenerator : private OptOutConstDispatch { static constexpr const char* kTab = " "; public: @@ -45,7 +46,7 @@ class CudaKernelGenerator : private kir::IrVisitor { code_ << "__global__ void " << kernel_name << "("; - std::vector params; + std::vector params; // Inputs & Outputs for (auto val : kernel_->inputs()) { @@ -56,13 +57,16 @@ class CudaKernelGenerator : private kir::IrVisitor { } // Generate parameter declarations - for (kir::Val* val : params) { - if (const auto tv = dynamic_cast(val)) { - code_ << "Tensor<" << val->dtype() << ", " - << TensorDomain::noReductions( - tv->fuserTv()->getMaybeRFactorDomain()) - .size() + for (Val* val : params) { + if (const auto tv = dynamic_cast(val)) { + if (tv->isCpuScalar()) { + code_ << " CpuScalarTensor<" << val->dtype() << "> " << varName(tv); + } else { + code_ + << "Tensor<" << val->dtype() << ", " + << TensorDomain::noReductions(tv->getMaybeRFactorDomain()).size() << "> " << varName(tv); + } } else { TORCH_INTERNAL_ASSERT(val->isScalar()); // NOLINT (LLVM bug 48525) TORCH_INTERNAL_ASSERT(val->definition() == nullptr); @@ -76,17 +80,17 @@ class CudaKernelGenerator : private kir::IrVisitor { // Global buffers for (auto allocate : kernel_summary.global_allocations) { - TORCH_INTERNAL_ASSERT(allocate->buffer()->isA()); - const auto tv = allocate->buffer()->as(); + TORCH_INTERNAL_ASSERT(allocate->buffer()->isA()); + const auto tv = allocate->buffer()->as(); const auto& maybe_rfactor_domain = tv->domain()->hasRFactor() - ? tv->domain()->rfactorDomain() - : tv->domain()->rootDomain(); + ? 
tv->domain()->getRFactorDomain() + : tv->domain()->getRootDomain(); const auto nDims = std::count_if( maybe_rfactor_domain.begin(), maybe_rfactor_domain.end(), - [](const kir::IterDomain* id) { + [](const IterDomain* id) { return !id->isReduction() && - id->iterType() != IterType::BroadcastWithoutStride; + id->getIterType() != IterType::BroadcastWithoutStride; }); code_ << ", Tensor<" << tv->dtype() << ", " << nDims << "> " << varName(tv); @@ -177,7 +181,7 @@ class CudaKernelGenerator : private kir::IrVisitor { void genBody() { for (auto expr : kernel_->topLevelExprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } @@ -204,100 +208,93 @@ class CudaKernelGenerator : private kir::IrVisitor { return code_; } - std::string gen(const kir::Node* node) { + std::string gen(const Statement* stmt) { std::stringstream tmp_code; std::swap(tmp_code, code_); - auto replacement = replacement_map_.find(node); + auto replacement = replacement_map_.find(stmt); if (replacement != replacement_map_.end()) { - node = replacement->second; + stmt = replacement->second; } - node->accept(this); + OptOutConstDispatch::handle(stmt); std::swap(tmp_code, code_); return tmp_code.str(); } - // TODO(kir): consider automatic var naming - std::string varName(const kir::Val* val) { - std::string prefix = ""; - if (val->isA()) { - prefix = "T"; + std::string varName(const Val* val) { + std::stringstream name; + if (val->isA()) { + name << "T"; } else { - prefix = typePrefix(val->dtype()); + name << typePrefix(val->dtype()); } - - std::stringstream value_name; - if (val->name() != kInvalidStmName) { - value_name << prefix << val->name(); - } else { - value_name << "k" << prefix << val->id(); - } - return value_name.str(); + name << val->name(); + return name.str(); } - std::string genInline(const kir::Node* node) { + std::string genInline(const Statement* stmt) { const bool saved_inline = print_inline_; print_inline_ = true; - auto result = gen(node); + auto result = gen(stmt); print_inline_ = saved_inline; // NOLINTNEXTLINE(performance-no-automatic-move) return result; } - void visit(const kir::Predicate* node) final { - TORCH_INTERNAL_ASSERT(node->hasValue()); - code_ << gen(node->value()); + void handle(const kir::Predicate* pred) final { + TORCH_INTERNAL_ASSERT(pred->hasValue()); + code_ << gen(pred->value()); } - void visit(const kir::Bool* node) final { - const auto def = node->definition(); + void handle(const Bool* pred) final { + const auto def = pred->definition(); if (print_inline_ && def != nullptr) { code_ << "(" << gen(def) << ")"; - } else if (node->isConst()) { - code_ << (*node->value() ? "true" : "false"); + } else if (pred->isConst()) { + code_ << (*pred->value() ? 
"true" : "false"); } else { - code_ << varName(node); + code_ << varName(pred); } } - void visit(const kir::Double* node) final { - const auto def = node->definition(); + void handle(const Double* d) final { + const auto def = d->definition(); if (print_inline_ && def != nullptr) { code_ << "(" << gen(def) << ")"; - } else if (node->isConst()) { + } else if (d->isConst()) { const int digits = std::numeric_limits::max_digits10; - code_ << std::setprecision(digits) << *node->value(); + code_ << std::setprecision(digits) << *d->value(); } else { - code_ << varName(node); + code_ << varName(d); } } - void visit(const kir::Int* node) final { - const auto def = node->definition(); + void handle(const Int* i) final { + const auto def = i->definition(); if (print_inline_ && def != nullptr) { code_ << "(" << gen(def) << ")"; - } else if (node->isConst()) { - code_ << *node->value(); + } else if (i->isConst()) { + code_ << *i->value(); } else { - code_ << varName(node); + code_ << varName(i); } } - void visit(const kir::NamedScalar* node) final { + void handle(const NamedScalar* ns) final { // dim3 components are unsigned int. Cast to signed integer to // support negative indexing - if (node->getParallelIndex().has_value() || - node->getParallelDim().has_value()) { - code_ << "((nvfuser_index_t)" << node->name() << ")"; + if (ns->getParallelIndex().has_value() || + ns->getParallelDim().has_value()) { + code_ << "((nvfuser_index_t)" << ns->name() << ")"; } else { - code_ << node->name(); + code_ << ns->name(); } } - void visit(const kir::TensorIndex* node) final { - code_ << varName(node->view()) << "["; + void handle(const kir::TensorIndex* ti) final { + code_ << varName(ti->view()) << "["; bool first = true; - for (auto* ind : node->indices()) { + for (auto* ind : ti->indices()) { if (!ind->isZeroInt()) { if (!first) { code_ << " + "; @@ -314,29 +311,29 @@ class CudaKernelGenerator : private kir::IrVisitor { code_ << "]"; } - void visit(const kir::IterDomain* node) final { - TORCH_INTERNAL_ASSERT(false && "Unreachable"); + void handle(const IterDomain*) final { + TORCH_INTERNAL_ASSERT(false, "Unreachable"); } - void visit(const kir::TensorDomain* node) final { - TORCH_INTERNAL_ASSERT(false && "Unreachable"); + void handle(const TensorDomain*) final { + TORCH_INTERNAL_ASSERT(false, "Unreachable"); } - void visit(const kir::TensorView* tv) final { - TORCH_INTERNAL_ASSERT(false && "Unreachable"); + void handle(const TensorView*) final { + TORCH_INTERNAL_ASSERT(false, "Unreachable"); } - void visit(const kir::UnaryOp* node) final { + void handle(const UnaryOp* uop) final { bool is_vector_op = false; size_t vector_word_size = 1; - if (vectorize_scope_ && node->out()->isA()) { - auto ti = node->out()->as(); + if (vectorize_scope_ && uop->out()->isA()) { + auto ti = uop->out()->as(); bool vectorize_op = false; bool misaligned_op = false; - for (auto id : ti->view()->fuserTv()->domain()->domain()) { + for (auto id : ti->view()->domain()->domain()) { if (!isParallelTypeVectorize(id->getParallelType())) { continue; } @@ -358,84 +355,84 @@ class CudaKernelGenerator : private kir::IrVisitor { if (vectorize_op) { TORCH_INTERNAL_ASSERT( - node->operation() == UnaryOpType::Set, + uop->getUnaryOpType() == UnaryOpType::Set, "Cannot vectorize operations that are not sets. 
", "Use cache_before and cache_after to store/load with vectorized reads into buffers."); is_vector_op = true; } if (misaligned_op) { - is_vector_op = (node->operation() == UnaryOpType::Set); + is_vector_op = (uop->getUnaryOpType() == UnaryOpType::Set); } - if (is_vector_op && !node->in()->isScalar()) { + if (is_vector_op && !uop->in()->isScalar()) { TORCH_INTERNAL_ASSERT( - node->out()->dtype() == node->in()->dtype(), + uop->out()->dtype() == uop->in()->dtype(), "Vectorized store/load requires input and output datatypes match."); } } if (is_vector_op) { - if (node->in()->isScalar()) { + if (uop->in()->isScalar()) { indent() << "reinterpret_cast<" - << "Array<" << node->out()->dtype() << ", " << vector_word_size + << "Array<" << uop->out()->dtype() << ", " << vector_word_size << ">*>" - << "(&" << gen(node->out()) << ")->set(" << gen(node->in()) + << "(&" << gen(uop->out()) << ")->set(" << gen(uop->in()) << ");\n"; } else { indent() << "*reinterpret_cast<" - << "Array<" << node->out()->dtype() << ", " << vector_word_size + << "Array<" << uop->out()->dtype() << ", " << vector_word_size << ">*>" - << "(&" << gen(node->out()) << ")" + << "(&" << gen(uop->out()) << ")" << " = *reinterpret_cast<" - << "Array<" << node->in()->dtype() << ", " << vector_word_size + << "Array<" << uop->in()->dtype() << ", " << vector_word_size << ">*>" - << "(&" << gen(node->in()) << ");\n"; + << "(&" << gen(uop->in()) << ");\n"; } return; } - if (node->out()->isA()) { - const auto op_type = node->operation(); + if (uop->out()->isA()) { + const auto op_type = uop->getUnaryOpType(); if (auto op = inline_op_str(op_type)) { - indent() << gen(node->out()) << " = " << *op << genInline(node->in()) + indent() << gen(uop->out()) << " = " << *op << genInline(uop->in()) << ";\n"; } return; } if (!print_inline_) { - indent() << gen(node->out()); - if (!node->out()->isScalar() && !node->in()->isScalar()) { + indent() << gen(uop->out()); + if (!uop->out()->isScalar() && !uop->in()->isScalar()) { code_ << "\n"; indent() << kTab; } code_ << " = "; } - const auto op_type = node->operation(); + const auto op_type = uop->getUnaryOpType(); if (auto op = inline_op_str(op_type)) { if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { - code_ << stringifyBooleanOp(op_type) << gen(node->in()); + uop->out()->dtype() == DataType::Bool) { + code_ << stringifyBooleanOp(op_type) << gen(uop->in()); } else { - code_ << *op << gen(node->in()); + code_ << *op << gen(uop->in()); } } else { if (op_type == UnaryOpType::Cast) { const auto cast_str = - cast_func_str({node->in()->dtype(), node->out()->dtype()}); + cast_func_str({uop->in()->dtype(), uop->out()->dtype()}); TORCH_INTERNAL_ASSERT( cast_str.has_value(), "Invalid cast. 
Input type: ", - node->in()->dtype(), + uop->in()->dtype(), ", output type: ", - node->out()->dtype()); + uop->out()->dtype()); code_ << cast_str.value(); } else { code_ << op_type; if (needFloatSuffix(op_type) && - node->out()->dtype() == DataType::Float) { + uop->out()->dtype() == DataType::Float) { code_ << "f"; } } @@ -444,7 +441,7 @@ class CudaKernelGenerator : private kir::IrVisitor { if (op_type == UnaryOpType::RandLike) { code_ << "rnd"; } else { - code_ << gen(node->in()); + code_ << gen(uop->in()); } code_ << ")"; } @@ -456,7 +453,7 @@ class CudaKernelGenerator : private kir::IrVisitor { std::string genBinaryOp( BinaryOpType op_type, - kir::Val* out, + Val* out, const std::string& lhs, const std::string& rhs) { std::stringstream expr; @@ -485,7 +482,7 @@ class CudaKernelGenerator : private kir::IrVisitor { // If one argument is a tensorview and the other is a scalar, make sure we // cast the scalar to the tensorview type - std::string scalarCast(kir::Val* lhs, kir::Val* rhs) { + std::string scalarCast(Val* lhs, Val* rhs) { // If neither are scalars return if (!((lhs->isScalar() || rhs->isScalar()) && (lhs->isA() || rhs->isA()))) { @@ -520,18 +517,18 @@ class CudaKernelGenerator : private kir::IrVisitor { } // If possible, replace pow with mul. Return true when successful. - bool genPowerWithMul(const kir::BinaryOp* node) { - if (node->operation() != BinaryOpType::Pow) { + bool genPowerWithMul(const BinaryOp* bop) { + if (bop->getBinaryOpType() != BinaryOpType::Pow) { return false; } - auto rhs = node->rhs(); + auto rhs = bop->rhs(); c10::optional exponent; - if (auto val_int = dynamic_cast(rhs)) { + if (auto val_int = dynamic_cast(rhs)) { if (val_int->isConst()) { exponent = val_int->value().value(); } - } else if (auto val_float = dynamic_cast(rhs)) { + } else if (auto val_float = dynamic_cast(rhs)) { if (val_float->isConst()) { auto fp_exp = val_float->value().value(); double int_exp = 0; @@ -550,7 +547,7 @@ class CudaKernelGenerator : private kir::IrVisitor { return false; } - auto lhs = gen(node->lhs()); + auto lhs = gen(bop->lhs()); if (print_inline_) { code_ << lhs << " * " << lhs; @@ -558,8 +555,8 @@ class CudaKernelGenerator : private kir::IrVisitor { code_ << " * " << lhs; } } else { - indent() << gen(node->out()); - if (node->out()->isScalar()) { + indent() << gen(bop->out()); + if (bop->out()->isScalar()) { code_ << " = " << lhs << " * " << lhs; if (exponent.value() == 3) { code_ << " * " << lhs; @@ -579,24 +576,24 @@ class CudaKernelGenerator : private kir::IrVisitor { return true; } - void visit(const kir::BinaryOp* node) final { + void handle(const BinaryOp* bop) final { // Try replacing pow with mul - if (genPowerWithMul(node)) { + if (genPowerWithMul(bop)) { return; } - const auto op_type = node->operation(); + const auto op_type = bop->getBinaryOpType(); if (print_inline_) { // Inline expression: `lhs op rhs` code_ << genBinaryOp( - op_type, node->out(), gen(node->lhs()), gen(node->rhs())); + op_type, bop->out(), gen(bop->lhs()), gen(bop->rhs())); } else { - indent() << gen(node->out()); - if (node->out()->isScalar()) { + indent() << gen(bop->out()); + if (bop->out()->isScalar()) { // Single line: `out = lhs op rhs;` code_ << " = " << genBinaryOp( - op_type, node->out(), gen(node->lhs()), gen(node->rhs())); + op_type, bop->out(), gen(bop->lhs()), gen(bop->rhs())); } else { // Split TensorView expressions across multiple lines: // @@ -605,64 +602,64 @@ class CudaKernelGenerator : private kir::IrVisitor { // op rhs; // - auto cast = scalarCast(node->lhs(), 
node->rhs()); + auto cast = scalarCast(bop->lhs(), bop->rhs()); if (auto op = inline_op_str(op_type)) { code_ << "\n"; - indent() << kTab << "= " << (node->lhs()->isScalar() ? cast : "") - << gen(node->lhs()) << "\n"; + indent() << kTab << "= " << (bop->lhs()->isScalar() ? cast : "") + << gen(bop->lhs()) << "\n"; indent() << kTab; if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { + bop->out()->dtype() == DataType::Bool) { code_ << stringifyBooleanOp(op_type); } else { code_ << *op; } - code_ << " " << (node->rhs()->isScalar() ? cast : "") - << gen(node->rhs()); + code_ << " " << (bop->rhs()->isScalar() ? cast : "") + << gen(bop->rhs()); } else { - if (integer_op_str(op_type) && isIntegralType(node->out()->dtype())) { + if (integer_op_str(op_type) && isIntegralType(bop->out()->dtype())) { auto int_op = integer_op_str(op_type); code_ << " = " << *int_op << "(\n"; } else { std::stringstream op_str; op_str << op_type; if (needFloatSuffix(op_type) && - node->out()->dtype() == DataType::Float) { + bop->out()->dtype() == DataType::Float) { op_str << "f"; } code_ << " = " << op_str.str() << "(\n"; } - indent() << kTab << (node->lhs()->isScalar() ? cast : "") - << gen(node->lhs()) << ",\n"; - indent() << kTab << (node->rhs()->isScalar() ? cast : "") - << gen(node->rhs()) << ")"; + indent() << kTab << (bop->lhs()->isScalar() ? cast : "") + << gen(bop->lhs()) << ",\n"; + indent() << kTab << (bop->rhs()->isScalar() ? cast : "") + << gen(bop->rhs()) << ")"; } } code_ << ";\n"; } } - void visit(const kir::TernaryOp* node) final { + void handle(const TernaryOp* top) final { if (!print_inline_) { - indent() << gen(node->out()); - if (!node->out()->isScalar()) { + indent() << gen(top->out()); + if (!top->out()->isScalar()) { code_ << "\n"; indent() << kTab; } code_ << " = "; } - code_ << node->operation() << "(" << gen(node->in1()) << ", "; + code_ << top->getTernaryOpType() << "(" << gen(top->in1()) << ", "; // Make sure the two operands of where has the same // type. Note that compiling "where(0.0f, 0.0)" fails because of // the overloading ambiguity. - if (node->operation() == TernaryOpType::Where) { - auto cast = scalarCast(node->in2(), node->in3()); - code_ << (node->in2()->isScalar() ? cast : "") << gen(node->in2()) << ", " - << (node->in3()->isScalar() ? cast : "") << gen(node->in3()) << ")"; + if (top->getTernaryOpType() == TernaryOpType::Where) { + auto cast = scalarCast(top->in2(), top->in3()); + code_ << (top->in2()->isScalar() ? cast : "") << gen(top->in2()) << ", " + << (top->in3()->isScalar() ? 
cast : "") << gen(top->in3()) << ")"; } else { - code_ << gen(node->in2()) << ", " << gen(node->in3()) << ")"; + code_ << gen(top->in2()) << ", " << gen(top->in3()) << ")"; } if (!print_inline_) { @@ -670,7 +667,7 @@ class CudaKernelGenerator : private kir::IrVisitor { } } - std::string genReductionOp(BinaryOpType op_type, kir::Val* out) { + std::string genReductionOp(BinaryOpType op_type, Val* out) { std::stringstream lambda; DataType data_type = out->dtype(); lambda << "[](" << data_type << " &a, " << data_type << " b) " @@ -678,47 +675,45 @@ class CudaKernelGenerator : private kir::IrVisitor { return lambda.str(); } - void visit(const kir::BroadcastOp* node) final { - TORCH_INTERNAL_ASSERT(node->out()->isA()); - const auto tensor_index = node->out()->as(); + void handle(const BroadcastOp* stmt) final { + TORCH_INTERNAL_ASSERT(stmt->out()->isA()); + const auto tensor_index = stmt->out()->as(); - const ParallelTypeBitmap domains = - kernel_->predicateMap().getParallelBroadcastDomains( - tensor_index->view()->fuserTv()); + const ParallelTypeBitmap parallel_types = + kernel_->summary().broadcast_parallel_types.at(stmt); - const bool thread_x = domains.get(ParallelType::TIDx); - const bool thread_y = domains.get(ParallelType::TIDy); - const bool thread_z = domains.get(ParallelType::TIDz); - const bool block_x = domains.get(ParallelType::BIDx); - const bool block_y = domains.get(ParallelType::BIDy); - const bool block_z = domains.get(ParallelType::BIDz); - - const bool grid_broadcast_needed = block_x || block_y || block_z; - const bool block_broadcast_needed = thread_x || thread_y || thread_z; + if (parallel_types.none()) { + // Not parallelized + indent() << gen(stmt->out()) << "\n"; + indent() << kTab << " = " << gen(stmt->in()) << ";\n"; + return; + } TORCH_INTERNAL_ASSERT( - !grid_broadcast_needed, - "Parallel broadcast across blocks not supported"); + !parallel_types.hasBID(), + "Parallel broadcast across blocks should have been translated to a GridBroadcast IR node"); - if (block_broadcast_needed) { - const auto data_type = node->out()->dtype(); - indent() << "broadcast::blockBroadcast<" << (thread_x ? "true" : "false") - << ", " << (thread_y ? "true" : "false") << ", " - << (thread_z ? "true" : "false") << ">(\n"; - indent() << kTab << gen(node->out()) << ",\n"; - indent() << kTab << gen(node->in()) << ",\n"; - indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; - TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - indent() << kTab << genInline(node->predicate()) << ");\n"; - } else { - indent() << gen(node->out()) << "\n"; - indent() << kTab << " = " << gen(node->in()) << ";\n"; + std::stringstream flags_str; + for (const ParallelType pt : kParallelTypeTIDs) { + const bool parallel_bcast = parallel_types.get(pt); + if (pt != kParallelTypeTIDs[0]) { + flags_str << ", "; + } + flags_str << (parallel_bcast ? 
"true" : "false"); } + + const auto data_type = stmt->out()->dtype(); + indent() << "broadcast::blockBroadcast<" << flags_str.str() << ">(\n"; + indent() << kTab << gen(stmt->out()) << ",\n"; + indent() << kTab << gen(stmt->in()) << ",\n"; + indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; + TORCH_INTERNAL_ASSERT( + stmt->predicate() != nullptr && stmt->predicate()->hasValue()); + indent() << kTab << genInline(stmt->predicate()) << ");\n"; } void genWarpReductionOp( - const kir::ReductionOp* node, + const ReductionOp* rop, const IterDomain* reduction_id) { bool is_single_warp = kernel_->getWarpPaddedParallelInfo().is_tidx_single_warp; @@ -729,24 +724,25 @@ class CudaKernelGenerator : private kir::IrVisitor { } else { code_ << "(\n"; } - indent() << kTab << gen(node->out()) << ",\n"; - indent() << kTab << gen(node->in()) << ",\n"; - indent() << kTab << genReductionOp(node->operation(), node->out()) << ",\n"; + indent() << kTab << gen(rop->out()) << ",\n"; + indent() << kTab << gen(rop->in()) << ",\n"; + indent() << kTab << genReductionOp(rop->getReductionOpType(), rop->out()) + << ",\n"; indent() << kTab << "threadIdx,\n"; indent() << kTab << "blockDim,\n"; - indent() << kTab << "static_cast<" << node->out()->dtype() + indent() << kTab << "static_cast<" << rop->out()->dtype() << "*>(shared_mem),\n"; TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - indent() << kTab << genInline(node->predicate()) << ",\n"; - indent() << kTab << node->out()->dtype() << "(" << genInline(node->init()) + rop->predicate() != nullptr && rop->predicate()->hasValue()); + indent() << kTab << genInline(rop->predicate()) << ",\n"; + indent() << kTab << rop->out()->dtype() << "(" << genInline(rop->init()) << "));\n"; } - void visit(const kir::ReductionOp* node) final { - TORCH_INTERNAL_ASSERT(node->out()->isA()); + void handle(const ReductionOp* rop) final { + TORCH_INTERNAL_ASSERT(rop->out()->isA()); - const auto out = node->out()->as(); + const auto out = rop->out()->as(); const auto domain = out->view()->domain(); const bool has_block_reduce = domain->hasBlockReduction(); @@ -754,18 +750,18 @@ class CudaKernelGenerator : private kir::IrVisitor { if (!has_block_reduce && !has_grid_reduce) { const auto gen_out = gen(out); - const auto op_type = node->operation(); + const auto op_type = rop->getReductionOpType(); indent() << gen_out << " = " - << genBinaryOp(op_type, out, gen_out, gen(node->in())) << ";\n"; + << genBinaryOp(op_type, out, gen_out, gen(rop->in())) << ";\n"; return; } - if (auto reduction_id = ir_utils::getMaybeWarpReductionDim(node)) { - genWarpReductionOp(node, reduction_id.value()); + if (auto reduction_id = ir_utils::getMaybeWarpReductionDim(rop)) { + genWarpReductionOp(rop, reduction_id.value()); return; } - const auto par_domains = ir_utils::getParallelDomains(node->out()); + const auto par_domains = ir_utils::getParallelDomains(rop->out()); // Get parallel reduction domains const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end() && @@ -777,14 +773,14 @@ class CudaKernelGenerator : private kir::IrVisitor { par_domains.find(ParallelType::TIDz) != par_domains.end() && par_domains.at(ParallelType::TIDz)->isReduction(); - const auto data_type = node->out()->dtype(); - const auto op_type = node->operation(); + const auto data_type = rop->out()->dtype(); + const auto op_type = rop->getReductionOpType(); if (has_block_reduce) { if (has_grid_reduce) { indent() << data_type << " " << "block_result_" << block_reduce_name_ << 
"=" - << gen(node->init()) << ";\n"; + << gen(rop->init()) << ";\n"; } indent() << "blockReduce<" << (tidx ? "true" : "false") << ", " << (tidy ? "true" : "false") << ", " << (tidz ? "true" : "false") @@ -792,44 +788,43 @@ class CudaKernelGenerator : private kir::IrVisitor { if (has_grid_reduce) { indent() << kTab << "block_result_" << block_reduce_name_ << ",\n"; } else { - indent() << kTab << gen(node->out()) << ",\n"; + indent() << kTab << gen(rop->out()) << ",\n"; } - indent() << kTab << gen(node->in()) << ",\n"; - indent() << kTab << genReductionOp(op_type, node->out()) << ",\n"; + indent() << kTab << gen(rop->in()) << ",\n"; + indent() << kTab << genReductionOp(op_type, rop->out()) << ",\n"; indent() << kTab << "threadIdx,\n"; indent() << kTab << "blockDim,\n"; indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); + rop->predicate() != nullptr && rop->predicate()->hasValue()); + auto read_pred = genInline(rop->predicate()); indent() << kTab << read_pred << ",\n"; // Pass the write predicate if available and different from the // default predicate. The blockReduce runtime function uses the // default predicate for both read and write when only the // default one is given. - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); + if (rop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(rop->writePredicate()->hasValue()); + auto write_pred = genInline(rop->writePredicate()); indent() << kTab << write_pred << ",\n"; } - indent() << kTab << data_type << "(" << genInline(node->init()) - << "));\n"; + indent() << kTab << data_type << "(" << genInline(rop->init()) << "));\n"; } } - void visit(const kir::WelfordOp* node) final { - TORCH_INTERNAL_ASSERT(node->out()->isA()); + void handle(const WelfordOp* wop) final { + TORCH_INTERNAL_ASSERT(wop->out()->isA()); - const auto out = node->out()->as(); + const auto out = wop->out()->as(); const auto domain = out->view()->domain(); - const auto out_var = node->outVar(); - const auto out_avg = node->outAvg(); - const auto out_N = node->outN(); + const auto out_var = wop->outVar(); + const auto out_avg = wop->outAvg(); + const auto out_N = wop->outN(); - const auto in_var = node->inVar(); - const auto in_avg = node->inAvg(); - const auto in_N = node->inN(); + const auto in_var = wop->inVar(); + const auto in_avg = wop->inAvg(); + const auto in_N = wop->inN(); const bool has_block_reduce = domain->hasBlockReduction(); const bool has_grid_reduce = domain->hasGridReduction(); @@ -852,7 +847,7 @@ class CudaKernelGenerator : private kir::IrVisitor { return; } - const auto par_domains = ir_utils::getParallelDomains(node->out()); + const auto par_domains = ir_utils::getParallelDomains(wop->out()); // Get parallel reduction domains const bool tidx = par_domains.find(ParallelType::TIDx) != par_domains.end() && @@ -864,20 +859,20 @@ class CudaKernelGenerator : private kir::IrVisitor { par_domains.find(ParallelType::TIDz) != par_domains.end() && par_domains.at(ParallelType::TIDz)->isReduction(); - const auto data_type = node->out()->dtype(); + const auto data_type = wop->out()->dtype(); if (has_block_reduce) { if (has_grid_reduce) { // allocate block result indent() << data_type << " " << "block_result_avg_" << block_reduce_name_ << " = " - << gen(node->initAvg()) << ";\n"; + << gen(wop->initAvg()) << 
";\n"; indent() << data_type << " " << "block_result_var_" << block_reduce_name_ << " = " - << gen(node->initVar()) << ";\n"; + << gen(wop->initVar()) << ";\n"; indent() << DataType::Int << " " << "block_result_n_" << block_reduce_name_ << " = " - << gen(node->initN()) << ";\n"; + << gen(wop->initN()) << ";\n"; } indent() << "blockWelford<" << (tidx ? "true" : "false") << ", " << (tidy ? "true" : "false") << ", " << (tidz ? "true" : "false") @@ -887,9 +882,9 @@ class CudaKernelGenerator : private kir::IrVisitor { << kTab << "block_result_var_" << block_reduce_name_ << ",\n" << kTab << "block_result_n_" << block_reduce_name_ << ",\n"; } else { - indent() << kTab << gen(node->outAvg()) << ",\n"; - indent() << kTab << gen(node->outVar()) << ",\n"; - indent() << kTab << gen(node->outN()) << ",\n"; + indent() << kTab << gen(wop->outAvg()) << ",\n"; + indent() << kTab << gen(wop->outVar()) << ",\n"; + indent() << kTab << gen(wop->outN()) << ",\n"; } indent() << " " << gen(in_avg) << ",\n"; if (in_var) { @@ -907,14 +902,14 @@ class CudaKernelGenerator : private kir::IrVisitor { << "*>(shared_mem_var),\n"; indent() << kTab << "reinterpret_cast<" << DataType::Int << "*>(shared_mem_n),\n"; - TORCH_INTERNAL_ASSERT(node->predicate() != nullptr); + TORCH_INTERNAL_ASSERT(wop->predicate() != nullptr); TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); + wop->predicate() != nullptr && wop->predicate()->hasValue()); + auto read_pred = genInline(wop->predicate()); indent() << kTab << read_pred << ",\n"; - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); + if (wop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(wop->writePredicate()->hasValue()); + auto write_pred = genInline(wop->writePredicate()); indent() << kTab << write_pred << ",\n"; } indent() << kTab << data_type << "(0));\n"; @@ -954,8 +949,8 @@ class CudaKernelGenerator : private kir::IrVisitor { return flags.str(); } - void visit(const kir::GridReduction* node) final { - const auto rop = node->reduction_op(); + void handle(const kir::GridReduction* grop) final { + const auto rop = grop->reduction_op(); TORCH_INTERNAL_ASSERT(rop->out()->isA()); const auto out = rop->out()->as(); @@ -963,19 +958,17 @@ class CudaKernelGenerator : private kir::IrVisitor { TORCH_INTERNAL_ASSERT(domain->hasGridReduction()); const auto data_type = rop->out()->dtype(); - const auto op_type = rop->operation(); + const auto op_type = rop->getReductionOpType(); TORCH_INTERNAL_ASSERT( - node->reduction_buffer()->buffer()->isA()); - TORCH_INTERNAL_ASSERT( - node->sync_buffer()->buffer()->isA()); + grop->reduction_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(grop->sync_buffer()->buffer()->isA()); const auto work_buffer = - node->reduction_buffer()->buffer()->as(); - const auto sync_buffer = - node->sync_buffer()->buffer()->as(); + grop->reduction_buffer()->buffer()->as(); + const auto sync_buffer = grop->sync_buffer()->buffer()->as(); const std::string flags_str = - generateGridReduceTemplateFlags(rop, node->threadPredicate()); + generateGridReduceTemplateFlags(rop, grop->threadPredicate()); const bool persistent_sync = kernel_->summary().has_cooperative_grid_reduction; @@ -996,44 +989,46 @@ class CudaKernelGenerator : private kir::IrVisitor { indent() << kTab << varName(sync_buffer) << ",\n"; indent() << kTab << "static_cast<" << data_type << "*>(shared_mem),\n"; 
TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); + grop->predicate() != nullptr && grop->predicate()->hasValue()); + auto read_pred = genInline(grop->predicate()); indent() << kTab << read_pred << ",\n"; - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); + if (grop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(grop->writePredicate()->hasValue()); + auto write_pred = genInline(grop->writePredicate()); indent() << kTab << write_pred << ",\n"; } else { indent() << kTab << read_pred << ",\n"; } indent() << kTab << data_type << "(" - << genInline(node->reduction_op()->init()) << "));\n"; + << genInline(grop->reduction_op()->init()) << "));\n"; } - void visit(const kir::GridBroadcast* node) final { - const auto bop = node->broadcast_op(); + void handle(const kir::GridBroadcast* grop) final { + const auto bop = grop->broadcast_op(); TORCH_INTERNAL_ASSERT(bop->out()->isA()); + const ParallelTypeBitmap parallel_types = + kernel_->summary().broadcast_parallel_types.at(bop); + + TORCH_INTERNAL_ASSERT( + parallel_types.hasBID(), + "GridBroadcast needs to be used with a broadcast op that is parallelized with the BID parallel types"); + const auto out = bop->out()->as(); const auto domain = out->view()->domain(); - TORCH_INTERNAL_ASSERT(domain->hasGridBroadcast()); const auto data_type = bop->out()->dtype(); TORCH_INTERNAL_ASSERT( - node->broadcast_buffer()->buffer()->isA()); - TORCH_INTERNAL_ASSERT( - node->sync_buffer()->buffer()->isA()); + grop->broadcast_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(grop->sync_buffer()->buffer()->isA()); const auto work_buffer = - node->broadcast_buffer()->buffer()->as(); - const auto sync_buffer = - node->sync_buffer()->buffer()->as(); + grop->broadcast_buffer()->buffer()->as(); + const auto sync_buffer = grop->sync_buffer()->buffer()->as(); - const auto par_domains = ir_utils::getParallelDomains(out); std::stringstream flags_str; for (const ParallelType pt : kParallelTypeThreads) { - const bool parallel_bcast = par_domains.find(pt) != par_domains.end() && - par_domains.at(pt)->isBroadcast(); + const bool parallel_bcast = parallel_types.get(pt); if (pt != kParallelTypeThreads[0]) { flags_str << ", "; } @@ -1041,7 +1036,7 @@ class CudaKernelGenerator : private kir::IrVisitor { } // Since block-level broadcast has not necessarily been performed before - // this function call, so grid broadcast may be broadcasting across both + // this function call, so grid broadcast may be broadcasting across both // the grid and the block level. 
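    // (Illustrative only, not part of the patch) For a broadcast parallelized
    // on BIDx and TIDx, the emitted call looks roughly like
    //   grid_broadcast::broadcast<true, false, false, true, false, false>(
    //       T3[...], T2[...], &T_work[0], T_sync, read_pred);
    // i.e. one boolean template flag per entry of kParallelTypeThreads; the
    // names and flag values shown here are hypothetical.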
indent() << "grid_broadcast::broadcast<" << flags_str.str() << ">(\n"; indent() << kTab << gen(bop->out()) << ",\n"; @@ -1049,12 +1044,12 @@ class CudaKernelGenerator : private kir::IrVisitor { indent() << kTab << "&" << varName(work_buffer) << "[0],\n"; indent() << kTab << varName(sync_buffer) << ",\n"; TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - indent() << kTab << genInline(node->predicate()) << ");\n"; + grop->predicate() != nullptr && grop->predicate()->hasValue()); + indent() << kTab << genInline(grop->predicate()) << ");\n"; } - void visit(const kir::GridWelford* node) final { - const auto wop = node->welford_op(); + void handle(const kir::GridWelford* gwop) final { + const auto wop = gwop->welford_op(); TORCH_INTERNAL_ASSERT(wop->outAvg()->isA()); const auto out = wop->out()->as(); @@ -1063,21 +1058,19 @@ class CudaKernelGenerator : private kir::IrVisitor { const auto data_type = out->dtype(); - TORCH_INTERNAL_ASSERT(node->var_buffer()->buffer()->isA()); - TORCH_INTERNAL_ASSERT( - node->sync_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(gwop->var_buffer()->buffer()->isA()); + TORCH_INTERNAL_ASSERT(gwop->sync_buffer()->buffer()->isA()); - const auto avg_buffer = node->avg_buffer()->buffer()->as(); - const auto var_buffer = node->var_buffer()->buffer()->as(); - const auto n_buffer = node->N_buffer()->buffer()->as(); - const auto sync_buffer = - node->sync_buffer()->buffer()->as(); + const auto avg_buffer = gwop->avg_buffer()->buffer()->as(); + const auto var_buffer = gwop->var_buffer()->buffer()->as(); + const auto n_buffer = gwop->N_buffer()->buffer()->as(); + const auto sync_buffer = gwop->sync_buffer()->buffer()->as(); const bool persistent_sync = kernel_->summary().has_cooperative_grid_reduction; const std::string flags_str = - generateGridReduceTemplateFlags(wop, node->threadPredicate()); + generateGridReduceTemplateFlags(wop, gwop->threadPredicate()); // Since block-level reduction is already done, those dimensions // with tidx/y/z being true do not participate in the grid reduction. 
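For context on what the block/grid Welford helpers referenced here compute: each participant carries a running (avg, M2, N) triple and partial results are merged with the standard parallel Welford combine. The sketch below is just that math in plain host code, not the actual runtime device header; it treats the "var" buffer as the unnormalized M2 sum, and whether the real kernels normalize at the end is not shown in this diff.

#include <cstdint>

struct WelfordTriple {
  double avg = 0.0; // running mean
  double M2 = 0.0;  // sum of squared deviations from the mean
  int64_t N = 0;    // element count
};

// Merge two partial Welford results (Chan et al. parallel update).
WelfordTriple welfordCombine(const WelfordTriple& a, const WelfordTriple& b) {
  if (a.N == 0) return b;
  if (b.N == 0) return a;
  WelfordTriple out;
  out.N = a.N + b.N;
  const double delta = b.avg - a.avg;
  out.avg = a.avg + delta * static_cast<double>(b.N) / out.N;
  out.M2 = a.M2 + b.M2 +
      delta * delta * static_cast<double>(a.N) * b.N / out.N;
  return out;
}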
@@ -1112,12 +1105,12 @@ class CudaKernelGenerator : private kir::IrVisitor { indent() << kTab << "reinterpret_cast<" << wop->outN()->dtype() << "*>(shared_mem_n),\n"; TORCH_INTERNAL_ASSERT( - node->predicate() != nullptr && node->predicate()->hasValue()); - auto read_pred = genInline(node->predicate()); + gwop->predicate() != nullptr && gwop->predicate()->hasValue()); + auto read_pred = genInline(gwop->predicate()); indent() << kTab << read_pred << ",\n"; - if (node->writePredicate() != nullptr) { - TORCH_INTERNAL_ASSERT(node->writePredicate()->hasValue()); - auto write_pred = genInline(node->writePredicate()); + if (gwop->writePredicate() != nullptr) { + TORCH_INTERNAL_ASSERT(gwop->writePredicate()->hasValue()); + auto write_pred = genInline(gwop->writePredicate()); indent() << kTab << write_pred << ",\n"; } else { indent() << kTab << read_pred << ",\n"; @@ -1128,27 +1121,26 @@ class CudaKernelGenerator : private kir::IrVisitor { void handleScope(const kir::Scope& scope) { for (auto expr : scope.exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } - void visit(const kir::ForLoop* node) final { - // TODO(kir): handle this during lowering - if (node->iter_domain()->isBroadcast()) { - handleScope(node->body()); + void handle(const kir::ForLoop* loop) final { + if (loop->iter_domain()->isBroadcast()) { + handleScope(loop->body()); return; - } else if (node->vectorize()) { - vectorize_scope_ = node->vectorize(); - handleScope(node->body()); + } else if (loop->vectorize()) { + vectorize_scope_ = loop->vectorize(); + handleScope(loop->body()); vectorize_scope_ = false; return; - } else if (node->iter_domain()->isStride()) { + } else if (loop->iter_domain()->isStride()) { // A stride domain only executes the loop body with the loop // index being zero. indent() << "constexpr " << "nvfuser_index_t" - << " " << gen(node->index()) << " = 0;\n"; - handleScope(node->body()); + << " " << gen(loop->index()) << " = 0;\n"; + handleScope(loop->body()); return; } @@ -1168,56 +1160,82 @@ class CudaKernelGenerator : private kir::IrVisitor { // necessary since the loop stop value just needs to be <= the // IterDomain extent. However, at this point, this conservative // analysis seems sufficient. - if (node->stop() == node->iter_domain()->extent() && - node->iter_domain()->isThread()) { + if (loop->stop() == loop->iter_domain()->extent() && + loop->iter_domain()->isThread()) { // Register a replacement of references to the loop index with // the loop start value. - replacement_map_.insert({node->index(), node->start()}); - handleScope(node->body()); - replacement_map_.erase(node->index()); + replacement_map_.insert({loop->index(), loop->start()}); + handleScope(loop->body()); + replacement_map_.erase(loop->index()); return; } - if (node->start()->isZeroInt() && node->stop()->isOneInt()) { + if (loop->start()->isZeroInt() && loop->stop()->isOneInt()) { indent() << "constexpr " << "nvfuser_index_t" - << " " << gen(node->index()) << " = 0;\n"; - handleScope(node->body()); + << " " << gen(loop->index()) << " = 0;\n"; + handleScope(loop->body()); + return; + } else if ( + // Special case handling for a pattern where start == end - 1. 
+ loop->start()->definition() != nullptr && + loop->start()->definition()->isA() && + loop->start()->definition()->as()->getBinaryOpType() == + BinaryOpType::Sub && + loop->start()->definition()->as()->lhs() == loop->stop() && + loop->start()->definition()->as()->rhs()->isOneInt()) { + indent() << "const " + << "nvfuser_index_t" + << " " << gen(loop->index()) << " = " << genInline(loop->start()) + << ";\n"; + handleScope(loop->body()); return; } - const auto gen_index = gen(node->index()); - const auto gen_start = genInline(node->start()); - const auto gen_stop = genInline(node->stop()); - const auto gen_step = genInline(node->step()); + const auto gen_index = gen(loop->index()); + const auto gen_start = genInline(loop->start()); + const auto gen_stop = genInline(loop->stop()); + const auto gen_step = genInline(loop->step()); std::stringstream step_code; - if (node->step()->isOneInt()) { + if (loop->step()->isOneInt()) { step_code << "++" << gen_index; } else { step_code << gen_index << " += " << gen_step; } - if (node->isUnrolled()) { + if (loop->isUnrolled()) { indent() << "#pragma unroll\n"; } else { indent() << "#pragma unroll 1\n"; } - indent() << "for(nvfuser_index_t " << gen_index << " = " << gen_start - << "; " << gen_index << " < " << gen_stop << "; " - << step_code.str() << ") "; + + indent() << "for(nvfuser_index_t " << gen_index; + if (loop->iter_domain()->isParallelized()) { + code_ << " = " << gen_start << "; "; + } else { + // Do not start at the start of the ID when not parallelized. Instead, + // start at 0. Predicates will protect buffers between 0 and ID->start(), + // however if we started at ID->start and extent == ID->start, we could + // have a "degenerate" loop (loop with no iterations). It may not be an + // issue to have a 0-sized loop, but all potential consequences haven't + // been covered. One example is WAR analysis which could incorrectly think + // a barrier inside a 0-sized loop actually provides protection. 
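To make the comment above concrete, the intent is roughly the following difference in the emitted loop shape; the function wrappers, index names, and bound names below are purely illustrative:

#include <cstdint>
using nvfuser_index_t = int64_t;

// Parallelized iter domain: the index may legitimately begin at the
// domain's start value.
void parallelizedLoop(nvfuser_index_t i_start, nvfuser_index_t i_stop) {
  for (nvfuser_index_t i = i_start; i < i_stop; ++i) {
    // ... body, guarded by read/write predicates ...
  }
}

// Non-parallelized iter domain: always begin at 0 so the loop can never be
// 0-sized when i_stop == i_start; predicates mask out indices in [0, i_start).
void serialLoop(nvfuser_index_t i_start, nvfuser_index_t i_stop) {
  (void)i_start; // only consulted by the predicates, not by the loop bounds
  for (nvfuser_index_t i = 0; i < i_stop; ++i) {
    // ... body, guarded by read/write predicates ...
  }
}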
+ code_ << " = 0; "; + } + code_ << gen_index << " < " << gen_stop << "; " << step_code.str() << ") "; startBlock(true); - handleScope(node->body()); + handleScope(loop->body()); endBlock(); } - void visit(const kir::IfThenElse* node) final { - auto conditional = node->predicate()->value(); + void handle(const kir::IfThenElse* ite) final { + auto conditional = ite->predicate()->value(); if (conditional->isConst()) { // If the conditional is a constant, then the IfThenElse is not required if (conditional->value().value()) { - handleScope(node->thenBody()); + handleScope(ite->thenBody()); } else { - handleScope(node->elseBody()); + handleScope(ite->elseBody()); } return; } @@ -1226,41 +1244,40 @@ class CudaKernelGenerator : private kir::IrVisitor { // "then" block startBlock(true); - handleScope(node->thenBody()); + handleScope(ite->thenBody()); // "else" block (optional) - if (node->hasElse()) { + if (ite->hasElse()) { endBlock(" else "); startBlock(true); - handleScope(node->elseBody()); + handleScope(ite->elseBody()); } endBlock(); } - // TODO(kir): fold initialization into Allocate - void visit(const kir::Allocate* node) final { - const auto buffer_dtype = node->buffer()->dtype(); + void handle(const kir::Allocate* alloc) final { + const auto buffer_dtype = alloc->buffer()->dtype(); - if (!node->buffer()->isA()) { - indent() << buffer_dtype << " " << gen(node->buffer()) << ";\n"; + if (!alloc->buffer()->isA()) { + indent() << buffer_dtype << " " << gen(alloc->buffer()) << ";\n"; return; } - const auto tv = node->buffer()->as(); + const auto tv = alloc->buffer()->as(); - const auto size = node->size(); + const auto size = alloc->size(); TORCH_INTERNAL_ASSERT(size != nullptr); - if (node->alias() != nullptr) { - // Allocate alias another Allocate node - const auto alias_tv = node->alias()->buffer()->as(); - indent() << "// Alias Allocation - " << node->memoryType() << "\n"; + if (alloc->alias() != nullptr) { + // Allocate alias another Allocate stmt + const auto alias_tv = alloc->alias()->buffer()->as(); + indent() << "// Alias Allocation - " << alloc->memoryType() << "\n"; indent() << buffer_dtype << "* " << varName(tv) << " = " << varName(alias_tv) << ";\n"; } else { // Standard Memory Allocation - switch (tv->memoryType()) { + switch (tv->getMemoryType()) { case MemoryType::Global: indent() << "// Allocate global tensor " << varName(tv) << "\n"; break; @@ -1292,7 +1309,7 @@ class CudaKernelGenerator : private kir::IrVisitor { } } - void visit(const kir::Sync* node) final { + void handle(const kir::Sync*) final { // Use a custom synchronization method if enabled if (std::getenv("PYTORCH_NVFUSER_USE_BLOCK_SYNC_ATOMIC")) { indent() << "block_sync::sync();\n"; @@ -1301,11 +1318,11 @@ class CudaKernelGenerator : private kir::IrVisitor { } } - void visit(const kir::InitMagicZero* node) final { + void handle(const kir::InitMagicZero*) final { indent() << "NVFUSER_DEFINE_MAGIC_ZERO\n"; } - void visit(const kir::UpdateMagicZero* node) final { + void handle(const kir::UpdateMagicZero*) final { indent() << "NVFUSER_UPDATE_MAGIC_ZERO\n"; } @@ -1314,15 +1331,13 @@ class CudaKernelGenerator : private kir::IrVisitor { const kir::Kernel* kernel_; int block_nest_level_ = 0; int block_reduce_name_ = 0; - - // TODO(kir): replace with explicit assignment statements bool print_inline_ = false; // Mark when we are inside of a vectorized for-loop bool vectorize_scope_ = false; //! 
Holds active replacement mappings during codegen - std::unordered_map replacement_map_; + std::unordered_map replacement_map_; }; } // namespace diff --git a/torch/csrc/jit/codegen/cuda/codegen.h b/torch/csrc/jit/codegen/cuda/codegen.h index 2ffbb872155..31e4fb70736 100644 --- a/torch/csrc/jit/codegen/cuda/codegen.h +++ b/torch/csrc/jit/codegen/cuda/codegen.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/compute_at.cpp b/torch/csrc/jit/codegen/cuda/compute_at.cpp index 45f744d7e2f..f51e0fe1bc9 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at.cpp @@ -59,14 +59,8 @@ bool validateDomain(TensorView* tv, TensorDomain* new_td) { unsigned int getReplayablePosPasC( TensorView* producer, TensorView* consumer, - const ComputeAtRootDomainMap& root_map_, + const std::unordered_set& unmappable_producer_dims, ComputeAtMode mode) { - // Grab dimensions in producer and consumer that are mappable to eachother - // based on the computeAtRootDomainMap. This will tell us which dimensions - // can be inlined based on avoiding trying to inline reduction structures. - auto mappable_roots = - root_map_.getMappableDims(producer->domain(), consumer->domain()); - // Check if any consumer dimensions are marked as vectorize as producer can // not be inlined to vectorized dimensions in consumer. auto c_dom = consumer->domain()->domain(); @@ -124,9 +118,14 @@ unsigned int getReplayablePosPasC( if (std::any_of( consumer_root_dim_ids.begin(), consumer_root_dim_ids.end(), - [&mappable_roots, &c2p_root_map](IterDomain* root_id) { - return mappable_roots.find(root_id) == mappable_roots.end() && - c2p_root_map.find(root_id) != c2p_root_map.end(); + [&unmappable_producer_dims, &c2p_root_map](IterDomain* c_root_id) { + auto p_root_id_it = c2p_root_map.find(c_root_id); + if (p_root_id_it == c2p_root_map.end()) { + return false; + } + auto p_id = p_root_id_it->second; + return unmappable_producer_dims.find(p_id) != + unmappable_producer_dims.end(); })) { continue; } @@ -146,14 +145,8 @@ unsigned int getReplayablePosPasC( unsigned int getReplayablePosCasP( TensorView* consumer, TensorView* producer, - const ComputeAtRootDomainMap& root_map_, + const std::unordered_set& unmappable_producer_dims, ComputeAtMode mode) { - // Grab dimensions in producer and consumer that are mappable to eachother - // based on the computeAtRootDomainMap. This will tell us which dimensions - // can be inlined based on avoiding trying to inline reduction structures. 
- auto mappable_roots = - root_map_.getMappableDims(producer->domain(), consumer->domain()); - auto p_dom = producer->domain()->domain(); auto first_reduction = std::find_if(p_dom.begin(), p_dom.end(), [](IterDomain* id) { @@ -208,10 +201,11 @@ unsigned int getReplayablePosCasP( if (std::any_of( producer->getMaybeRFactorDomain().begin(), producer->getMaybeRFactorDomain().end(), - [&mappable_roots, &all_vals](IterDomain* root_id) { - return std::find(all_vals.begin(), all_vals.end(), root_id) != + [&unmappable_producer_dims, &all_vals](IterDomain* p_root_id) { + return std::find(all_vals.begin(), all_vals.end(), p_root_id) != all_vals.end() && - mappable_roots.find(root_id) == mappable_roots.end(); + unmappable_producer_dims.find(p_root_id) != + unmappable_producer_dims.end(); })) { continue; } @@ -446,7 +440,8 @@ unsigned int ComputeAt::backwardComputeAt_impl( FUSER_PERF_SCOPE("backwardComputeAt_impl"); auto max_consumer_compute_at_pos = - getReplayablePosPasC(producer, consumer, root_map_, mode_); + getReplayablePosPasC(producer, consumer, unmappable_dims_, mode_); + if (mode_ == ComputeAtMode::BestEffort) { consumer_compute_at_pos = std::min(consumer_compute_at_pos, max_consumer_compute_at_pos); @@ -517,7 +512,7 @@ unsigned int ComputeAt::forwardComputeAt_impl( FUSER_PERF_SCOPE("forwardComputeAt_impl"); auto max_producer_compute_at_pos = - getReplayablePosCasP(consumer, producer, root_map_, mode_); + getReplayablePosCasP(consumer, producer, unmappable_dims_, mode_); if (mode_ == ComputeAtMode::BestEffort) { producer_compute_at_pos = @@ -865,6 +860,25 @@ void ComputeAt::runPass() { } } +void ComputeAt::buildUnmappableDims() { + auto all_tvs = ir_utils::allTvs(producer_->fusion()); + for (auto tv : all_tvs) { + auto consumers = ir_utils::consumerTvsOf(tv); + for (auto consumer : consumers) { + // Grab dimensions in producer and consumer that are mappable to eachother + // based on the computeAtRootDomainMap. This will tell us which dimensions + // can be inlined based on avoiding trying to inline reduction structures. + auto mappable_roots = + root_map_.getMappableDims(tv->domain(), consumer->domain()); + for (auto tv_root_id : tv->getMaybeRFactorDomain()) { + if (mappable_roots.find(tv_root_id) == mappable_roots.end()) { + unmappable_dims_.emplace(tv_root_id); + } + } + } + } +} + ComputeAt::ComputeAt( TensorView* _producer, TensorView* _consumer, @@ -903,6 +917,8 @@ ComputeAt::ComputeAt( setCommonConsumer(); root_map_.build(); + + buildUnmappableDims(); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/compute_at.h b/torch/csrc/jit/codegen/cuda/compute_at.h index 391225218db..75fca5705ed 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at.h +++ b/torch/csrc/jit/codegen/cuda/compute_at.h @@ -2,11 +2,12 @@ #include +#include #include -#include #include #include +#include #include namespace torch { @@ -68,6 +69,10 @@ class ComputeAt { // call. void setCommonConsumer(); + // Iterate through all TVs and collect the dimensions of each TV that don't + // map to all its consumer TVs. + void buildUnmappableDims(); + // Propagate backward from consumer to producer, check if it increase // computeAt position on tensors, if so take it! void traverseBackward(); @@ -106,6 +111,9 @@ class ComputeAt { // Producer use chains set in, used in a few spots. 
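The net effect of buildUnmappableDims above is to replace a per-call getMappableDims query with a one-time set of producer root domains that fail to map to at least one consumer; the inlining checks then reduce to set membership. A rough standalone sketch of that precompute, assuming simplified placeholder types rather than the real IterDomain and root-map classes:

#include <string>
#include <unordered_set>
#include <vector>

using DimId = std::string; // stand-in for IterDomain*

struct TensorInfo {
  std::vector<DimId> root_dims;
  // Per consumer: which of this tensor's root dims are mappable to it.
  std::vector<std::unordered_set<DimId>> mappable_per_consumer;
};

// One-time pass: any root dim not mappable to some consumer is recorded as
// "unmappable" and will block inlining past it later on.
std::unordered_set<DimId> buildUnmappableDims(
    const std::vector<TensorInfo>& tensors) {
  std::unordered_set<DimId> unmappable;
  for (const auto& tv : tensors) {
    for (const auto& mappable : tv.mappable_per_consumer) {
      for (const auto& dim : tv.root_dims) {
        if (mappable.find(dim) == mappable.end()) {
          unmappable.insert(dim);
        }
      }
    }
  }
  return unmappable;
}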
std::deque> producer_use_chains_; + // Root domains in producer that's unmappable to any of its consumers + std::unordered_set unmappable_dims_; + ComputeAt( TensorView* _producer, TensorView* _consumer, diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp index 6671fc37546..f46a7495163 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at_map.cpp +++ b/torch/csrc/jit/codegen/cuda/compute_at_map.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include #include @@ -488,71 +487,6 @@ void ComputeAtMap::build(Fusion* fusion, GpuLower* gpu_lower) { } } } - - if (gpu_lower != nullptr) { - convertToKir(fusion, gpu_lower); - } -} - -void ComputeAtMap::convertToKir(Fusion* fusion, GpuLower* gpu_lower) { - TORCH_INTERNAL_ASSERT(fusion != nullptr); - TORCH_INTERNAL_ASSERT(gpu_lower != nullptr); - - has_lowered_kir_ = true; - - std::unordered_map< - std::shared_ptr>, - std::shared_ptr>> - disjoint_set_2_kir; - - for (const auto& disjoint_iter_set : disjoint_iter_set_maps_) { - auto fusion_set = disjoint_iter_set.second; - auto kir_set_it = disjoint_set_2_kir.find(fusion_set); - std::shared_ptr> kir_set; - if (kir_set_it == disjoint_set_2_kir.end()) { - kir_set = std::make_shared>(); - std::transform( - fusion_set->begin(), - fusion_set->end(), - std::inserter(*kir_set, kir_set->begin()), - [&gpu_lower](IterDomain* id) { - return gpu_lower->lowerValue(id)->as(); - }); - disjoint_set_2_kir.emplace(std::make_pair(fusion_set, kir_set)); - } else { - kir_set = kir_set_it->second; - } - kir_disjoint_iter_set_maps_.emplace(std::make_pair( - gpu_lower->lowerValue(disjoint_iter_set.first)->as(), - kir_set)); - } - - for (auto entry : concrete_id_map_) { - kir_concrete_id_map_.emplace(std::make_pair( - gpu_lower->lowerValue(entry.first)->as(), - gpu_lower->lowerValue(entry.second)->as())); - } - - for (const auto& entry : disjoint_iter_set_maps_) { - kir_2_fusion_[gpu_lower->lowerValue(entry.first)->as()] = - entry.first; - } - - // Make sure we have all IterDomains that could be used to generate a ForLoop - for (auto expr : fusion->exprs()) { - if (!expr->outputs()[0]->isA()) { - continue; - } - - auto tv_outputs = ir_utils::filterByType(expr->outputs()); - - for (auto out : tv_outputs) { - for (auto entry : out->domain()->domain()) { - kir_2_fusion_[gpu_lower->lowerValue(entry)->as()] = - entry; - } - } - } } bool ComputeAtMap::areMapped(IterDomain* id0, IterDomain* id1) const { @@ -568,20 +502,6 @@ bool ComputeAtMap::areMapped(IterDomain* id0, IterDomain* id1) const { return (set0_it->second.get() == set1_it->second.get()); } -bool ComputeAtMap::areMapped(kir::IterDomain* id0, kir::IterDomain* id1) const { - assertLowered(has_lowered_kir_); - if (id0 == id1) { - return true; - } - auto set0_it = kir_disjoint_iter_set_maps_.find(id0); - auto set1_it = kir_disjoint_iter_set_maps_.find(id1); - if (set0_it == kir_disjoint_iter_set_maps_.end() || - set1_it == kir_disjoint_iter_set_maps_.end()) { - return false; - } - return (set0_it->second.get() == set1_it->second.get()); -} - IterDomain* ComputeAtMap::getConcreteMappedID(IterDomain* id) const { auto it = concrete_id_map_.find(id); if (it != concrete_id_map_.end()) { @@ -590,25 +510,6 @@ IterDomain* ComputeAtMap::getConcreteMappedID(IterDomain* id) const { return id; } -kir::IterDomain* ComputeAtMap::getConcreteMappedID(kir::IterDomain* id) const { - assertLowered(has_lowered_kir_); - auto it = kir_concrete_id_map_.find(id); - if (it != kir_concrete_id_map_.end()) { - return it->second; - } - return 
id; -} - -IterDomain* ComputeAtMap::toFusion(kir::IterDomain* kir) const { - assertLowered(has_lowered_kir_); - auto kir_2_fusion_it = kir_2_fusion_.find(kir); - TORCH_INTERNAL_ASSERT( - kir_2_fusion_it != kir_2_fusion_.end(), - "Kernel ir is not guarneteed to be reversible into fusion ir, could not find fusion entry. ", - kir::toString(kir, false)); - return kir_2_fusion_it->second; -} - std::string ComputeAtMap::toString() const { std::stringstream ss; diff --git a/torch/csrc/jit/codegen/cuda/compute_at_map.h b/torch/csrc/jit/codegen/cuda/compute_at_map.h index b2b70f8997d..8b7f9acd8fe 100644 --- a/torch/csrc/jit/codegen/cuda/compute_at_map.h +++ b/torch/csrc/jit/codegen/cuda/compute_at_map.h @@ -67,34 +67,18 @@ class TORCH_CUDA_CU_API ComputeAtMap { //! same loop nest in the lowered code bool areMapped(IterDomain* id0, IterDomain* id1) const; - bool areMapped(kir::IterDomain* id0, kir::IterDomain* id1) const; - //! Returns an iter domain that is the maximum expanded size of all iter //! domains the one provided maps to. Useful for opening loops to the correct //! iteration size. Not guarenteed to return the same ID every call, but is //! guarenteed to return iter domains in the same disjoint set. IterDomain* getConcreteMappedID(IterDomain* id) const; - kir::IterDomain* getConcreteMappedID(kir::IterDomain* id) const; - - // TODO: Would be great if we didn't need this, but we have nice functionality - // in iter_visitor that isn't moved over. Use of this is limited to indexing - // and this should definitely be removed by building out kernel ir to have - // better parity with fusion ir. - IterDomain* toFusion(kir::IterDomain* kir) const; - // Prints mapping information via Fusion IR std::string toString() const; private: - bool has_lowered_kir_ = false; - void mapIds(IterDomain* id0, IterDomain* id1); - //! Convert everything to lowered structures (kernel ir), as we will use - //! this class frequently during lowering. - void convertToKir(Fusion* fusion, GpuLower* gpu_lower); - private: MappingMode mapping_mode_ = MappingMode::LOOP; @@ -109,11 +93,6 @@ class TORCH_CUDA_CU_API ComputeAtMap { std::unordered_map>> disjoint_iter_set_maps_; - std::unordered_map< - kir::IterDomain*, - std::shared_ptr>> - kir_disjoint_iter_set_maps_; - // Keep a list of disjoint_iter_sets that's deterministic to iterate over std::deque>> disjoint_iter_sets_; @@ -125,12 +104,6 @@ class TORCH_CUDA_CU_API ComputeAtMap { // For each IterDomain set we will track how many concrete root domains were // used to generate the IterDomain std::unordered_map concrete_id_map_; - - std::unordered_map kir_concrete_id_map_; - - // Map kir::IterDomain* back to the fusion IR IterDomain*. - // TODO: Would be great if we didn't need this. 
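With the kernel-IR duplicates removed, the areMapped query above is purely a pointer-identity check on shared disjoint sets: two IterDomains are mapped iff their map entries point at the same underlying set object. A minimal sketch of that representation, using strings in place of IterDomain pointers:

#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>

using Id = std::string; // stand-in for IterDomain*
using DisjointSet = std::unordered_set<Id>;

struct IdMap {
  std::unordered_map<Id, std::shared_ptr<DisjointSet>> sets;

  // Mapped iff both ids are known and share the same set object.
  bool areMapped(const Id& a, const Id& b) const {
    if (a == b) {
      return true;
    }
    auto ita = sets.find(a);
    auto itb = sets.find(b);
    if (ita == sets.end() || itb == sets.end()) {
      return false;
    }
    return ita->second.get() == itb->second.get();
  }
};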
- std::unordered_map kir_2_fusion_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/dispatch.cpp b/torch/csrc/jit/codegen/cuda/dispatch.cpp index cea8b24e7ff..1702de93bdd 100644 --- a/torch/csrc/jit/codegen/cuda/dispatch.cpp +++ b/torch/csrc/jit/codegen/cuda/dispatch.cpp @@ -37,7 +37,7 @@ T* ptr(T* obj) { * } * * And therefore dispatch should never call: - * ptr(mutator)->handle(this->as()); + * ptr(mutator)->mutate(this->as()); */ template @@ -58,6 +58,10 @@ void Val::dispatch(T handler, Val* val) { break; } break; + case ValType::NamedScalar: + ptr(handler)->handle(val->as()); + return; + case ValType::IterDomain: ptr(handler)->handle(val->as()); return; @@ -67,8 +71,11 @@ void Val::dispatch(T handler, Val* val) { case ValType::TensorView: ptr(handler)->handle(val->as()); return; - case ValType::NamedScalar: - ptr(handler)->handle(val->as()); + case ValType::Predicate: + ptr(handler)->handle(val->as()); + return; + case ValType::TensorIndex: + ptr(handler)->handle(val->as()); return; default: break; @@ -79,12 +86,6 @@ void Val::dispatch(T handler, Val* val) { template void Expr::dispatch(T handler, Expr* expr) { switch (*(expr->getExprType())) { - case ExprType::Split: - ptr(handler)->handle(expr->as()); - return; - case ExprType::Merge: - ptr(handler)->handle(expr->as()); - return; case ExprType::UnaryOp: ptr(handler)->handle(expr->as()); return; @@ -103,6 +104,13 @@ void Expr::dispatch(T handler, Expr* expr) { case ExprType::BroadcastOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Split: + ptr(handler)->handle(expr->as()); + return; + case ExprType::Merge: + ptr(handler)->handle(expr->as()); + return; case ExprType::TransposeOp: ptr(handler)->handle(expr->as()); return; @@ -115,6 +123,34 @@ void Expr::dispatch(T handler, Expr* expr) { case ExprType::ViewOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Allocate: + ptr(handler)->handle(expr->as()); + return; + case ExprType::Sync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::InitMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::UpdateMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::ForLoop: + ptr(handler)->handle(expr->as()); + return; + case ExprType::IfThenElse: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridBroadcast: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridWelford: + ptr(handler)->handle(expr->as()); + return; default: TORCH_INTERNAL_ASSERT(false, "Unknown exprtype in dispatch!"); } @@ -148,6 +184,10 @@ void Val::constDispatch(T handler, const Val* val) { break; } break; + case ValType::NamedScalar: + ptr(handler)->handle(val->as()); + return; + case ValType::IterDomain: ptr(handler)->handle(val->as()); return; @@ -157,8 +197,11 @@ void Val::constDispatch(T handler, const Val* val) { case ValType::TensorView: ptr(handler)->handle(val->as()); return; - case ValType::NamedScalar: - ptr(handler)->handle(val->as()); + case ValType::Predicate: + ptr(handler)->handle(val->as()); + return; + case ValType::TensorIndex: + ptr(handler)->handle(val->as()); return; default: break; @@ -169,12 +212,6 @@ void Val::constDispatch(T handler, const Val* val) { template void Expr::constDispatch(T handler, const Expr* expr) { switch (*(expr->getExprType())) { - case ExprType::Split: - ptr(handler)->handle(expr->as()); - return; - case ExprType::Merge: - ptr(handler)->handle(expr->as()); - return; case 
ExprType::UnaryOp: ptr(handler)->handle(expr->as()); return; @@ -193,6 +230,13 @@ void Expr::constDispatch(T handler, const Expr* expr) { case ExprType::BroadcastOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Split: + ptr(handler)->handle(expr->as()); + return; + case ExprType::Merge: + ptr(handler)->handle(expr->as()); + return; case ExprType::TransposeOp: ptr(handler)->handle(expr->as()); return; @@ -205,6 +249,34 @@ void Expr::constDispatch(T handler, const Expr* expr) { case ExprType::ViewOp: ptr(handler)->handle(expr->as()); return; + + case ExprType::Allocate: + ptr(handler)->handle(expr->as()); + return; + case ExprType::Sync: + ptr(handler)->handle(expr->as()); + return; + case ExprType::InitMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::UpdateMagicZero: + ptr(handler)->handle(expr->as()); + return; + case ExprType::ForLoop: + ptr(handler)->handle(expr->as()); + return; + case ExprType::IfThenElse: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridReduction: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridBroadcast: + ptr(handler)->handle(expr->as()); + return; + case ExprType::GridWelford: + ptr(handler)->handle(expr->as()); + return; default: TORCH_INTERNAL_ASSERT(false, "Unknown exprtype in dispatch!"); } @@ -232,28 +304,42 @@ void Statement::constDispatch(T handler, const Statement* stmt) { * ptr(mutator)->mutate(this->as()); */ template -Statement* Val::mutatorDispatch(T mutator, Val* val) { +void Val::mutatorDispatch(T mutator, Val* val) { switch (*(val->getValType())) { case ValType::Scalar: switch (*(val->getDataType())) { case DataType::Bool: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case DataType::Double: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; case DataType::Int: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; default: break; } break; - case ValType::IterDomain: - return ptr(mutator)->mutate(val->as()); - case ValType::TensorDomain: - return ptr(mutator)->mutate(val->as()); - case ValType::TensorView: - return ptr(mutator)->mutate(val->as()); case ValType::NamedScalar: - return ptr(mutator)->mutate(val->as()); + ptr(mutator)->mutate(val->as()); + return; + + case ValType::IterDomain: + ptr(mutator)->mutate(val->as()); + return; + case ValType::TensorDomain: + ptr(mutator)->mutate(val->as()); + return; + case ValType::TensorView: + ptr(mutator)->mutate(val->as()); + return; + case ValType::Predicate: + ptr(mutator)->mutate(val->as()); + return; + case ValType::TensorIndex: + ptr(mutator)->mutate(val->as()); + return; default: break; } @@ -261,44 +347,87 @@ Statement* Val::mutatorDispatch(T mutator, Val* val) { } template -Statement* Expr::mutatorDispatch(T mutator, Expr* expr) { +void Expr::mutatorDispatch(T mutator, Expr* expr) { switch (*(expr->getExprType())) { - case ExprType::Split: - return ptr(mutator)->mutate(expr->as()); - case ExprType::Merge: - return ptr(mutator)->mutate(expr->as()); case ExprType::UnaryOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::BinaryOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::TernaryOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::ReductionOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case 
ExprType::WelfordOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::BroadcastOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + + case ExprType::Split: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::Merge: + ptr(mutator)->mutate(expr->as()); + return; case ExprType::TransposeOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::ShiftOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::GatherOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; case ExprType::ViewOp: - return ptr(mutator)->mutate(expr->as()); + ptr(mutator)->mutate(expr->as()); + return; + + case ExprType::Allocate: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::Sync: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::InitMagicZero: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::UpdateMagicZero: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::ForLoop: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::IfThenElse: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridReduction: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridBroadcast: + ptr(mutator)->mutate(expr->as()); + return; + case ExprType::GridWelford: + ptr(mutator)->mutate(expr->as()); + return; default: TORCH_INTERNAL_ASSERT(false, "Unknown exprtype in dispatch!"); } } template -Statement* Statement::mutatorDispatch(T mutator, Statement* stmt) { +void Statement::mutatorDispatch(T mutator, Statement* stmt) { if (stmt->isVal()) { - return ptr(mutator)->mutate(stmt->as()); + ptr(mutator)->mutate(stmt->as()); + return; } if (stmt->isExpr()) { - return ptr(mutator)->mutate(stmt->as()); + ptr(mutator)->mutate(stmt->as()); + return; } TORCH_INTERNAL_ASSERT(false, "Unknown stmttype in dispatch!"); } @@ -308,11 +437,11 @@ Statement* Statement::mutatorDispatch(T mutator, Statement* stmt) { * classes. Actual visitors/mutators should inhereit from these classes and call * ->dispatch(this) to avoid needing an explicit instantiation. 
*/ -template void Statement::dispatch(OptOutDispatch, Statement*); +template void Statement::dispatch(OptOutDispatch&, Statement*); template void Statement::dispatch(OptOutDispatch*, Statement*); -template void Val::dispatch(OptOutDispatch, Val*); +template void Val::dispatch(OptOutDispatch&, Val*); template void Val::dispatch(OptOutDispatch*, Val*); -template void Expr::dispatch(OptOutDispatch, Expr*); +template void Expr::dispatch(OptOutDispatch&, Expr*); template void Expr::dispatch(OptOutDispatch*, Expr*); template void Statement::dispatch(OptInDispatch, Statement*); @@ -322,33 +451,26 @@ template void Val::dispatch(OptInDispatch*, Val*); template void Expr::dispatch(OptInDispatch, Expr*); template void Expr::dispatch(OptInDispatch*, Expr*); -template void Statement::constDispatch(OptOutConstDispatch, const Statement*); +template void Statement::constDispatch(OptOutConstDispatch&, const Statement*); template void Statement::constDispatch(OptOutConstDispatch*, const Statement*); -template void Val::constDispatch(OptOutConstDispatch, const Val*); +template void Val::constDispatch(OptOutConstDispatch&, const Val*); template void Val::constDispatch(OptOutConstDispatch*, const Val*); -template void Expr::constDispatch(OptOutConstDispatch, const Expr*); +template void Expr::constDispatch(OptOutConstDispatch&, const Expr*); template void Expr::constDispatch(OptOutConstDispatch*, const Expr*); -template void Statement::constDispatch(OptInConstDispatch, const Statement*); +template void Statement::constDispatch(OptInConstDispatch&, const Statement*); template void Statement::constDispatch(OptInConstDispatch*, const Statement*); -template void Val::constDispatch(OptInConstDispatch, const Val*); +template void Val::constDispatch(OptInConstDispatch&, const Val*); template void Val::constDispatch(OptInConstDispatch*, const Val*); -template void Expr::constDispatch(OptInConstDispatch, const Expr*); +template void Expr::constDispatch(OptInConstDispatch&, const Expr*); template void Expr::constDispatch(OptInConstDispatch*, const Expr*); -template Statement* Statement::mutatorDispatch(OptOutMutator, Statement*); -template Statement* Statement::mutatorDispatch(OptOutMutator*, Statement*); -template Statement* Val::mutatorDispatch(OptOutMutator, Val*); -template Statement* Val::mutatorDispatch(OptOutMutator*, Val*); -template Statement* Expr::mutatorDispatch(OptOutMutator, Expr*); -template Statement* Expr::mutatorDispatch(OptOutMutator*, Expr*); - -template Statement* Statement::mutatorDispatch(OptInMutator, Statement*); -template Statement* Statement::mutatorDispatch(OptInMutator*, Statement*); -template Statement* Val::mutatorDispatch(OptInMutator, Val*); -template Statement* Val::mutatorDispatch(OptInMutator*, Val*); -template Statement* Expr::mutatorDispatch(OptInMutator, Expr*); -template Statement* Expr::mutatorDispatch(OptInMutator*, Expr*); +template void Statement::mutatorDispatch(OptOutMutator&, Statement*); +template void Statement::mutatorDispatch(OptOutMutator*, Statement*); +template void Val::mutatorDispatch(OptOutMutator&, Val*); +template void Val::mutatorDispatch(OptOutMutator*, Val*); +template void Expr::mutatorDispatch(OptOutMutator&, Expr*); +template void Expr::mutatorDispatch(OptOutMutator*, Expr*); void OptOutDispatch::handle(Statement* s) { Statement::dispatch(this, s); @@ -362,18 +484,6 @@ void OptOutDispatch::handle(Val* v) { Val::dispatch(this, v); } -void OptInDispatch::handle(Statement* s) { - Statement::dispatch(this, s); -} - -void OptInDispatch::handle(Expr* e) { - 
Expr::dispatch(this, e); -} - -void OptInDispatch::handle(Val* v) { - Val::dispatch(this, v); -} - void OptOutConstDispatch::handle(const Statement* s) { Statement::constDispatch(this, s); } @@ -386,46 +496,224 @@ void OptOutConstDispatch::handle(const Val* v) { Val::constDispatch(this, v); } -void OptInConstDispatch::handle(const Statement* s) { - Statement::constDispatch(this, s); +void OptInConstDispatch::unhandled(const Statement* stmt) { + if (stmt->isExpr()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getExprType().value(), "."); + } else if (stmt->isVal()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getValType().value(), "."); + } else { + TORCH_INTERNAL_ASSERT(false, "Unrecognized statement type."); + } } -void OptInConstDispatch::handle(const Expr* e) { - Expr::constDispatch(this, e); +void OptInDispatch::unhandled(Statement* stmt) { + if (stmt->isExpr()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getExprType().value(), "."); + } else if (stmt->isVal()) { + TORCH_INTERNAL_ASSERT( + false, "Handle not overriden for ", stmt->getValType().value(), "."); + } else { + TORCH_INTERNAL_ASSERT(false, "Unrecognized statement type."); + } } -void OptInConstDispatch::handle(const Val* v) { - Val::constDispatch(this, v); +// Vals +void OptOutConstDispatch::handle(const Bool* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const Double* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const Int* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const NamedScalar* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const IterDomain* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TensorDomain* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TensorView* stmt) { + unhandled(stmt); } -Statement* OptInMutator::mutate(Statement* s) { - return Statement::mutatorDispatch(this, s); +void OptOutConstDispatch::handle(const kir::Predicate* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::TensorIndex* stmt) { + unhandled(stmt); } -Statement* OptInMutator::mutate(Expr* e) { - return Expr::mutatorDispatch(this, e); +// Exprs +void OptOutConstDispatch::handle(const UnaryOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const BinaryOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TernaryOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ReductionOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const WelfordOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const BroadcastOp* stmt) { + unhandled(stmt); } -Statement* OptInMutator::mutate(Val* v) { - // If value is already mutated, return the mutation - if (mutations.find(v) != mutations.end()) - return mutations[v]; - return Val::mutatorDispatch(this, v); +void OptOutConstDispatch::handle(const Split* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const Merge* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const TransposeOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ShiftOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const GatherOp* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const ViewOp* stmt) { + unhandled(stmt); } -Statement* OptOutMutator::mutate(Statement* s) { - return Statement::mutatorDispatch(this, s); +void OptOutConstDispatch::handle(const 
kir::Allocate* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::Sync* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::InitMagicZero* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::UpdateMagicZero* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::ForLoop* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::IfThenElse* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridReduction* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridBroadcast* stmt) { + unhandled(stmt); +} +void OptOutConstDispatch::handle(const kir::GridWelford* stmt) { + unhandled(stmt); } -Statement* OptOutMutator::mutate(Expr* e) { - return Expr::mutatorDispatch(this, e); +void OptOutDispatch::unhandled(Statement*) {} + +// Vals +void OptOutDispatch::handle(Bool* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(Double* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(Int* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(NamedScalar* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(IterDomain* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TensorDomain* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TensorView* stmt) { + unhandled(stmt); } -Statement* OptOutMutator::mutate(Val* v) { - // If value is already mutated, return the mutation - if (mutations.find(v) != mutations.end()) - return mutations[v]; - return Val::mutatorDispatch(this, v); +void OptOutDispatch::handle(kir::Predicate* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::TensorIndex* stmt) { + unhandled(stmt); +} + +// Exprs +void OptOutDispatch::handle(UnaryOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(BinaryOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TernaryOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ReductionOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(WelfordOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(BroadcastOp* stmt) { + unhandled(stmt); +} + +void OptOutDispatch::handle(Split* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(Merge* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(TransposeOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ShiftOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(GatherOp* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(ViewOp* stmt) { + unhandled(stmt); +} + +void OptOutDispatch::handle(kir::Allocate* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::Sync* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::InitMagicZero* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::UpdateMagicZero* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::ForLoop* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::IfThenElse* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridReduction* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridBroadcast* stmt) { + unhandled(stmt); +} +void OptOutDispatch::handle(kir::GridWelford* stmt) { + unhandled(stmt); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/dispatch.h b/torch/csrc/jit/codegen/cuda/dispatch.h index c1be76eb950..6961ebd6a15 100644 --- a/torch/csrc/jit/codegen/cuda/dispatch.h +++ b/torch/csrc/jit/codegen/cuda/dispatch.h @@ -1,9 +1,9 @@ #pragma once -#include - +#include 
#include -#include + +#include #include @@ -48,7 +48,7 @@ namespace torch { namespace jit { namespace fuser { namespace cuda { - +class IrContainer; class Fusion; // Hierarchal dispatch functions for handle @@ -60,14 +60,13 @@ class Val; class IterDomain; class TensorDomain; class TensorView; + class Bool; class Double; class Int; class NamedScalar; // Exprs -class Split; -class Merge; class UnaryOp; class BinaryOp; class TernaryOp; @@ -79,9 +78,35 @@ class ShiftOp; class GatherOp; class ViewOp; +// Exprs +class Split; +class Merge; +class TransposeOp; +class ShiftOp; +class GatherOp; +class ViewOp; + +namespace kir { +class Predicate; +class TensorIndex; + +class Allocate; +class Sync; +class ForLoop; +class IfThenElse; +class GridReduction; +class GridBroadcast; +class GridWelford; +class InitMagicZero; +class UpdateMagicZero; +} // namespace kir + // By default, all IR nodes are handled in this dispatch, and will call an empty // function on all nodes. class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase { + protected: + virtual void unhandled(const Statement*) {} + public: // Hierarchal dispatch functions for handle virtual void handle(const Statement*); @@ -89,30 +114,47 @@ class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase { virtual void handle(const Val*); // Vals - virtual void handle(const IterDomain*) {} - virtual void handle(const TensorDomain*) {} - virtual void handle(const TensorView*) {} - virtual void handle(const Bool*) {} - virtual void handle(const Double*) {} - virtual void handle(const Int*) {} - virtual void handle(const NamedScalar*) {} + virtual void handle(const IterDomain* stmt); + virtual void handle(const TensorDomain* stmt); + virtual void handle(const TensorView* stmt); + virtual void handle(const Bool* stmt); + virtual void handle(const Double* stmt); + virtual void handle(const Int* stmt); + virtual void handle(const NamedScalar* stmt); + + virtual void handle(const kir::Predicate*); + virtual void handle(const kir::TensorIndex*); // Exprs - virtual void handle(const Split*) {} - virtual void handle(const Merge*) {} - virtual void handle(const UnaryOp*) {} - virtual void handle(const BinaryOp*) {} - virtual void handle(const TernaryOp*) {} - virtual void handle(const ReductionOp*) {} - virtual void handle(const WelfordOp*) {} - virtual void handle(const BroadcastOp*) {} - virtual void handle(const TransposeOp*) {} - virtual void handle(const ShiftOp*) {} - virtual void handle(const GatherOp*) {} - virtual void handle(const ViewOp*) {} + virtual void handle(const UnaryOp* stmt); + virtual void handle(const BinaryOp* stmt); + virtual void handle(const TernaryOp* stmt); + virtual void handle(const ReductionOp* stmt); + virtual void handle(const WelfordOp* stmt); + virtual void handle(const BroadcastOp* stmt); + + virtual void handle(const Split* stmt); + virtual void handle(const Merge* stmt); + virtual void handle(const TransposeOp* stmt); + virtual void handle(const ShiftOp* stmt); + virtual void handle(const GatherOp* stmt); + virtual void handle(const ViewOp* stmt); + + virtual void handle(const kir::Allocate*); + virtual void handle(const kir::Sync*); + virtual void handle(const kir::InitMagicZero*); + virtual void handle(const kir::UpdateMagicZero*); + virtual void handle(const kir::ForLoop*); + virtual void handle(const kir::IfThenElse*); + virtual void handle(const kir::GridReduction*); + virtual void handle(const kir::GridBroadcast*); + virtual void handle(const kir::GridWelford*); }; class TORCH_CUDA_CU_API 
OptOutDispatch : public PolymorphicBase { + protected: + virtual void unhandled(Statement*); + public: // Hierarchal dispatch functions for handle virtual void handle(Statement*); @@ -120,190 +162,88 @@ class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase { virtual void handle(Val*); // Vals - virtual void handle(IterDomain*) {} - virtual void handle(TensorDomain*) {} - virtual void handle(TensorView*) {} - virtual void handle(Bool*) {} - virtual void handle(Double*) {} - virtual void handle(Int*) {} - virtual void handle(NamedScalar*) {} + virtual void handle(Bool* stmt); + virtual void handle(Double* stmt); + virtual void handle(Int* stmt); + virtual void handle(NamedScalar* stmt); + virtual void handle(IterDomain* stmt); + virtual void handle(TensorDomain* stmt); + virtual void handle(TensorView* stmt); + + virtual void handle(kir::Predicate*); + virtual void handle(kir::TensorIndex*); // Exprs - virtual void handle(Split*) {} - virtual void handle(Merge*) {} - virtual void handle(UnaryOp*) {} - virtual void handle(BinaryOp*) {} - virtual void handle(TernaryOp*) {} - virtual void handle(ReductionOp*) {} - virtual void handle(WelfordOp*) {} - virtual void handle(BroadcastOp*) {} - virtual void handle(TransposeOp*) {} - virtual void handle(ShiftOp*) {} - virtual void handle(GatherOp*) {} - virtual void handle(ViewOp*) {} + virtual void handle(UnaryOp* stmt); + virtual void handle(BinaryOp* stmt); + virtual void handle(TernaryOp* stmt); + virtual void handle(ReductionOp* stmt); + virtual void handle(WelfordOp* stmt); + virtual void handle(BroadcastOp* stmt); + + virtual void handle(Split* stmt); + virtual void handle(Merge* stmt); + virtual void handle(TransposeOp* stmt); + virtual void handle(ShiftOp* stmt); + virtual void handle(GatherOp* stmt); + virtual void handle(ViewOp* stmt); + + virtual void handle(kir::Allocate* stmt); + virtual void handle(kir::Sync* stmt); + virtual void handle(kir::InitMagicZero* stmt); + virtual void handle(kir::UpdateMagicZero* stmt); + virtual void handle(kir::ForLoop* stmt); + virtual void handle(kir::IfThenElse* stmt); + virtual void handle(kir::GridReduction* stmt); + virtual void handle(kir::GridBroadcast* stmt); + virtual void handle(kir::GridWelford* stmt); }; -class TORCH_CUDA_CU_API OptInConstDispatch : public PolymorphicBase { +class TORCH_CUDA_CU_API OptInConstDispatch : public OptOutConstDispatch { public: - // Hierarchal dispatch functions for handle - virtual void handle(const Statement*); - virtual void handle(const Expr*); - virtual void handle(const Val*); + using OptOutConstDispatch::handle; - // Vals - virtual void handle(const IterDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for IterDomain."); - } - virtual void handle(const TensorDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorDomain."); - } - virtual void handle(const TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorView."); - } - virtual void handle(const Bool*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool."); - } - virtual void handle(const Double*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double."); - } - virtual void handle(const Int*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int."); - } - virtual void handle(const NamedScalar*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for NamedScalar."); - } - - // Exprs - virtual void handle(const Split*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Split."); - } - virtual void 
handle(const Merge*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Merge."); - } - virtual void handle(const UnaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for UnaryOp."); - } - virtual void handle(const BinaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BinaryOp."); - } - virtual void handle(const WelfordOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp."); - } - virtual void handle(const TernaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TernaryOp."); - } - virtual void handle(const ReductionOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ReductionOp."); - } - virtual void handle(const BroadcastOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp."); - } - virtual void handle(const TransposeOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp."); - } - virtual void handle(const ShiftOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp."); - } - virtual void handle(const GatherOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp."); - } - virtual void handle(const ViewOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ViewOp."); - } + protected: + virtual void unhandled(const Statement* stmt) final; }; -class TORCH_CUDA_CU_API OptInDispatch : public PolymorphicBase { +class TORCH_CUDA_CU_API OptInDispatch : public OptOutDispatch { public: - // Hierarchal dispatch functions for handle - virtual void handle(Statement* s); - virtual void handle(Expr* e); - virtual void handle(Val* v); + using OptOutDispatch::handle; - // Vals - virtual void handle(IterDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for IterDomain."); - } - virtual void handle(TensorDomain*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorDomain."); - } - virtual void handle(TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TensorView."); - } - virtual void handle(Bool*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Bool."); - } - virtual void handle(Double*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Double."); - } - virtual void handle(Int*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Int."); - } - virtual void handle(NamedScalar*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for NamedScalar."); - } - - // Exprs - virtual void handle(Split*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Split."); - } - virtual void handle(Merge*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for Merge."); - } - virtual void handle(UnaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for UnaryOp."); - } - virtual void handle(BinaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BinaryOp."); - } - virtual void handle(TernaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TernaryOp."); - } - virtual void handle(ReductionOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ReductionOp."); - } - virtual void handle(WelfordOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for WelfordOp."); - } - virtual void handle(BroadcastOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for BroadcastOp."); - } - virtual void handle(TransposeOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for TransposeOp."); - } - virtual void handle(ShiftOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ShiftOp."); - } - virtual void 
handle(GatherOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for GatherOp."); - } - virtual void handle(ViewOp*) { - TORCH_INTERNAL_ASSERT(false, "Handle not overriden for ViewOp."); - } + protected: + virtual void unhandled(Statement* stmt) final; }; +// Class to perform mutations on Fusion IR. Exprs can simply be redefined, but +// when mutating values they have to be registered through registerMutation so +// that exprs can detect there's been a muatation and know to modify all +// instances of that Val. This means each Val should be mutated "consistently". +// Otherwise behavior may be difficult to understand as it depends on which +// order mutate is called in. This class expects user to topologically call the +// statments of interest so inputs are called and mutated before exprs depending +// on them. +// +// Warning: TensorViews need to be treated carefully. As we don't generally +// register their mutation when their tensor domains only change. If a TV needs +// to be swapped out, it needs to be registered as a "proper" mutation like +// other vals, on top of TensorDomain being updated in the mutated TensorView. +// // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase { public: // Hierarchal dispatch functions for handle - virtual Statement* mutate(Statement* s); - virtual Statement* mutate(Expr* e); - virtual Statement* mutate(Val* v); + virtual void mutate(Statement* s); + virtual void mutate(Expr* e); + virtual void mutate(Val* v); - // We always want to dispatch through a Val, so we can capture and dispatch - // correctly members of nodes like Split->TensorDomain If we don't call the - // below function or manually cast to use mutate(Val* v) we can't intercept - // and mutate by capturing mutate(Val* v), which is what we do when we want to - // replace all instances of a value. 
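To illustrate the registerMutation contract described in the OptOutMutator comment above, here is a toy version of the pattern, not the real class: value swaps are recorded in a registration map, and expression visits consult the map on their inputs when statements are walked in topological order. The exact overwrite/assert behavior of the real registerMutation lives outside this hunk, so the one-mutation-per-pass policy below is an assumption.

#include <string>
#include <unordered_map>
#include <vector>

using Val = std::string; // stand-in for Val*

struct ToyMutator {
  std::unordered_map<Val, Val> mutations;

  void registerMutation(const Val& val, const Val& mutation) {
    // Keep only the first registration per value in a pass (assumed policy).
    mutations.emplace(val, mutation);
  }

  Val maybeMutated(const Val& val) const {
    auto it = mutations.find(val);
    return it == mutations.end() ? val : it->second;
  }

  // "Expr" visit: rebuild the input list, substituting any mutated values.
  std::vector<Val> mutateExprInputs(const std::vector<Val>& inputs) const {
    std::vector<Val> out;
    out.reserve(inputs.size());
    for (const auto& in : inputs) {
      out.push_back(maybeMutated(in));
    }
    return out;
  }
};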
- Statement* mutateAsVal(Val* v) { - return mutate(v); - } + void registerMutation(Val* val, Val* mutation); - void registerMutation(Val* val, Val* mutation) { - TORCH_INTERNAL_ASSERT( - mutations.find(val) == mutations.end(), - " The same value is incorrectly being mutated twice.", - " One mutation per mutation pass is allowed."); - mutations[val] = mutation; + Val* maybeMutated(Val* val) { + if (mutations.find(val) == mutations.end()) { + return val; + } + return mutations.at(val); } std::unordered_map mutations; @@ -311,105 +251,44 @@ class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase { //****Functions below defined in mutator.cpp***** // Vals - virtual Statement* mutate(IterDomain*); - virtual Statement* mutate(TensorDomain*); - virtual Statement* mutate(TensorView*); - virtual Statement* mutate(Bool*); - virtual Statement* mutate(Double*); - virtual Statement* mutate(Int*); - virtual Statement* mutate(NamedScalar*); + virtual void mutate(Bool*); + virtual void mutate(Double*); + virtual void mutate(Int*); + virtual void mutate(NamedScalar*); + virtual void mutate(IterDomain*); + virtual void mutate(TensorDomain*); + virtual void mutate(TensorView*); + + virtual void mutate(kir::Predicate*); + virtual void mutate(kir::TensorIndex*); // Exprs - virtual Statement* mutate(Split*); - virtual Statement* mutate(Merge*); - virtual Statement* mutate(UnaryOp*); - virtual Statement* mutate(BinaryOp*); - virtual Statement* mutate(TernaryOp*); - virtual Statement* mutate(ReductionOp*); - virtual Statement* mutate(WelfordOp*); - virtual Statement* mutate(BroadcastOp*); - virtual Statement* mutate(TransposeOp*); - virtual Statement* mutate(ShiftOp*); - virtual Statement* mutate(GatherOp*); - virtual Statement* mutate(ViewOp*); -}; + virtual void mutate(UnaryOp*); + virtual void mutate(BinaryOp*); + virtual void mutate(TernaryOp*); + virtual void mutate(ReductionOp*); + virtual void mutate(WelfordOp*); + virtual void mutate(BroadcastOp*); -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API OptInMutator : public PolymorphicBase { - public: - std::unordered_map mutations; + virtual void mutate(Split*); + virtual void mutate(Merge*); + virtual void mutate(TransposeOp*); + virtual void mutate(ShiftOp*); + virtual void mutate(GatherOp*); + virtual void mutate(ViewOp*); - public: - void registerMutation(Val* val, Val* mutation) { - TORCH_INTERNAL_ASSERT( - mutations.find(val) == mutations.end(), - " The same value is incorrectly being mutated twice.", - " One mutation per mutation pass is allowed."); - mutations[val] = mutation; - } + virtual void mutate(kir::Allocate*); + virtual void mutate(kir::Sync*); + virtual void mutate(kir::InitMagicZero*); + virtual void mutate(kir::UpdateMagicZero*); + virtual void mutate(kir::ForLoop*); + virtual void mutate(kir::IfThenElse*); + virtual void mutate(kir::GridReduction*); + virtual void mutate(kir::GridBroadcast*); + virtual void mutate(kir::GridWelford*); - // Hierarchal dispatch functions for mutate - virtual Statement* mutate(Statement*); - virtual Statement* mutate(Expr*); - virtual Statement* mutate(Val*); - - // Vals - virtual Statement* mutate(IterDomain*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for IterDomain."); - } - virtual Statement* mutate(TensorDomain*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TensorDomain."); - } - virtual Statement* mutate(TensorView*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TensorView."); - } - virtual Statement* mutate(Bool*) { - 
TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Bool."); - } - virtual Statement* mutate(Int*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Int."); - } - virtual Statement* mutate(NamedScalar*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for NamedScalar."); - } - - // Exprs - virtual Statement* mutate(Split*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Split."); - } - virtual Statement* mutate(Merge*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for Merge."); - } - virtual Statement* mutate(UnaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for UnaryOp."); - } - virtual Statement* mutate(BinaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for BinaryOp."); - } - virtual Statement* mutate(TernaryOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TernaryOp."); - } - virtual Statement* mutate(ReductionOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ReductionOp."); - } - virtual Statement* mutate(WelfordOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for WelfordOp."); - } - virtual Statement* mutate(BroadcastOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for BroadcastOp."); - } - virtual Statement* mutate(TransposeOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for TransposeOp."); - } - virtual Statement* mutate(ShiftOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ShiftOp."); - } - virtual Statement* mutate(GatherOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for GatherOp."); - } - virtual Statement* mutate(ViewOp*) { - TORCH_INTERNAL_ASSERT(false, "Mutate not overriden for ViewOp."); - } + protected: + void removeExpr(IrContainer*, Expr*); }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp b/torch/csrc/jit/codegen/cuda/evaluator_common.cpp index 288dbb198b0..09481319569 100644 --- a/torch/csrc/jit/codegen/cuda/evaluator_common.cpp +++ b/torch/csrc/jit/codegen/cuda/evaluator_common.cpp @@ -1,9 +1,11 @@ -#include #include #include +#include #include #include +#include + namespace torch { namespace jit { namespace fuser { @@ -68,8 +70,8 @@ std::vector makeSortedEvaluationList(std::vector input) { //! Kernel IR utility, collects all the symbolic integers //! used in allocation nodes. void collectBufferSizes( - std::vector& into, - const std::vector& exprs) { + std::vector& into, + const std::vector& exprs) { for (auto expr : exprs) { if (auto allocate = dynamic_cast(expr)) { into.push_back(allocate->size()); @@ -82,56 +84,44 @@ void collectBufferSizes( } } -//! Kernel IR utility, collects all the kir symbolic +//! Kernel IR utility, collects all the kernel symbolic //! integers we will need at runtime, i.e. after the //! generated cuda kernel has already been compiled. //! The values are to be used for runtime logic, like //! `computeLaunchparams`. 
-std::vector collectRuntimeUsedIntegers( - Fusion* fusion, - GpuLower* lower) { - std::vector ret; - +std::vector collectRuntimeUsedIntegers(kir::Kernel* kernel) { + std::vector ret; + auto all_tvs = ir_utils::allTvs(kernel); // Collect extent and integer inputs - for (auto val : fusion->usedMathVals()) { - auto kir_val = lower->lowerValue(val); - if (auto kir_tv = dynamic_cast(kir_val)) { - for (auto id : kir_tv->domain()->domain()) { - ret.push_back(id->extent()); - } - } else if (val->isFusionInput()) { - if (kir_val->isA()) { - ret.push_back(kir_val); - } + for (auto tv : all_tvs) { + for (auto id : tv->domain()->domain()) { + ret.push_back(id->extent()); + } + } + for (auto inp : kernel->inputs()) { + if (inp->isA()) { + ret.push_back(inp); } } - // Collect allocation sizes: - collectBufferSizes(ret, lower->kernel()->topLevelExprs()); - + collectBufferSizes(ret, kernel->topLevelExprs()); return makeSortedEvaluationList(ret); } -//! Fusion IR utility, collects all the fusionIR symbolic -//! integers we will need at runtime, i.e. after the -//! generated cuda kernel has already been compiled. -//! The values are to be used for runtime logic, like -//! `canSchedule` in heuristic look up. + std::vector collectRuntimeUsedIntegers(Fusion* fusion) { std::vector ret; - + auto all_tvs = ir_utils::allTvs(fusion); // Collect extent and integer inputs - for (auto val : fusion->usedMathVals()) { - if (auto tv = dynamic_cast(val)) { - for (auto id : tv->domain()->domain()) { - ret.push_back(id->extent()); - } - } else if (val->isFusionInput()) { - if (val->isA()) { - ret.push_back(val); - } + for (auto tv : all_tvs) { + for (auto id : tv->domain()->domain()) { + ret.push_back(id->extent()); + } + } + for (auto inp : fusion->inputs()) { + if (inp->isA()) { + ret.push_back(inp); } } - return makeSortedEvaluationList(ret); } @@ -140,7 +130,7 @@ std::vector collectRuntimeUsedIntegers(Fusion* fusion) { template void PrecomputedIntegersBase::initializeValueList( typename IRContext::EVALUATOR_TYPE& const_evaluator, - const std::vector& sorted_value_list) { + const std::vector& sorted_value_list) { // Initialize workspace num_of_values_ = sorted_value_list.size(); defined_ = std::vector(num_of_values_, false); @@ -161,7 +151,7 @@ void PrecomputedIntegersBase::initializeValueList( template c10::optional PrecomputedIntegersBase::getMaybeValueFor( - const IR_VAL* val) { + const Val* val) { auto index = val->evaluatorIndex(); if (index < 0) { return c10::nullopt; @@ -172,6 +162,17 @@ c10::optional PrecomputedIntegersBase::getMaybeValueFor( return values_[index]; } +template +void PrecomputedIntegersBase::print() const { + std::cout << "Precomputed Integers:\n"; + for (auto i : c10::irange(symbols_.size())) { + if (defined_[i]) { + std::cout << symbols_[i]->toInlineString() << " = " << values_[i] + << std::endl; + } + } +} + template void PrecomputedIntegersBase::evaluate() { FUSER_PERF_SCOPE("PrecomputedIntegers::Evaluate"); @@ -208,10 +209,9 @@ NaiveIntegerMachine::NaiveIntegerMachine( for (auto val : precomputed_integers_.symbols_) { auto def = val->definition(); if (def) { - if (auto uop = dynamic_cast(def)) { + if (auto uop = dynamic_cast(def)) { makeUnaryOp(uop); - } else if ( - auto bop = dynamic_cast(def)) { + } else if (auto bop = dynamic_cast(def)) { makeBinaryOp(bop); } else { TORCH_INTERNAL_ASSERT(false, "Unsupported expr"); @@ -234,8 +234,7 @@ void NaiveIntegerMachine::run() { } template -void NaiveIntegerMachine::makeUnaryOp( - typename IRContext::UNARY_OP_TYPE* uop) { +void 
NaiveIntegerMachine::makeUnaryOp(UnaryOp* uop) { int in = uop->inputs()[0]->evaluatorIndex(); int out = uop->outputs()[0]->evaluatorIndex(); TORCH_INTERNAL_ASSERT(in >= 0, "Integer Machine: unknown input: ", uop); @@ -249,8 +248,7 @@ void NaiveIntegerMachine::makeUnaryOp( } template -void NaiveIntegerMachine::makeBinaryOp( - typename IRContext::BINARY_OP_TYPE* bop) { +void NaiveIntegerMachine::makeBinaryOp(BinaryOp* bop) { int in0 = bop->inputs()[0]->evaluatorIndex(); int in1 = bop->inputs()[1]->evaluatorIndex(); int out = bop->outputs()[0]->evaluatorIndex(); @@ -377,11 +375,8 @@ void NaiveIntegerMachine::runBinaryOp(int index) { precomputed_integers_.defined_[dest_index] = true; } -KernelPrecomputedIntegers::KernelPrecomputedIntegers( - Fusion* fusion, - GpuLower& lower) - : lower_(&lower) { - loadSymbols(collectRuntimeUsedIntegers(fusion, lower_)); +KernelPrecomputedIntegers::KernelPrecomputedIntegers(kir::Kernel* kernel) { + loadSymbols(collectRuntimeUsedIntegers(kernel)); kir::ExpressionEvaluator evaluator; initializeValueList(evaluator, symbols()); initializeNamedScalars(); @@ -389,11 +384,11 @@ KernelPrecomputedIntegers::KernelPrecomputedIntegers( } void KernelPrecomputedIntegers::bindTensorMetaData( - kir::TensorView* tv, + TensorView* tv, const at::Tensor& at_tensor) { - std::vector> ret; + std::vector> ret; const auto root_domain = - kir::TensorDomain::noReductions(tv->domain()->rootDomain()); + TensorDomain::noReductions(tv->domain()->getRootDomain()); TORCH_INTERNAL_ASSERT( at_tensor.ndimension() == static_cast(root_domain.size()), "Something went wrong configuring launch. Inputs do not match."); @@ -411,7 +406,7 @@ namespace { //! and returns the corresponding parallel type if a match //! is found. c10::optional getMaybeThreadSizeParallelType( - kir::NamedScalar* named_scalar) { + NamedScalar* named_scalar) { auto& var_name = named_scalar->name(); for (auto ptype : kParallelTypeThreads) { if (var_name == stringifyThreadSize(ptype)) { @@ -425,7 +420,7 @@ c10::optional getMaybeThreadSizeParallelType( void KernelPrecomputedIntegers::initializeNamedScalars() { for (auto val : symbols()) { - if (auto named_scalar = dynamic_cast(val)) { + if (auto named_scalar = dynamic_cast(val)) { auto maybe_parallel_type = getMaybeThreadSizeParallelType(named_scalar); if (maybe_parallel_type.has_value()) { auto& index_list = @@ -440,17 +435,17 @@ void KernelPrecomputedIntegers::initializeNamedScalars() { } void KernelPrecomputedIntegers::bindKernelInputs( + kir::Kernel* kernel, const at::ArrayRef& aten_inputs) { if (hasValidValues()) { invalidate(); } - auto kernel = lower_->kernel(); const auto& inputs = kernel->inputs(); for (const auto i : c10::irange(inputs.size())) { const auto input = inputs[i]; - if (auto tensor_input = dynamic_cast(input)) { + if (auto tensor_input = dynamic_cast(input)) { const auto aten_tensor = aten_inputs[i].toTensor(); bindTensorMetaData(tensor_input, aten_tensor); } else if (input->isScalar() && input->dtype() == DataType::Int) { diff --git a/torch/csrc/jit/codegen/cuda/evaluator_common.h b/torch/csrc/jit/codegen/cuda/evaluator_common.h index 0c16e2a8b04..7cbe37c602b 100644 --- a/torch/csrc/jit/codegen/cuda/evaluator_common.h +++ b/torch/csrc/jit/codegen/cuda/evaluator_common.h @@ -35,18 +35,14 @@ class ExpressionEvaluator; //! 
Context for using generic logic on FusionIR class FusionIRContext { public: - using VAL_TYPE = Val; - using EXPR_TYPE = Expr; using TV_TYPE = TensorView; using EVALUATOR_TYPE = ExpressionEvaluator; - using BINARY_OP_TYPE = BinaryOp; - using UNARY_OP_TYPE = UnaryOp; - static BinaryOpType getOpType(BINARY_OP_TYPE* bop) { + static BinaryOpType getOpType(BinaryOp* bop) { return bop->getBinaryOpType(); } - static UnaryOpType getOpType(UNARY_OP_TYPE* uop) { + static UnaryOpType getOpType(UnaryOp* uop) { return uop->getUnaryOpType(); } }; @@ -54,19 +50,14 @@ class FusionIRContext { //! Context for using generic logic on KernelIR class KernelIRContext { public: - using VAL_TYPE = kir::Val; - using EXPR_TYPE = kir::Expr; - using TV_TYPE = kir::TensorView; using EVALUATOR_TYPE = kir::ExpressionEvaluator; - using BINARY_OP_TYPE = kir::BinaryOp; - using UNARY_OP_TYPE = kir::UnaryOp; - static BinaryOpType getOpType(BINARY_OP_TYPE* bop) { - return bop->operation(); + static BinaryOpType getOpType(BinaryOp* bop) { + return bop->getBinaryOpType(); } - static UnaryOpType getOpType(UNARY_OP_TYPE* uop) { - return uop->operation(); + static UnaryOpType getOpType(UnaryOp* uop) { + return uop->getUnaryOpType(); } }; @@ -97,10 +88,10 @@ class NaiveIntegerMachine { private: //! Convert an unary IR expr to an instruction - void makeUnaryOp(typename IRContext::UNARY_OP_TYPE* uop); + void makeUnaryOp(UnaryOp* uop); //! Convert an binary IR expr to an instruction - void makeBinaryOp(typename IRContext::BINARY_OP_TYPE* bop); + void makeBinaryOp(BinaryOp* bop); //! Create an empty instruction with all default values //! and place it at the end of the instruction buffer. @@ -169,11 +160,6 @@ class NaiveIntegerMachine { //! integers and store them in the workspace ahead of time. template class PrecomputedIntegersBase { - using IR_UNARY_OP = typename IRContext::UNARY_OP_TYPE; - using IR_BINARY_OP = typename IRContext::BINARY_OP_TYPE; - using IR_VAL = typename IRContext::VAL_TYPE; - using IR_EXPR = typename IRContext::EXPR_TYPE; - using IR_TV = typename IRContext::TV_TYPE; using INTEGER_MACHINE = NaiveIntegerMachine; public: @@ -190,7 +176,10 @@ class PrecomputedIntegersBase { //! Returns value for the given IR node if it's stored //! in the workspace and has been evaluated. - c10::optional getMaybeValueFor(const IR_VAL* val); + c10::optional getMaybeValueFor(const Val* val); + + //! Debugging helper, prints all the currently known values + void print() const; protected: //! Initialize the workspace before first use. @@ -198,7 +187,7 @@ class PrecomputedIntegersBase { //! been topologically sorted. void initializeValueList( typename IRContext::EVALUATOR_TYPE& evaluator, - const std::vector& sorted_value_list); + const std::vector& sorted_value_list); //! Bind concrete value to the given index //! if the index is valid. @@ -215,12 +204,12 @@ class PrecomputedIntegersBase { void invalidate(); //! Interface for subclasses to access symbols_ - void loadSymbols(std::vector symbols) { + void loadSymbols(std::vector symbols) { symbols_ = std::move(symbols); } //! Interface for subclasses to access symbols_ - std::vector& symbols() { + std::vector& symbols() { return symbols_; } @@ -267,7 +256,7 @@ class PrecomputedIntegersBase { std::vector values_; //! Stores the IR nodes corresponding to each index. - std::vector symbols_; + std::vector symbols_; //! An internal log to keep track of all the bindings //! used in each evaluation cycle. 
To be used for @@ -308,12 +297,14 @@ class KernelPrecomputedIntegers public: using ParallelExtentMap = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; - KernelPrecomputedIntegers(Fusion* fusion, GpuLower& lower); + KernelPrecomputedIntegers(kir::Kernel* kernel); //! Bind concrete values from fusion runtime inputs - void bindKernelInputs(const at::ArrayRef& aten_inputs); + void bindKernelInputs( + kir::Kernel* kernel, + const at::ArrayRef& aten_inputs); //! Bind concrete values from launch constraints void bindParallelExtents( @@ -326,7 +317,7 @@ class KernelPrecomputedIntegers void bindConcreteParallelTypeValue(ParallelType pt, int64_t value); private: - void bindTensorMetaData(kir::TensorView* tv, const at::Tensor& at_tensor); + void bindTensorMetaData(TensorView* tv, const at::Tensor& at_tensor); //! Iterate through all the named scalars corresponding //! to thread sizes and pre-group them by their parallel @@ -334,8 +325,6 @@ class KernelPrecomputedIntegers void initializeNamedScalars(); private: - GpuLower* lower_ = nullptr; - //! Contains all the named scalars correspond //! to thread size of each parallel type. std::unordered_map>, TypeHash> diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp index 647cf4ec0e2..5e6f2d9375e 100644 --- a/torch/csrc/jit/codegen/cuda/executor.cpp +++ b/torch/csrc/jit/codegen/cuda/executor.cpp @@ -1,3 +1,4 @@ + #include #include @@ -8,21 +9,11 @@ #include #include #include -#include #include #include #include #include - -#ifndef AT_PER_OPERATOR_HEADERS -#include -#include -#else -#include -#include -#endif - #include #include #include @@ -108,8 +99,6 @@ void FusionExecutor::debugCompileFusionFromStr( const std::string& name, int id, CompileOptions options) { - fusion_ = *fusion; - FusionGuard fg(&fusion_); options_ = options; if (isDebugDumpEnabled(DebugDumpOption::FusionIr)) { @@ -126,11 +115,12 @@ void FusionExecutor::debugCompileFusionFromStr( << std::endl; } - setUsedTVs(); + lowered_ = std::make_unique(fusion); + const auto kernel = lowered_->kernel(); + fusion_ = lowered_->kernel(); fusion_id_ = id; - lowered_ = GpuLower(&fusion_); - const auto kernel = lowered_.kernel(); + setUsedTVs(); if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) { kernel->print(); @@ -155,9 +145,9 @@ void FusionExecutor::debugCompileFusionFromStr( void FusionExecutor::compileFusion( Fusion* fusion, - CompileOptions options, const at::ArrayRef& inputs, - const LaunchParams& launch_constraints) { + const LaunchParams& launch_constraints, + CompileOptions options) { FUSER_PERF_SCOPE("compileFusion"); TORCH_INTERNAL_ASSERT( @@ -175,9 +165,6 @@ void FusionExecutor::compileFusion( fusion->printMath(); } - // Clone the fusion so we can store it - fusion_ = *fusion; - FusionGuard fg(&fusion_); options_ = options; c10::DeviceGuard dg(options_.device); @@ -187,11 +174,12 @@ void FusionExecutor::compileFusion( max_device_smem = properties->sharedMemPerBlock; warp_size_ = properties->warpSize; - setUsedTVs(); + lowered_ = std::make_unique(fusion); + const auto kernel = lowered_->kernel(); + fusion_ = lowered_->kernel()->as(); fusion_id_ = ++fusion_id_counter_; - lowered_ = GpuLower(&fusion_); - const auto kernel = lowered_.kernel(); + setUsedTVs(); if (isDebugDumpEnabled(DebugDumpOption::KernelIr)) { kernel->print(); @@ -216,7 +204,7 @@ void FusionExecutor::compileFusion( std::stringstream ss; ss << "Allocations must be based on constant integers for local memory. 
However, found: "; for (auto alloc : kernel_summary.dynamic_lmem_allocations) { - ss << toString(alloc->buffer(), false) << ", "; + ss << alloc->buffer()->toString() << ", "; } ss << " have dynamic allocations but are placed in local memory."; TORCH_INTERNAL_ASSERT(false, ss.str()); @@ -233,6 +221,8 @@ void FusionExecutor::compileFusion( block_size > 0, "launch param inferred block size < 0"); } + block_size_high_water_mark = + block_size.has_value() ? block_size.value() : block_size_high_water_mark; compiled_kernel_ = executor_utils::nvrtcCompile( structured_code, (kernelNamespace() + "::" + kernelName()).c_str(), @@ -245,8 +235,8 @@ void FusionExecutor::compileFusion( namespace { at::Tensor inferAndAlloc( - const kir::TensorView* tv, - const std::vector& sizes, + const TensorView* tv, + const std::vector& sizes, kir::ExpressionEvaluator& expr_eval, const CompileOptions& options, bool zero_init = false) { @@ -260,9 +250,11 @@ at::Tensor inferAndAlloc( TORCH_INTERNAL_ASSERT( inferred_val.has_value(), "Could not launch kernel as program could not infer ", - kir::toString(size), - " for the buffer ", - kir::toString(tv)); + size->toString(), + "(", + size->name(), + ") for the buffer ", + tv->toString()); inferred_sizes.push_back(inferred_val.value()); } @@ -283,19 +275,20 @@ at::Tensor inferAndAlloc( } at::Tensor inferAndAllocOutput( - const kir::TensorView* tv, + const TensorView* tv, kir::ExpressionEvaluator& expr_eval, const CompileOptions& options, bool zero_init = false) { const auto domain = tv->domain(); - const auto maybe_rfactor_domain = - domain->hasRFactor() ? domain->rfactorDomain() : domain->rootDomain(); + const auto maybe_rfactor_domain = domain->hasRFactor() + ? domain->getRFactorDomain() + : domain->getRootDomain(); - std::vector sizes; + std::vector sizes; for (const auto id : maybe_rfactor_domain) { if (id->isReduction() || id->isStride() || - id->iterType() == IterType::BroadcastWithoutStride) { + id->getIterType() == IterType::BroadcastWithoutStride) { continue; } sizes.push_back(id->extent()); @@ -348,8 +341,7 @@ LaunchParams FusionExecutor::computeLaunchParams( auto data_cache = compileTimeDataCache(); - auto& lower = lowered_; - + auto lower = lowered_.get(); auto& used_tvs = getUsedTVs(); auto parallel_binding_ids_entry = executor_utils::caching::ExecutorCompileTimeEntry< @@ -364,9 +356,8 @@ LaunchParams FusionExecutor::computeLaunchParams( auto parallel_iter_extent_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::ParallelIterExtentMap>( - data_cache, [¶llel_binding_ids, &lower]() { - return executor_utils::getParallelIterExtents( - lower, parallel_binding_ids); + data_cache, [¶llel_binding_ids]() { + return executor_utils::getParallelIterExtents(parallel_binding_ids); }); auto& parallel_iter_extents = parallel_iter_extent_entry.get(); @@ -385,7 +376,7 @@ LaunchParams FusionExecutor::computeLaunchParams( executor_utils::caching::WarpPaddedParallelExtents>( data_cache, [¶llel_binding_ids, &lower]() { return executor_utils::getWarpPaddedExtentsInfo( - lower, parallel_binding_ids); + lower->kernel(), parallel_binding_ids); }); auto& warp_padded_extent_set = warp_padded_parallel_entry.get().warp_padded_extent_set; @@ -446,7 +437,9 @@ LaunchParams FusionExecutor::computeLaunchParams( auto val = expr_eval.evaluate(extent); TORCH_INTERNAL_ASSERT( val.has_value(), - "Tried to evaluate the extent of ", + "Tried to evaluate the extent, ", + extent->toInlineString(), + " for the ptype: ", p_type, " to set launch bounds but could not."); @@ 
-481,14 +474,15 @@ LaunchParams FusionExecutor::computeLaunchParams( expr_eval.precomputedIntegers()->evaluate(); } - const auto kernel = lowered_.kernel(); + const auto kernel = lowered_->kernel(); const auto& kernel_summary = kernel->summary(); // Calculate Dynamic Shared Memory Size // Add workspace for reduction and broadcast uint64_t reduction_broadcast_workspace = 0; const bool has_workspace = kernel_summary.has_block_reductions || - kernel_summary.has_grid_reductions || kernel_summary.has_block_broadcasts; + kernel_summary.has_grid_reductions || + kernel_summary.has_block_broadcasts || kernel_summary.has_grid_broadcasts; if (has_workspace && kernel_summary.largest_smem_data_type != DataType::Null) { // Not using nThreads here since it does not handle uninitialized value @@ -533,14 +527,14 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals( kir::ExpressionEvaluator& expr_eval) { FUSER_PERF_SCOPE("FusionExecutor::AllocGlobalVals"); GlobalBuffers global_buffers; - const auto kernel = lowered_.kernel(); - const auto& kernel_summary = lowered_.kernel()->summary(); + const auto kernel = lowered_->kernel(); + const auto& kernel_summary = lowered_->kernel()->summary(); for (auto alloc : kernel_summary.global_allocations) { TORCH_INTERNAL_ASSERT( - alloc->buffer()->isA(), + alloc->buffer()->isA(), "Cannot allocate global buffers that are not tensors."); - auto tv = alloc->buffer()->as(); - if (kernel->isOutput(tv)) { + auto tv = alloc->buffer()->as(); + if (tv->isFusionOutput()) { continue; } if (alloc->zeroInit()) { @@ -561,14 +555,14 @@ std::vector FusionExecutor::allocOutputs( kir::ExpressionEvaluator& expr_eval, const std::unordered_set& alias_indices) { FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs"); - const auto kernel = lowered_.kernel(); + const auto kernel = lowered_->kernel(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector outputs; for (const auto i : c10::irange(kernel->outputs().size())) { TORCH_INTERNAL_ASSERT( - kernel->outputs()[i]->isA(), + kernel->outputs()[i]->isA(), "Cannot allocate outputs that are not tensors."); - auto output = kernel->outputs()[i]->as(); + auto output = kernel->outputs()[i]->as(); if (alias_indices.count(i) == 0) { outputs.push_back( inferAndAllocOutput(output, expr_eval, options_, false)); @@ -581,7 +575,7 @@ std::vector FusionExecutor::allocOutputs( } void FusionExecutor::setUsedTVs() { - auto used_vals = fusion_.usedMathVals(); + auto used_vals = fusion_->usedMathVals(); auto used_tvs = ir_utils::filterByType(used_vals); used_tvs_.clear(); @@ -595,7 +589,7 @@ std::vector FusionExecutor::runFusion( const LaunchParams& launch_constraints, const c10::optional& opt_code) { FUSER_PERF_SCOPE("FusionExecutor::RunFusion"); - + TORCH_INTERNAL_ASSERT(compiled()); TORCH_INTERNAL_ASSERT( fusion_id_ > 0, "Cannot run fusion, it was not compiled."); TORCH_INTERNAL_ASSERT( @@ -607,11 +601,10 @@ std::vector FusionExecutor::runFusion( executor_entry = &executor_entry_lookup_[*opt_code]; } - FusionGuard fg(&fusion_); c10::DeviceGuard dg(options_.device); auto stream = at::cuda::getCurrentCUDAStream(); executor_utils::initializeCudaContext(); - + TORCH_INTERNAL_ASSERT(lowered_); LaunchParams launch_params; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) std::vector allocated_outputs = outputs; @@ -642,7 +635,7 @@ std::vector FusionExecutor::runFusion( } } else { TORCH_INTERNAL_ASSERT( - outputs.size() == fusion_.outputs().size(), + outputs.size() == fusion_->outputs().size(), __func__, " provided number of outputs does match fusion 
output"); } @@ -672,20 +665,35 @@ std::vector FusionExecutor::runFusion( // code path to take when either: // 1. no opt_code is provided or // 2. `executor_entry` is not initialized - executor_utils::validateKernelInputs(&fusion_, inputs, options_.device); + executor_utils::validateKernelInputs(fusion_, inputs, options_.device); if (!evaluator_precomputed_integers_) { evaluator_precomputed_integers_ = - std::make_unique(&fusion_, lowered_); + std::make_unique(lowered_->kernel()); } kir::ExpressionEvaluator expr_eval; - evaluator_precomputed_integers_->bindKernelInputs(inputs); + evaluator_precomputed_integers_->bindKernelInputs( + lowered_->kernel(), inputs); expr_eval.precomputedIntegers() = evaluator_precomputed_integers_.get(); launch_params = computeLaunchParams(launch_constraints, expr_eval, warp_size_); + // Recompile the kernel if the number of threads in the block has increased + if (launch_params.nThreads() > block_size_high_water_mark) { + const auto kernel = lowered_->kernel(); + const auto kernel_code = + codegen::generateCudaKernel(kernel, kernelName()); + const auto structured_code = getStructuredCode(kernel_code); + block_size_high_water_mark = launch_params.nThreads(); + compiled_kernel_ = executor_utils::nvrtcCompile( + structured_code, + (kernelNamespace() + "::" + kernelName()).c_str(), + fusion_id_, + block_size_high_water_mark); + } + if (kernel()->summary().has_cooperative_grid_reduction) { #ifndef __HIP_PLATFORM_HCC__ int num_blocks_per_SM = -1; @@ -716,16 +724,18 @@ std::vector FusionExecutor::runFusion( } executor_utils::validateVectorizedTensors( - &fusion_, inputs, outputs, lowered_, compileTimeDataCache(), expr_eval); - - auto& fusion = fusion_; + lowered_.get()->kernel(), + inputs, + outputs, + compileTimeDataCache(), + expr_eval); auto alias_indices_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::InputAliasIndices>( - compileTimeDataCache(), [&fusion]() { + compileTimeDataCache(), [&]() { return std::make_unique>>( - fusion.getInputAliasIndices()); + fusion_->getInputAliasIndices()); }); auto& alias_indices = alias_indices_entry.get(); @@ -736,9 +746,9 @@ std::vector FusionExecutor::runFusion( auto output_alias_indices_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::OutputAliasIndices>( - compileTimeDataCache(), [&fusion]() { + compileTimeDataCache(), [&]() { return std::make_unique>( - fusion.getOutputAliasIndices()); + fusion_->getOutputAliasIndices()); }); auto& output_alias_indices = output_alias_indices_entry.get(); @@ -753,7 +763,7 @@ std::vector FusionExecutor::runFusion( } else { // TODO: Update this as well; executor_utils::validateKernelOutputs( - &fusion_, allocated_outputs, options_.device); + fusion_, allocated_outputs, options_.device); } global_buffers = allocGlobalVals(expr_eval); @@ -802,7 +812,7 @@ std::vector FusionExecutor::runFusion( kernel_arguments.push(inputs); kernel_arguments.push(allocated_outputs); kernel_arguments.push(global_buffers.buffers); - if (lowered_.kernel()->summary().is_stochastic) { + if (lowered_->kernel()->summary().is_stochastic) { kernel_arguments.appendPhiloxRNGSeed(rand_offset); } } diff --git a/torch/csrc/jit/codegen/cuda/executor.h b/torch/csrc/jit/codegen/cuda/executor.h index 523f2aa0e4b..40accbfb520 100644 --- a/torch/csrc/jit/codegen/cuda/executor.h +++ b/torch/csrc/jit/codegen/cuda/executor.h @@ -35,9 +35,9 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { void compileFusion( Fusion* fusion, - CompileOptions options = 
CompileOptions(), const at::ArrayRef& inputs = {}, - const LaunchParams& launch_constraints = LaunchParams()); + const LaunchParams& launch_constraints = LaunchParams(), + CompileOptions options = CompileOptions()); std::vector runFusion( const at::ArrayRef& inputs, @@ -55,7 +55,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { // function to query whether a `FusionExecutor` has a compiled kernel to // execute bool compiled() const { - return fusion_id_ != -1; + return fusion_id_ != -1 && lowered_; }; void evictCache(size_t cache_id) { @@ -85,7 +85,8 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { executor_utils::caching::ExecutorCompileTimeInfoCache; kir::Kernel* kernel() const { - return lowered_.kernel(); + TORCH_INTERNAL_ASSERT(lowered_); + return lowered_->kernel(); } //! Internal knob used for debugging/profiling only @@ -178,8 +179,6 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { } private: - Fusion fusion_; - CompileOptions options_; size_t max_device_smem = std::numeric_limits().max(); int warp_size_ = 0; @@ -192,7 +191,13 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable { int fusion_id_ = -1; static int fusion_id_counter_; - GpuLower lowered_; + std::unique_ptr lowered_; + // Copy of lowered_->kernel() + Fusion* fusion_ = nullptr; + + // Track the block size this kernel was compiled with. If the block size + // increases, recompile to adjust maxregister count. + int64_t block_size_high_water_mark = 1; // lookup table to take short cut to retrieve recorded information in order to // launch kernels without re-inference parameters. diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp index 968570c1086..883fae207c5 100644 --- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.cpp @@ -1,4 +1,3 @@ -#include #include // Extract size and strides @@ -65,7 +64,7 @@ std::unique_ptr getTensorArg(int nDims) { false, "Tried to generate a tensor to run a generated kernel with ", nDims, - " dimensions, however it must be a size 0 to 8 dimensional tensor."); + " dimensions, however only 0 to 8 dimensional tensor are supported."); } return nullptr; } @@ -98,8 +97,6 @@ std::unique_ptr getTensorArg( } } -} // namespace - std::unique_ptr getTensorArg( c10::ScalarType dtype, int nDims, @@ -117,20 +114,73 @@ std::unique_ptr getTensorArg( return nullptr; } +} // namespace + // Push a tensor to the arguments void KernelArgumentHolder::push(const at::Tensor& tensor) { changed_ = true; - int nDims = tensor.ndimension(); + if (is_cpu_scalar(tensor)) { + switch (tensor.scalar_type()) { + case c10::ScalarType::Double: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Float: + arguments_.push_back( + std::make_unique>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Half: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::BFloat16: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Bool: + arguments_.push_back( + std::make_unique>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Long: + arguments_.push_back( + std::make_unique< + CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + case c10::ScalarType::Int: + arguments_.push_back( + std::make_unique< + 
CpuScalarTensorArg>>( + tensor.data_ptr()[0])); + break; + default: + TORCH_CHECK( + false, + "Dtype: ", + tensor.scalar_type(), + " not currently supported in code generated kernels."); + } + } else { + int nDims = tensor.ndimension(); - c10::ScalarType dtype = tensor.scalar_type(); - std::unique_ptr tensor_arg = - getTensorArg(dtype, nDims, index_mode_); - tensor_arg->setPointer(tensor.data_ptr()); - for (const auto i : c10::irange(nDims)) { - tensor_arg->setSize(i, tensor.sizes()[i]); - tensor_arg->setStride(i, tensor.strides()[i]); + c10::ScalarType dtype = tensor.scalar_type(); + std::unique_ptr tensor_arg = + getTensorArg(dtype, nDims, index_mode_); + tensor_arg->setPointer(tensor.data_ptr()); + for (const auto i : c10::irange(nDims)) { + tensor_arg->setSize(i, tensor.sizes()[i]); + tensor_arg->setStride(i, tensor.strides()[i]); + } + arguments_.push_back(std::move(tensor_arg)); } - arguments_.push_back(std::move(tensor_arg)); } // Push a scalar or integer to the arguments diff --git a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h index d306683c43d..d457a69adb2 100644 --- a/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h +++ b/torch/csrc/jit/codegen/cuda/executor_kernel_arg.h @@ -33,6 +33,7 @@ struct TensorArgCodegen { } }; +// 0-Dim GPU based tensor template struct TensorArgCodegen { T& operator[](nvfuser_index_t ind) { @@ -51,6 +52,17 @@ struct TensorArgCodegen { } }; +// Specialization for 0-dim case that's easy to pass in a CPU based tensor +// without memcpy +template +struct CpuScalarTensorCodegen { + T& operator[](int) { + return data; + }; + + T data; +}; + struct ArgAbstract { virtual ~ArgAbstract() = default; virtual void* arg() = 0; @@ -67,7 +79,7 @@ struct PhiloxCudaStateArg : public ArgAbstract { struct LongArg : public ArgAbstract { int64_t val_; - explicit LongArg(int64_t _val) : val_(_val){}; + explicit LongArg(int64_t _val) : val_(_val) {} // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) void* arg() { return &val_; @@ -76,7 +88,7 @@ struct LongArg : public ArgAbstract { struct DoubleArg : public ArgAbstract { double val_; - explicit DoubleArg(double _val) : val_(_val){}; + explicit DoubleArg(double _val) : val_(_val) {} // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) void* arg() { return &val_; @@ -85,7 +97,7 @@ struct DoubleArg : public ArgAbstract { struct BoolArg : public ArgAbstract { bool val_; - explicit BoolArg(bool _val) : val_(_val){}; + explicit BoolArg(bool _val) : val_(_val) {} // NOLINTNEXTLINE(modernize-use-override,cppcoreguidelines-explicit-virtual-functions) void* arg() { return &val_; @@ -119,9 +131,20 @@ struct TensorArg : public TensorArgAbstract { } }; -std::unique_ptr getTensorArg( - c10::ScalarType dtype, - int nDims); +template +struct CpuScalarTensorArg : public ArgAbstract { + CPU_TENSOR_TYPE instance_; + + CpuScalarTensorArg() = delete; + + explicit CpuScalarTensorArg(decltype(CPU_TENSOR_TYPE::data) _data) { + instance_.data = _data; + } + + void* arg() override { + return &instance_; + } +}; class KernelArgumentHolder { public: diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp index 13cdc29099e..5323036e5df 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -110,6 +109,16 @@ bool validateKernelArgTensor( return 
false; } + if (is_cpu_scalar(arg) && !param->as()->isCpuScalar()) { + msg << "Argument is CPU Scalar Tensor, but parameter is not.\n"; + return false; + } + + if (!is_cpu_scalar(arg) && !arg.is_cuda()) { + msg << "Argumnet is a CPU tensor which is not supported in fusions.\n"; + return false; + } + // Check the rank of the tensors. size_t arg_dim = arg.dim(); // Note: This requires current Fusion to be active. @@ -126,7 +135,7 @@ bool validateKernelArgTensor( return false; } - if (arg.device() != device) { + if (!is_cpu_scalar(arg) && arg.device() != device) { msg << "Argument is on device that is not compiled for." << "\n"; return false; @@ -339,6 +348,8 @@ void validateKernelOutputs( !mismatch, "Found one or more invalid arguments: ", msg.str()); } +namespace { + bool canVectorize(const IValue& aten_val, int word_size) { if (!aten_val.isTensor()) { return false; @@ -371,16 +382,18 @@ bool canVectorize(const IValue& aten_val, int word_size) { return true; } +// Returns true if a TV can be used with ParallelType::Vectorize. When +// input or output tensors are involved, the other version of +// canVectorize is used. bool canVectorize( - TensorView* fusion_tv, + TensorView* tv, int word_size, - GpuLower& lower, kir::ExpressionEvaluator& expr_eval) { IterDomain* last_root_dim = nullptr; - // TODO: Should this be rfactor instead of root?? - for (size_t i = fusion_tv->getRootDomain().size(); i > 0; i--) { - auto r_id = fusion_tv->getRootDomain()[i - 1]; - if (r_id->isReduction() || r_id->isBroadcast()) { + for (size_t i = tv->getRootDomain().size(); i > 0; i--) { + auto r_id = tv->getRootDomain()[i - 1]; + if (r_id->isReduction() || r_id->isTrivialReduction() || + r_id->isBroadcast()) { continue; } last_root_dim = r_id; @@ -391,8 +404,7 @@ bool canVectorize( return false; } - auto last_dim_size = - expr_eval.evaluate(lower.lowerValue(last_root_dim->extent())); + auto last_dim_size = expr_eval.evaluate(last_root_dim->extent()); if (!last_dim_size.has_value()) { return false; @@ -405,8 +417,6 @@ bool canVectorize( return true; } -namespace { - // Check if there's any split that is non-divisible and vectorized. If // found, Vectorize is illegal. void validateVectorizedSplits( @@ -418,12 +428,12 @@ void validateVectorizedSplits( TORCH_INTERNAL_ASSERT( input_extent.has_value(), "Could not check if a split with vectorization is divisible because the extent, ", - kir::toString(extent_factor.first), + extent_factor.first->toString(), ", is not possible to evaluate."); TORCH_INTERNAL_ASSERT( input_extent.has_value(), "Could not check if a split with vectorization is divisible because the split factor, ", - kir::toString(extent_factor.second), + extent_factor.second->toString(), ", is not possible to evaluate."); TORCH_INTERNAL_ASSERT( input_extent.value() % split_factor.value() == 0, @@ -435,16 +445,144 @@ void validateVectorizedSplits( } } +//! Returns the position information of vectorized input/output tensors +//! in the given fusion. 
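// Illustrative sketch of what the canVectorize() overloads above effectively
// require: (1) the innermost contiguous extent must be divisible by the vector
// word size, and (2) for a concrete at::Tensor, the data pointer (and strides)
// must be aligned to word_size elements. Function names here are made up; the
// real checks also walk the root domain and consult the expression evaluator.
#include <cstdint>

inline bool extentAllowsVectorizeSketch(
    int64_t innermost_extent,
    int word_size) {
  return word_size > 0 && innermost_extent % word_size == 0;
}

inline bool pointerAllowsVectorizeSketch(
    const void* data,
    int64_t element_size_bytes,
    int word_size) {
  const uint64_t vector_bytes = static_cast<uint64_t>(element_size_bytes) *
      static_cast<uint64_t>(word_size);
  // e.g. a float tensor vectorized by 4 (float4) needs 16-byte aligned data.
  return vector_bytes > 0 &&
      reinterpret_cast<uintptr_t>(data) % vector_bytes == 0;
}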
+std::unique_ptr getVectorizedTensorValidationInfo( + Fusion* fusion) { + auto vectorized_tensor_info_ptr = + std::make_unique(); + auto& tv_to_vector_word_size = + vectorized_tensor_info_ptr->tv_to_vector_word_size; + auto& global_inp_misaligned_tv = + vectorized_tensor_info_ptr->global_inp_misaligned_tv; + auto& global_out_misaligned_tv = + vectorized_tensor_info_ptr->global_out_misaligned_tv; + + kir::ExpressionEvaluator expr_eval; + + // Find all vectorized tensors and their word size + for (auto expr : fusion->exprs()) { + if (!expr->isA() || + expr->as()->getUnaryOpType() != UnaryOpType::Set) { + continue; + } + auto uop = expr->as(); + if (!uop->out()->isA() || !uop->in()->isA()) { + continue; + } + auto out_tv = uop->out()->as(); + auto in_tv = uop->in()->as(); + IterDomain* vector_dim = nullptr; + for (auto id : out_tv->domain()->domain()) { + if (id->getParallelType() == ParallelType::Vectorize || + id->getParallelType() == ParallelType::MisalignedVectorize) { + TORCH_INTERNAL_ASSERT( + vector_dim == nullptr, + "Found multiple vectorized dimensions on tensor ", + out_tv); + vector_dim = id; + } + } + if (vector_dim == nullptr) { + continue; + } + auto vector_word_size = expr_eval.evaluate(vector_dim->extent()); + TORCH_INTERNAL_ASSERT( + vector_word_size.has_value(), + "Non constant vector dimension found in ", + out_tv); + + // The expression here must be a UnaryOp::Set, so checking either of the + // input or output tensor should be sufficient. When the output is a + // fusion output, check the tensor as its size information is available + // without using the expression evaluator. + auto tv_to_verify = out_tv->isFusionOutput() ? out_tv : in_tv; + tv_to_vector_word_size[tv_to_verify] = vector_word_size.value(); + + if (vector_dim->getParallelType() == ParallelType::MisalignedVectorize) { + TORCH_INTERNAL_ASSERT( + in_tv->isFusionInput() || out_tv->isFusionOutput(), + "MisalignedVectorize is assumed to be used with either input or output tensor"); + if (out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Local) { + global_out_misaligned_tv.insert(out_tv); + } else if ( + in_tv->getMemoryType() == MemoryType::Global && + out_tv->getMemoryType() == MemoryType::Local) { + global_inp_misaligned_tv.insert(in_tv); + } else { + TORCH_INTERNAL_ASSERT( + false, + "Unsupported memory configuration for misaligned vectorization."); + } + } + } + + // Check striding information on input and outputs as well as size information + // of all + auto& inp_misaligned_tensors_pos = + vectorized_tensor_info_ptr->inp_misaligned_tensors_pos; + auto& out_misaligned_tensors_pos = + vectorized_tensor_info_ptr->out_misaligned_tensors_pos; + auto& inp_pos_to_word_size_map_to_verify = + vectorized_tensor_info_ptr->inp_pos_to_word_size_map_to_verify; + auto& out_pos_to_word_size_map_to_verify = + vectorized_tensor_info_ptr->out_pos_to_word_size_map_to_verify; + auto& intermediate_tv_to_word_size_map_to_verify = + vectorized_tensor_info_ptr->intermediate_tv_to_word_size_map_to_verify; + + for (auto entry : tv_to_vector_word_size) { + auto tv = entry.first; + auto word_size = entry.second; + if (tv->isFusionInput()) { + auto inp_it = + std::find(fusion->inputs().begin(), fusion->inputs().end(), tv); + TORCH_INTERNAL_ASSERT( + inp_it != fusion->inputs().end(), + "Could not find ", + tv, + " in fusion inputs."); + auto inp_pos = std::distance(fusion->inputs().begin(), inp_it); + + if (global_inp_misaligned_tv.find(tv) != global_inp_misaligned_tv.end()) { + 
inp_misaligned_tensors_pos.emplace_back(inp_pos); + } else { + // Shouldn't visit same pos twice here, assert ? + inp_pos_to_word_size_map_to_verify[inp_pos] = word_size; + } + } else if (tv->isFusionOutput()) { + auto out_it = + std::find(fusion->outputs().begin(), fusion->outputs().end(), tv); + TORCH_INTERNAL_ASSERT( + out_it != fusion->outputs().end(), + "Could not find ", + tv, + " in provided fusion outputs."); + auto out_pos = std::distance(fusion->outputs().begin(), out_it); + + if (global_out_misaligned_tv.find(tv) != global_out_misaligned_tv.end()) { + out_misaligned_tensors_pos.emplace_back(out_pos); + } else { + out_pos_to_word_size_map_to_verify[out_pos] = word_size; + } + } else { + // Intermediate tensors. Note that this must be Vectorize as + // MisalignedVectorize is only supported for inputs and outputs. + intermediate_tv_to_word_size_map_to_verify[tv] = word_size; + } + } + + return vectorized_tensor_info_ptr; +} } // namespace // Misaligned vectorization check. Currently misaligned vectorization is limited // to global-register and register-global load/store patterns. However, this // could be improved to include shared memory. void validateVectorizedTensors( - Fusion* fusion, + kir::Kernel* kernel, const at::ArrayRef& inputs, const std::vector& outputs, - GpuLower& lower, caching::ExecutorCompileTimeInfoCache* data_cache, kir::ExpressionEvaluator& expr_eval) { FUSER_PERF_SCOPE("FusionExecutor::validateVectorizedTensors"); @@ -452,9 +590,8 @@ void validateVectorizedTensors( auto tensor_vectorization_validation_entry = executor_utils::caching::ExecutorCompileTimeEntry< executor_utils::caching::VectorizedTensorValidation>( - data_cache, [fusion, &lower]() { - return executor_utils::getVectorizedTensorValidationInfo( - fusion, lower); + data_cache, [kernel]() { + return executor_utils::getVectorizedTensorValidationInfo(kernel); }); // Validate all the canVectorizes: @@ -463,7 +600,7 @@ void validateVectorizedTensors( TORCH_INTERNAL_ASSERT( canVectorize(inputs[it.first], it.second), "Error vectorizing, ", - fusion->inputs()[it.first], + kernel->inputs()[it.first], " as input provided does not allowed vectorization by word size, ", it.second); } @@ -474,12 +611,24 @@ void validateVectorizedTensors( TORCH_INTERNAL_ASSERT( canVectorize(outputs[it.first], it.second), "Error vectorizing, ", - fusion->outputs()[it.first], + kernel->outputs()[it.first], " as output provided does not allowed vectorization by word size, ", it.second); } } + for (auto it : tensor_vectorization_validation_entry.get() + .intermediate_tv_to_word_size_map_to_verify) { + auto tv = it.first; + auto vec_width = it.second; + TORCH_INTERNAL_ASSERT( + canVectorize(tv, vec_width, expr_eval), + "Error vectorizing, ", + tv->toString(), + " as the extent of the vectorized axis does not allowed vectorization by word size, ", + vec_width); + } + std::vector inp_misaligned_tensors; std::vector out_misaligned_tensors; @@ -511,7 +660,7 @@ void validateVectorizedTensors( out_misaligned_tensors), "All global tensors must have the same stride for misaligned vectorization."); - validateVectorizedSplits(lower.kernel(), expr_eval); + validateVectorizedSplits(kernel, expr_eval); } kir::ExpressionEvaluator bindKernelInputs( @@ -530,7 +679,7 @@ kir::ExpressionEvaluator bindKernelInputs( for (const auto i : c10::irange(inputs.size())) { const auto input = inputs[i]; - if (auto tensor_input = dynamic_cast(input)) { + if (auto tensor_input = dynamic_cast(input)) { TORCH_INTERNAL_ASSERT( aten_inputs[i].isTensor(), "Something went 
wrong configuring launch. Inputs no longer match at index:", @@ -538,7 +687,7 @@ kir::ExpressionEvaluator bindKernelInputs( const auto aten_tensor = aten_inputs[i].toTensor(); const auto root_domain = - kir::TensorDomain::noReductions(tensor_input->domain()->rootDomain()); + TensorDomain::noReductions(tensor_input->domain()->getRootDomain()); TORCH_INTERNAL_ASSERT( aten_tensor.ndimension() == static_cast(root_domain.size()), "Something went wrong configuring launch. Inputs no longer match."); @@ -553,7 +702,7 @@ kir::ExpressionEvaluator bindKernelInputs( TORCH_CHECK( *prev_value == value, "Attempting to bind ", - kir::toString(extent), + extent->toString(), " to ", value, "but it's already set to ", @@ -561,7 +710,7 @@ kir::ExpressionEvaluator bindKernelInputs( should_bind = false; } } - if (should_bind && !extent->isConst()) { + if (should_bind && !extent->isConstScalar()) { expr_eval.bind(extent, value); } } @@ -697,24 +846,19 @@ NvrtcFunction nvrtcCompile( "--std=c++14", compute.c_str(), "-default-device"}; #endif - const char* disable_fastmath = getenv("PYTORCH_NVFUSER_DISABLE_FASTMATH"); - if (!disable_fastmath || (atoi(disable_fastmath) == 0)) { - args.push_back("--use_fast_math"); - } else { - TORCH_WARN_ONCE( - "fast math disabled in nvfuser, try set `PYTORCH_NVFUSER_DISABLE_FASTMATH=0`"); - } - const char* disable_fma = getenv("PYTORCH_NVFUSER_DISABLE_FMA"); - // int disable_fma_flag = disable_fma ? atoi(disable_fma) : 0; - if (disable_fma && atoi(disable_fma)) { #ifdef __HIP_PLATFORM_HCC__ + if (disable_fma && atoi(disable_fma)) { TORCH_WARN_ONCE( "PYTORCH_CUDA_FUSER_DISABLE_FMA is not supported on ROCm, ignoring"); -#else - args.push_back("--fmad=false"); -#endif } +#else + if (disable_fma && atoi(disable_fma)) { + args.push_back("--fmad=false"); + } else { + args.push_back("--fmad=true"); + } +#endif #ifndef NDEBUG // Add line info to generated kernels @@ -1037,7 +1181,7 @@ template class ExecutorCompileTimeEntry; } // namespace caching std::vector getParallelBindingsIterDomains( - GpuLower& lower, + GpuLower* lower, const std::vector& used_tvs) { std::vector parallel_ids; for (auto tv : used_tvs) { @@ -1047,7 +1191,7 @@ std::vector getParallelBindingsIterDomains( // Want to keep the broadcast dimensions if they are not resolved // TODO: piping down the parallel dimension map here would // be helpful - auto& parallel_map = lower.caParallelMap(); + auto& parallel_map = lower->caParallelMap(); if (parallel_map.getConcreteMappedID(id) == id) { parallel_ids.push_back(id); } @@ -1062,39 +1206,41 @@ std::vector getParallelBindingsIterDomains( return parallel_ids; } +namespace { + void insertParallelExtent( - GpuLower& lower, IterDomain* binding_id, const std::unique_ptr& parallel_iter_extents_ptr) { - auto kir_extent = lower.lowerValue(binding_id->extent()); + auto extent = binding_id->extent(); const auto it = parallel_iter_extents_ptr->find(binding_id->getParallelType()); if (it != parallel_iter_extents_ptr->end()) { - it->second.push_back(kir_extent); + it->second.push_back(extent); } else { parallel_iter_extents_ptr->operator[](binding_id->getParallelType()) = { - kir_extent}; + extent}; } } +} // namespace + std::unique_ptr getParallelIterExtents( - GpuLower& lower, std::vector& parallel_binding_ids) { auto parallel_iter_extents_ptr = std::make_unique(); for (auto id : parallel_binding_ids) { - insertParallelExtent(lower, id, parallel_iter_extents_ptr); + insertParallelExtent(id, parallel_iter_extents_ptr); } return parallel_iter_extents_ptr; } std::unique_ptr 
getSimplifiedParallelIterExtents( - GpuLower& lower, + GpuLower* lower, std::vector& parallel_binding_ids) { auto parallel_iter_extents_ptr = std::make_unique(); - auto& parallel_map = lower.caParallelMap(); + auto& parallel_map = lower->caParallelMap(); std::vector mapped; - bool is_tidx_warp_padded = lower.getWarpPaddedParallelInfo().is_tidx_padded; + bool is_tidx_warp_padded = lower->getWarpPaddedParallelInfo().is_tidx_padded; for (auto id : parallel_binding_ids) { if (std::any_of( @@ -1109,7 +1255,7 @@ std::unique_ptr getSimplifiedParallelIterExtents( } insertParallelExtent( - lower, parallel_map.getConcreteMappedID(id), parallel_iter_extents_ptr); + parallel_map.getConcreteMappedID(id), parallel_iter_extents_ptr); mapped.push_back(id); } @@ -1117,7 +1263,7 @@ std::unique_ptr getSimplifiedParallelIterExtents( } std::unique_ptr getWarpPaddedExtentsInfo( - GpuLower& lower, + kir::Kernel* kernel, std::vector& parallel_binding_ids) { auto warp_padded_extent_info_ptr = std::make_unique(); @@ -1125,7 +1271,6 @@ std::unique_ptr getWarpPaddedExtentsInfo( warp_padded_extent_info_ptr->warp_padded_extent_set; auto& warp_padded_constant = warp_padded_extent_info_ptr->warp_padded_constant; - auto kernel = lower.kernel(); bool has_warp_reduction = kernel->getWarpPaddedParallelInfo().has_warp_reduction; @@ -1135,11 +1280,11 @@ std::unique_ptr getWarpPaddedExtentsInfo( if (has_warp_reduction) { if (id->hasPaddingToMultipleOfWarp() || kernel->isParallelTypePadded(id->getParallelType())) { - auto kir_extent = lower.lowerValue(id->extent()); - warp_padded_extent_set.insert(kir_extent); + auto extent = id->extent(); + warp_padded_extent_set.insert(extent); auto padded_value = id->getMaybeSizeAfterPadding(); if (padded_value.has_value()) { - warp_padded_constant[kir_extent] = padded_value.value(); + warp_padded_constant[extent] = padded_value.value(); } } } @@ -1147,122 +1292,6 @@ std::unique_ptr getWarpPaddedExtentsInfo( return warp_padded_extent_info_ptr; } -std::unique_ptr getVectorizedTensorValidationInfo( - Fusion* fusion, - GpuLower& lower) { - auto vectorized_tensor_info_ptr = - std::make_unique(); - auto& tv_to_vector_word_size = - vectorized_tensor_info_ptr->tv_to_vector_word_size; - auto& global_inp_misaligned_tv = - vectorized_tensor_info_ptr->global_inp_misaligned_tv; - auto& global_out_misaligned_tv = - vectorized_tensor_info_ptr->global_out_misaligned_tv; - - kir::ExpressionEvaluator expr_eval; - - // Find all vectorized tensors and their word size - for (auto expr : fusion->exprs()) { - if (!expr->isA() || - expr->as()->getUnaryOpType() != UnaryOpType::Set) { - continue; - } - auto uop = expr->as(); - if (!uop->out()->isA() || !uop->in()->isA()) { - continue; - } - auto out_tv = uop->out()->as(); - auto in_tv = uop->in()->as(); - IterDomain* vector_dim = nullptr; - for (auto id : out_tv->domain()->domain()) { - if (id->getParallelType() == ParallelType::Vectorize || - id->getParallelType() == ParallelType::MisalignedVectorize) { - TORCH_INTERNAL_ASSERT( - vector_dim == nullptr, - "Found multiple vectorized dimensions on tensor ", - out_tv); - vector_dim = id; - } - } - if (vector_dim == nullptr) { - continue; - } - auto vector_word_size = - expr_eval.evaluate(lower.lowerValue(vector_dim->extent())); - TORCH_INTERNAL_ASSERT( - vector_word_size.has_value(), - "Non constant vector dimension found in ", - out_tv); - tv_to_vector_word_size[out_tv] = vector_word_size.value(); - tv_to_vector_word_size[in_tv] = vector_word_size.value(); - - if (vector_dim->getParallelType() == 
ParallelType::MisalignedVectorize) { - if (out_tv->getMemoryType() == MemoryType::Global && - in_tv->getMemoryType() == MemoryType::Local) { - global_out_misaligned_tv.insert(out_tv); - } else if ( - in_tv->getMemoryType() == MemoryType::Global && - out_tv->getMemoryType() == MemoryType::Local) { - global_inp_misaligned_tv.insert(in_tv); - } else { - TORCH_INTERNAL_ASSERT( - false, - "Unsupported memory configuration for misaligned vectorization."); - } - } - } - - // Check striding information on input and outputs as well as size information - // of all - auto& inp_misaligned_tensors_pos = - vectorized_tensor_info_ptr->inp_misaligned_tensors_pos; - auto& out_misaligned_tensors_pos = - vectorized_tensor_info_ptr->out_misaligned_tensors_pos; - auto& inp_pos_to_word_size_map_to_verify = - vectorized_tensor_info_ptr->inp_pos_to_word_size_map_to_verify; - auto& out_pos_to_word_size_map_to_verify = - vectorized_tensor_info_ptr->out_pos_to_word_size_map_to_verify; - - for (auto entry : tv_to_vector_word_size) { - auto tv = entry.first; - auto word_size = entry.second; - if (tv->isFusionInput()) { - auto inp_it = - std::find(fusion->inputs().begin(), fusion->inputs().end(), tv); - TORCH_INTERNAL_ASSERT( - inp_it != fusion->inputs().end(), - "Could not find ", - tv, - " in fusion inputs."); - auto inp_pos = std::distance(fusion->inputs().begin(), inp_it); - - if (global_inp_misaligned_tv.find(tv) != global_inp_misaligned_tv.end()) { - inp_misaligned_tensors_pos.emplace_back(inp_pos); - } else { - // Shouldn't visit same pos twice here, assert ? - inp_pos_to_word_size_map_to_verify[inp_pos] = word_size; - } - } else if (tv->isFusionOutput()) { - auto out_it = - std::find(fusion->outputs().begin(), fusion->outputs().end(), tv); - TORCH_INTERNAL_ASSERT( - out_it != fusion->outputs().end(), - "Could not find ", - tv, - " in provided fusion outputs."); - auto out_pos = std::distance(fusion->outputs().begin(), out_it); - - if (global_out_misaligned_tv.find(tv) != global_out_misaligned_tv.end()) { - out_misaligned_tensors_pos.emplace_back(out_pos); - } else { - out_pos_to_word_size_map_to_verify[out_pos] = word_size; - } - } - } - - return vectorized_tensor_info_ptr; -} - } // namespace executor_utils } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.h b/torch/csrc/jit/codegen/cuda/executor_utils.h index d851be48991..93deec6343f 100644 --- a/torch/csrc/jit/codegen/cuda/executor_utils.h +++ b/torch/csrc/jit/codegen/cuda/executor_utils.h @@ -28,28 +28,16 @@ namespace executor_utils { // Include all the functions we might need in generated code std::string kernelPreamble(); -// TODO(kir): rewrite in terms of Kernel inputs void validateKernelInputs( Fusion* fusion, const at::ArrayRef& inputs, const c10::Device& device); -// TODO(kir): rewrite in terms of Kernel outputs void validateKernelOutputs( Fusion* fusion, const std::vector& outputs, const c10::Device& device); -// Returns if vectorizing the aten value by word size is possible -bool canVectorize(const IValue& aten_val, int word_size); - -// Returns if vectorizing the aten value by word size is possible -bool canVectorize( - TensorView* fusion_tv, - int word_size, - GpuLower& lower, - kir::ExpressionEvaluator& expr_eval); - //! 
Bind kernel input values to runtime values kir::ExpressionEvaluator bindKernelInputs( const at::ArrayRef& aten_inputs, @@ -112,7 +100,7 @@ class ParallelBindingIterDomains { class ParallelIterExtentMap { public: using DataType = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; static const CompileTimeEntryType EntryType = CompileTimeEntryType::PARALLEL_ITER_EXTENT_MAP; }; @@ -133,7 +121,7 @@ class ParallelIterExtentMap { class SimplifiedParallelIterExtentMap { public: using DataType = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; static const CompileTimeEntryType EntryType = CompileTimeEntryType::SIMPLIFIED_PARALLEL_ITER_EXTENT_MAP; }; @@ -141,8 +129,8 @@ class SimplifiedParallelIterExtentMap { //! WarpPaddedExtentsInfo: //! Auxiliary data type for entry class WarpPaddedParallelExtents struct WarpPaddedExtentsInfo { - std::unordered_set warp_padded_extent_set; - std::unordered_map warp_padded_constant; + std::unordered_set warp_padded_extent_set; + std::unordered_map warp_padded_constant; }; //! Compile-time info to be cached in each FusionExecutor: @@ -166,6 +154,8 @@ struct VectorizedTensorInfo { std::vector out_misaligned_tensors_pos; std::unordered_map inp_pos_to_word_size_map_to_verify; std::unordered_map out_pos_to_word_size_map_to_verify; + std::unordered_map + intermediate_tv_to_word_size_map_to_verify; }; //! Compile-time info to be cached in each FusionExecutor: @@ -284,42 +274,33 @@ class ExecutorCompileTimeEntry { //! Returns the vector of tensorviews that will be used to bind parallel //! dimensions. std::vector getParallelBindingsIterDomains( - GpuLower& lower, + GpuLower* lower, const std::vector& used_tvs); using ParallelExtentMap = - std::unordered_map, TypeHash>; + std::unordered_map, TypeHash>; //! Returns the extents of all parallel binding iterdomains corresponding //! to each parallel type. std::unique_ptr getParallelIterExtents( - GpuLower& lower, std::vector& parallel_binding_ids); //! Returns the simplified set of extents necessary for launch parameter //! binding. std::unique_ptr getSimplifiedParallelIterExtents( - GpuLower& lower, + GpuLower* lower, std::vector& parallel_binding_ids); //! Returns the symbolic or constant extetns of warp padded parallel //! iterdomains in the given vector. std::unique_ptr getWarpPaddedExtentsInfo( - GpuLower& lower, + kir::Kernel* lower, std::vector& parallel_binding_ids); -//! Returns the position information of vectorized input/output tensors -//! in the given fusion. 
-std::unique_ptr getVectorizedTensorValidationInfo( - Fusion* fusion, - GpuLower& lower); - -// TODO(kir): rewrite in terms of Kernel tensors void validateVectorizedTensors( - Fusion* fusion, + kir::Kernel* kernel, const at::ArrayRef& inputs, const std::vector& outputs, - GpuLower& lower, caching::ExecutorCompileTimeInfoCache* data_cache, kir::ExpressionEvaluator& expr_eval); diff --git a/torch/csrc/jit/codegen/cuda/expr_evaluator.h b/torch/csrc/jit/codegen/cuda/expr_evaluator.h index ced4b59a783..5630743b6f6 100644 --- a/torch/csrc/jit/codegen/cuda/expr_evaluator.h +++ b/torch/csrc/jit/codegen/cuda/expr_evaluator.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/fusion.cpp b/torch/csrc/jit/codegen/cuda/fusion.cpp index d9d71e53c41..be686c0d943 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion.cpp @@ -8,10 +8,9 @@ #include #include #include +#include #include -#include - namespace torch { namespace jit { namespace fuser { @@ -37,13 +36,7 @@ void swap(Fusion& a, Fusion& b) noexcept { using std::swap; - // Swap the content - swap(a.val_set_, b.val_set_); - swap(a.expr_set_, b.expr_set_); - swap(a.val_deque_, b.val_deque_); - - swap(a.val_type_name_map_, b.val_type_name_map_); - swap(a.expr_name_counter_, b.expr_name_counter_); + swap(static_cast(a), static_cast(b)); swap(a.inputs_, b.inputs_); swap(a.outputs_, b.outputs_); @@ -51,27 +44,6 @@ void swap(Fusion& a, Fusion& b) noexcept { swap(a.io_alias_, b.io_alias_); swap(a.permuted_input_map_, b.permuted_input_map_); swap(a.permuted_output_map_, b.permuted_output_map_); - - // Fixup the Statement::fusion_ links for a - for (auto val : a.val_set_) { - val->fusion_ = &a; - } - for (auto expr : a.expr_set_) { - expr->fusion_ = &a; - } - - // Fixup the Statement::fusion_ links for b - for (auto val : b.val_set_) { - val->fusion_ = &b; - } - for (auto expr : b.expr_set_) { - expr->fusion_ = &b; - } -} - -Fusion::Fusion(const Fusion& other) { - FUSER_PERF_SCOPE("Fusion copy"); - Fusion::copy(&other, this); } std::unique_ptr Fusion::segment( @@ -82,28 +54,13 @@ std::unique_ptr Fusion::segment( IrCloner Fusion::copy(const Fusion* from, Fusion* to) { to->clear(); - IrCloner ir_cloner(to); + auto ir_cloner = IrContainer::copy(from, to); - for (auto val : from->val_set_) { - to->val_set_.insert(ir_cloner.clone(val)); - } - - for (auto expr : from->expr_set_) { - to->expr_set_.insert(ir_cloner.clone(expr)); - } - - for (auto val : from->val_deque_) { - to->val_deque_.push_back(ir_cloner.clone(val)); - } - - for (auto val : from->val_set_) { + for (auto val : from->vals_) { ir_cloner.clone(val)->setDefinition(ir_cloner.clone(val->definition_)); ir_cloner.clone(val)->setUses(ir_cloner.clone(val->uses_)); } - to->val_type_name_map_ = from->val_type_name_map_; - to->expr_name_counter_ = from->expr_name_counter_; - to->inputs_ = ir_cloner.clone(from->inputs_); to->outputs_ = ir_cloner.clone(from->outputs_); @@ -117,9 +74,22 @@ IrCloner Fusion::copy(const Fusion* from, Fusion* to) { to->permuted_input_map_ = from->permuted_input_map_; to->permuted_output_map_ = from->permuted_output_map_; + to->all_tv_uses_valid_ = from->all_tv_uses_valid_; + // This should never be true on copy, but copying for completeness. + to->is_during_update_uses_ = from->is_during_update_uses_; + return ir_cloner; } +// Clang tidy complains when using default constructor for IrContainer instead +// of copy constructor. 
Fusion::copy has a call to IrContainer::copy, so it's +// redundant to use the IrContainer copy constructor, but it is harmless since +// Fusion::copy starts by calling clear(). +Fusion::Fusion(const Fusion& other) : IrContainer(other) { + FUSER_PERF_SCOPE("Fusion copy"); + Fusion::copy(&other, this); +} + Fusion::Fusion(Fusion&& other) noexcept { FUSER_PERF_SCOPE("Fusion move"); swap(*this, other); @@ -147,36 +117,22 @@ Fusion::~Fusion() { void Fusion::clear() noexcept { FUSER_PERF_SCOPE("Fusion clear"); - // Free the owned values - for (auto ptr : val_set_) { - delete ptr; - } - - // Free the owned expressions - for (auto ptr : expr_set_) { - delete ptr; - } - - val_set_.clear(); - val_deque_.clear(); - expr_set_.clear(); - - for (auto& kv : val_type_name_map_) { - kv.second = 0; - } - - expr_name_counter_ = 0; + IrContainer::clear(); inputs_.clear(); outputs_.clear(); io_alias_.clear(); + permuted_input_map_.clear(); permuted_output_map_.clear(); + + all_tv_uses_valid_ = false; + is_during_update_uses_ = false; } void Fusion::removeExpr(Expr* expr) { - assertInFusion(expr, "Cannot remove expr "); + assertInContainer(expr, "Cannot remove expr "); // If we hit this error too frequently, we could lighten the restrictions so // that removing something that doesn't exist simply does nothing. For now, // we're going with the strictest model which errors. @@ -194,13 +150,11 @@ void Fusion::removeExpr(Expr* expr) { } } - expr_set_.erase(expr); - - delete expr; + IrContainer::removeExpr(expr); } void Fusion::removeVal(Val* val) { - assertInFusion(val, "Cannot remove val "); + assertInContainer(val, "Cannot remove val "); TORCH_CHECK( !val->isFusionInput(), @@ -213,22 +167,14 @@ void Fusion::removeVal(Val* val) { if (orig != nullptr) removeExpr(val->definition()); - for (Expr* use : unordered_uses(val)) + for (Expr* use : unordered_uses(val)) { removeExpr(use); - - val_set_.erase(val); - - for (auto it = val_deque_.begin(); it != val_deque_.end(); it++) - if (*it == val) { - val_deque_.erase(it); - break; - } - - delete val; + } + IrContainer::removeVal(val); } void Fusion::addInput(Val* input) { - assertInFusion(input, "Cannot register input "); + assertInContainer(input, "Cannot register input "); if (input->getValType().value() == ValType::TensorView) { auto tv = input->as(); @@ -242,7 +188,7 @@ void Fusion::addInput(Val* input) { } void Fusion::addOutput(Val* output) { - assertInFusion(output, "Cannot register output "); + assertInContainer(output, "Cannot register output "); if (output->getValType().value() == ValType::TensorView) { auto tv = output->as(); tv->setMemoryType(MemoryType::Global); @@ -307,27 +253,8 @@ void Fusion::replaceOutput(Val* output, Val* replacement) { } } -bool Fusion::inFusion(const Statement* stmt) const { - bool in_fusion = stmt->fusion() == this; - Statement* nonconst_stmt = const_cast(stmt); // NOLINT - - if (stmt->isExpr()) { - in_fusion &= expr_set_.find(nonconst_stmt->as()) != expr_set_.end(); - } - if (stmt->isVal()) { - in_fusion &= val_set_.find(nonconst_stmt->as()) != val_set_.end(); - } - - return in_fusion; -} - -void Fusion::assertInFusion(const Statement* stmt, const std::string& msg) - const { - TORCH_CHECK(inFusion(stmt), msg, " it was not found in the active fusion."); -} - std::vector Fusion::exprs() { - return ExprSort::getExprs(this); + return StmtSort::getExprs(this); } std::vector Fusion::inputsOf(Val* val) { @@ -341,12 +268,24 @@ void Fusion::validateInputs() { all_inputs.insert(input); } } + + std::unordered_set input_dims; + auto inp_tvs = 
ir_utils::filterByType(inputs()); + for (auto tv : inp_tvs) { + for (auto id : tv->getMaybeRFactorDomain()) { + input_dims.emplace(id->extent()); + } + } for (Val* input : all_inputs) { if (!input->isConstScalar()) { TORCH_CHECK( - hasInput(input) || inFusion(input), + input->isFusionInput() || + // TODO: Switch: + inContainer(input), + // to: input_dims.find(input) != input_dims.end(), + // https://github.com/csarofeen/pytorch/issues/1365 "Could not figure out how ", - input, + input->toString(), " is generated, however it was not specified as an input."); } } @@ -367,6 +306,10 @@ void Fusion::print() { void Fusion::printKernel() { FUSER_PERF_SCOPE("Fusion::printKernel"); + TORCH_INTERNAL_ASSERT( + !this->isA(), + "Cannot \"print kernel\" of a kernel container. ", + "This would require lowering during lowering."); std::cout << codegen::generateCudaKernel(GpuLower(this).kernel()); } @@ -394,7 +337,7 @@ void Fusion::printMath(bool from_outputs_only) { leaf_vals.push_back(val); } } - exprs_for_print = ExprSort::getExprs(this, leaf_vals); + exprs_for_print = StmtSort::getExprs(this, leaf_vals); } std::cout << "\n%kernel_math {\n"; @@ -412,33 +355,36 @@ void Fusion::printTransforms() { t_exprs.handle(this); } -StmtNameType Fusion::registerVal(Val* val) { - if (val->fusion()) { - if (val->fusion() != this) { - TORCH_CHECK(false, val, " was not found in the active fusion."); - } - if (inFusion(val)) { - return val->name(); - } +void Fusion::registerVal(Val* val) { + if (inContainer(val)) { + return; } - val_set_.emplace(val); - val_deque_.push_back(val); - return getValName(*(val->getValType())); + if (val->fusion()) { + TORCH_CHECK( + val->fusion() == this, val, " was not found in the active fusion."); + } + + IrContainer::registerVal(val); } -StmtNameType Fusion::registerExpr(Expr* expr) { - if (expr->fusion()) { - if (expr->fusion() != this) { - TORCH_CHECK(false, expr, " was not found in the active fusion."); - } - if (inFusion(expr)) { - return expr->name(); - } +void Fusion::registerExpr(Expr* expr) { + if (inContainer(expr)) { + return; } + if (expr->fusion()) { + TORCH_CHECK( + expr->fusion() == this, expr, " was not found in the active fusion."); + } + + IrContainer::registerExpr(expr); + + bool has_tv = false; + for (Val* input : expr->inputs()) { - assertInFusion(input, "Input to expr is invalid, "); + has_tv = has_tv || input->isA(); + assertInContainer(input, "Input to expr is invalid, "); auto uses_copy = input->uses(); if (std::find(uses_copy.begin(), uses_copy.end(), expr) == uses_copy.end()) { @@ -447,34 +393,25 @@ StmtNameType Fusion::registerExpr(Expr* expr) { } } + // Kernel is the only container type that is non-ssa. This is mainly (maybe + // only) because of initialization expressions which would overwrite tensor + // view definitions. 
+ bool is_ssa = !this->isA(); + for (Val* output : expr->outputs()) { - assertInFusion(output, "Output to expr is invalid, "); - if (output->definition() != nullptr) { + has_tv = has_tv || output->isA(); + assertInContainer(output, "Output to expr is invalid, "); + if (output->definition() != nullptr && is_ssa) { removeExpr(output->definition()); } - output->setDefinition(expr); + if (is_ssa || (!is_ssa && output->definition() == nullptr)) { + output->setDefinition(expr); + } } - expr_set_.emplace(expr); - - resetTvUses(); - return getExprName(); -} - -StmtNameType Fusion::registerStatement(Statement* stmt) { - if (inFusion(stmt)) - return stmt->name(); - - if (stmt->isVal()) { - return registerVal(stmt->as()); - } else if (stmt->isExpr()) { - return registerExpr(stmt->as()); + if (has_tv) { + resetTvUses(); } - - TORCH_INTERNAL_ASSERT( - false, - "Could not register statement as Fusion could not recognize its type."); - return kInvalidStmName; } void Fusion::resetTvUses() { @@ -484,8 +421,8 @@ void Fusion::resetTvUses() { // getExprs only uses definition, so even if we've modified uses already to // remove dead exprs, this could reinsert them. getExprs is also boundeds by // inputs as registered inputs will return nullptr as their definition. - const auto all_tvs = ir_utils::filterByType(val_set_); - const auto used_exprs = ExprSort::getExprs(this); + const auto all_tvs = ir_utils::filterByType(vals_); + const auto used_exprs = StmtSort::getExprs(this); for (auto tv : all_tvs) { tv->setUses({}); @@ -507,14 +444,6 @@ void Fusion::resetTvUses() { is_during_update_uses_ = false; } -const std::unordered_set& Fusion::vals() const noexcept { - return val_set_; -} - -const std::deque& Fusion::deterministic_vals() const noexcept { - return val_deque_; -} - std::vector Fusion::usedMathVals() { // Note that using fusion->inputs() as the argument for the first // parameter of getAllValsBetween does not grab all used vals as @@ -553,37 +482,15 @@ std::vector Fusion::usedMathVals() { return used_math_vals; } -const std::unordered_set& Fusion::unordered_exprs() const noexcept { - return expr_set_; -} - std::unordered_set Fusion::unordered_uses(Val* val) const { return std::unordered_set(val->uses().begin(), val->uses().end()); } Expr* Fusion::definition(const Val* val) const { - assertInFusion(val, "Cannot detect the definition of val, "); + assertInContainer(val, "Cannot detect the definition of val, "); return val->definition(); } -bool Fusion::hasInput(const Val* val) const { - assertInFusion(val, "Cannot check if val is an input, "); - return val->isFusionInput(); -} - -bool Fusion::hasOutput(const Val* val) const { - assertInFusion(val, "Cannot check if val is an output, "); - return val->isFusionOutput(); -} - -StmtNameType Fusion::getValName(ValType vtype) { - return val_type_name_map_[vtype]++; -} - -StmtNameType Fusion::getExprName() { - return expr_name_counter_++; -} - // Indicate to kernel to set itself up to generate random numbers bool Fusion::isStochastic() { for (auto expr : exprs()) @@ -593,28 +500,6 @@ bool Fusion::isStochastic() { return false; } -bool Fusion::hasReduction() { - FUSER_PERF_SCOPE("Fusion::hasReduction"); - - for (auto expr : exprs()) - for (auto out : expr->outputs()) - if (out->getValType() == ValType::TensorView) - if (out->as()->hasReduction()) - return true; - - return false; -} - -bool Fusion::hasWelford() { - FUSER_PERF_SCOPE("Fusion::hasWelford"); - for (auto expr : exprs()) { - if (expr->isA()) { - return true; - } - } - return false; -} - std::vector 
Fusion::getTerminatingOutputs() { FUSER_PERF_SCOPE("getTerminatingOutputs"); diff --git a/torch/csrc/jit/codegen/cuda/fusion.h b/torch/csrc/jit/codegen/cuda/fusion.h index c892bd8171c..2e76e00896b 100644 --- a/torch/csrc/jit/codegen/cuda/fusion.h +++ b/torch/csrc/jit/codegen/cuda/fusion.h @@ -1,10 +1,11 @@ #pragma once #include +#include #include -#include #include +#include #include #include @@ -69,14 +70,14 @@ class TORCH_CUDA_CU_API FusionGuard { //! Fusion is mutable but unique. Nodes cannot be copied in any way from one //! Fusion to another. If anything like that is desired, it would require -//! duplicating all associated values and exprs. Fusion is considered to SSA, +//! duplicating all associated values and exprs. Fusion is considered to be SSA, //! though this could also change in the future if there is a good reason to do //! so. //! //! The Fusion owns the whole IR graph (Vals and Exprs) //! // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API Fusion final { +class TORCH_CUDA_CU_API Fusion : public IrContainer { typedef std::unordered_map> PermutationMap; public: @@ -96,45 +97,30 @@ class TORCH_CUDA_CU_API Fusion final { //! Break dependency chains associated with Expr, remove references to expr //! delete expr - void removeExpr(Expr* expr); + void removeExpr(Expr* expr) override; //! Completely remove val from the fusion, break all dependencies associated //! with it - void removeVal(Val* val); + void removeVal(Val* val) override; //! Register input as an input of the fusion - // TODO: Rename to register void addInput(Val* input); //! Register output as an output of the fusion - // TODO: Rename to register void addOutput(Val* output); //! Register output as an output of the fusion - // TODO: Rename to register void addOutput(WelfordResult& output); //! Deregister input as an input of the fusion - // TODO: Rename to register void removeInput(Val* input); //! Deregister output as an output of the fusion - // TODO: Rename to register void removeOutput(Val* output); //! Replace output with another value void replaceOutput(Val* output, Val* replacement); - //! Clear Expr's from TV uses that are not required to produce outputs from - //! inputs - void resetTvUses(); - - //! Check if stmt is properly registered with this fusion - bool inFusion(const Statement* stmt) const; - - //! Throw an error if stmt is not in this fusion - void assertInFusion(const Statement* stmt, const std::string& msg = "") const; - //! Assert that all leaves found from outputs are registered as an input void validateInputs(); @@ -151,17 +137,6 @@ class TORCH_CUDA_CU_API Fusion final { //! Lower the fusion and print a kernel void printKernel(); - //! Register the Val with this fusion - StmtNameType registerVal(Val* val); - - //! Register expr with this fusion. - //! When we register an expression, we want to update the dependency tracking - //! of Vals. We add expr to our general expr_set_, - StmtNameType registerExpr(Expr* expr); - - //! Register stmt with this fusion - StmtNameType registerStatement(Statement* stmt); - //! Return a list of topologically sorted expressions. This only includes //! exprs required to genereate registered outputs. std::vector exprs(); @@ -169,12 +144,6 @@ class TORCH_CUDA_CU_API Fusion final { //! Return a vector of fusion inputs that feed this Val std::vector inputsOf(Val* val); - //! Return the set of Vals registered with this fusion - const std::unordered_set& vals() const noexcept; - - //! 
Return in insertion order - const std::deque& deterministic_vals() const noexcept; - //! Return all Vals in math expressions that cannot be eliminated. //! //! It is generally equivalent to vals that are used to generate @@ -183,11 +152,6 @@ class TORCH_CUDA_CU_API Fusion final { //! also included as they must show up in the final code. std::vector usedMathVals(); - //! Return the set of Exprs registered with this fusion. Warning: This will - //! return exprs outside inputs/outputs, so can be unsafe for use with - //! segmented fusions. - const std::unordered_set& unordered_exprs() const noexcept; - //! Return all Exprs that use val std::unordered_set unordered_uses(Val* val) const; @@ -197,12 +161,6 @@ class TORCH_CUDA_CU_API Fusion final { //! Indicate to kernel to set itself up to generate random numbers bool isStochastic(); - //! Indicate that the fusion contains reduction operations - bool hasReduction(); - - //! Indicate that the fusion contains welford operations - bool hasWelford(); - //! Run fusion segmentation algorithm to create a segmented fusion std::unique_ptr segment( const at::ArrayRef& inputs); @@ -217,9 +175,6 @@ class TORCH_CUDA_CU_API Fusion final { std::vector getTerminatingOutputs(); - bool hasInput(const Val* val) const; - bool hasOutput(const Val* val) const; - // Aliasing output to input value, this is a WAR to allow inplace update on // input tensor. // Note: this is not always safe and should be used with extra caution. @@ -262,36 +217,40 @@ class TORCH_CUDA_CU_API Fusion final { return is_during_update_uses_; } + const auto& ioAlias() const { + return io_alias_; + } + protected: friend SegmentCandidateFinder; friend SegmentedFusion; friend class TranslateApplicableWelford; + friend Val; static IrCloner copy(const Fusion* from, Fusion* to); - private: - // Return an int that monotonically increases for each val/expr, some are - // explicitly incremented by type. - StmtNameType getValName(ValType vtype); - StmtNameType getExprName(); + //! Register the Val with this fusion + virtual void registerVal(Val* val) override; + //! Register expr with this fusion. + //! When we register an expression, we want to update the dependency tracking + //! of Vals. If this container is a not a Kernel, it will remove previous + //! definitions of outputs and register this Expr as the definition. Otherwise + //! will update definition if not previously set, but will not remove old + //! definitions. + virtual void registerExpr(Expr* expr) override; + + //! Clear Expr's from TV uses that are not required to produce outputs from + //! inputs. Only other place this is used (other than Fusion) is in + //! 
Val::uses() + void resetTvUses(); + + private: // Determine if the two values are compatible for aliasing // Same DataType, ValType, and number of dimensions bool isAliasCompatible(Val* left, Val* right); private: - // Sets of all Vals/Exprs registered with this fusion - // (val_deque_ is not owning the objects) - std::unordered_set val_set_; - std::deque val_deque_; - std::unordered_set expr_set_; - - // Values names counters - std::unordered_map val_type_name_map_; - - // Expression names counter - StmtNameType expr_name_counter_ = 0; - // Fusion inputs and outputs std::vector inputs_; std::vector outputs_; diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp index 9ff25780814..fd7b6fc502a 100644 --- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp @@ -322,7 +322,7 @@ void SegmentedFusion::draw() { for (auto group : groups()) { for (auto expr : group->exprs()) { - if (ir_utils::isTVOp(expr)) { + if (ir_utils::isTvOp(expr)) { expr_color_map[expr] = group_index; } } @@ -659,8 +659,8 @@ TensorView* castIntermediateValueInCompleteFusion( } // Create the actual domain and tv. - return new TensorView( - new TensorDomain( + return IrBuilder::create( + IrBuilder::create( new_root_domain, std::vector(new_root_domain.size(), true)), data_type); }; @@ -680,8 +680,8 @@ TensorView* castIntermediateValueInCompleteFusion( } // Insert the cast ops. - new UnaryOp(UnaryOpType::Cast, half_precision_tv, original_tv); - new UnaryOp(UnaryOpType::Cast, fp32_tv, half_precision_tv); + IrBuilder::create(UnaryOpType::Cast, half_precision_tv, original_tv); + IrBuilder::create(UnaryOpType::Cast, fp32_tv, half_precision_tv); // Return the new tv to replace original tv with // on the segmented edges. @@ -1740,9 +1740,10 @@ TranslateApplicableWelford::TranslateApplicableWelford( Fusion* fusion, const at::ArrayRef& runtime_inputs) : runtime_inputs_(runtime_inputs) { + auto exprs = fusion->exprs(); std::vector orignal_welfords( - ir_utils::filterByType(fusion->unordered_exprs()).begin(), - ir_utils::filterByType(fusion->unordered_exprs()).end()); + ir_utils::filterByType(exprs).begin(), + ir_utils::filterByType(exprs).end()); if (wouldTranslateToPersistent(orignal_welfords)) { for (auto welford : orignal_welfords) { @@ -1829,6 +1830,14 @@ bool TranslateApplicableWelford::wouldTranslateToPersistent( [&original_to_test_map](auto welford) { return original_to_test_map.clone(welford); }); + // Copied welfords will be invalidated on translation, but Vals will be + // reused, keep a reference to them. + std::vector welford_avgs; + std::vector welford_vars; + for (auto welford : copied_welfords) { + welford_avgs.push_back(welford->outAvg()); + welford_vars.push_back(welford->outVar()); + } // Translate the welford ops for (auto welford_to_translate : copied_welfords) { @@ -1860,6 +1869,21 @@ bool TranslateApplicableWelford::wouldTranslateToPersistent( return original_to_test_map.clone(out); }); + // If only average is used from welford, we should still translate, but we + // might not detect persistence if variance isn't actually used/marked as an + // output in the test. 
+ for (auto outs_i : c10::irange(welford_avgs.size())) { + auto avg = welford_avgs[outs_i]; + auto var = welford_vars[outs_i]; + if (avg->uses().empty()) { + test_group_outputs_.push_back(avg); + } + + if (var->uses().empty()) { + test_group_outputs_.push_back(var); + } + } + // Temporarily localize test copy around // the group boundary FusionSegmentGuard fsg( @@ -1900,7 +1924,7 @@ void TranslateApplicableWelford::translateSingleWelford(WelfordOp* welford) { // Create scalar version of the feature element // counting. - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(1); std::vector broadcast_mask(in_root.size(), false); for (const auto i : c10::irange(in_root.size())) { if (out_root[i]->isReduction()) { @@ -1913,7 +1937,7 @@ void TranslateApplicableWelford::translateSingleWelford(WelfordOp* welford) { // Build a normalization expression group that is // equivalent to a welford operation. auto x_sum = sum(in_val, red_axes); - new BinaryOp(BinaryOpType::Div, out_avg, x_sum, num_features); + IrBuilder::create(BinaryOpType::Div, out_avg, x_sum, num_features); // welford.avg may be broadcast. Reuse it if found. TensorView* x_avg_bcast = nullptr; for (auto& use_expr : out_avg->uses()) { @@ -1949,8 +1973,12 @@ void TranslateApplicableWelford::translateSingleWelford(WelfordOp* welford) { } auto x_mean_sub_pow = mul(x_mean_sub, x_mean_sub); - new ReductionOp(BinaryOpType::Add, new Double(0.0), out_var, x_mean_sub_pow); - new UnaryOp(UnaryOpType::Set, out_N, num_features); + IrBuilder::create( + BinaryOpType::Add, + IrBuilder::create(0.0), + out_var, + x_mean_sub_pow); + IrBuilder::create(UnaryOpType::Set, out_N, num_features); // out_avg, out_N are now outputs of a pointwise ops and we // need to clear out its reduction domains. 
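The translation above replaces each WelfordOp with a two-pass expression group: a sum reduction divided by the feature count for the mean, then a broadcast subtract, square, and add-reduction for the variance term. A minimal standalone sketch of that equivalence follows (illustrative only, not part of this patch; it assumes the Welford outputs are the mean, M2 as the sum of squared deviations, and the count N, which is how out_avg, out_var, and out_N are treated here):

#include <cstddef>
#include <vector>

struct WelfordResult {
  double avg = 0.0;
  double m2 = 0.0; // sum of squared deviations from the mean
  std::size_t n = 0;
};

// Single-pass Welford update, i.e. what a WelfordOp accumulates per element.
void welfordUpdate(WelfordResult& w, double x) {
  ++w.n;
  const double delta = x - w.avg;
  w.avg += delta / static_cast<double>(w.n);
  w.m2 += delta * (x - w.avg);
}

// Two-pass equivalent built from plain pointwise and reduction steps:
// sum -> divide by N -> subtract broadcast mean -> square -> reduce-add.
WelfordResult twoPass(const std::vector<double>& xs) {
  WelfordResult w;
  w.n = xs.size();
  double sum = 0.0;
  for (double x : xs) {
    sum += x; // sum(in_val, red_axes)
  }
  w.avg = w.n == 0 ? 0.0 : sum / static_cast<double>(w.n); // Div by num_features
  for (double x : xs) {
    const double d = x - w.avg; // sub(in_val, x_avg_bcast)
    w.m2 += d * d;              // mul(x_mean_sub, x_mean_sub) + ReductionOp(Add)
  }
  return w;
}

Both forms agree up to floating-point rounding; the point of the rewrite is that the two-pass form consists of ordinary pointwise and reduction expressions that the persistent-normalization scheduling path can analyze and fuse.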
@@ -2687,14 +2715,20 @@ void SegmentCandidateFinder::findSegments() { } } + auto reduction_ops = + ir_utils::getReductionOps(segmented_fusion_->completeFusion()); + auto welford_ops = ir_utils::filterByType(reduction_ops); + if (options_.run_translate_welford && - segmented_fusion_->completeFusion()->hasWelford()) { + (welford_ops.begin() != welford_ops.end())) { TranslateApplicableWelford::run(segmented_fusion_.get(), runtime_inputs_); } for (auto group : groups()) { - // Set heuristics in case single reduction kernels were left out - group->setHeuristic(deriveHeuristic(group)); + if (!group->outputs().empty()) { + // Set heuristics in case single reduction kernels were left out + group->setHeuristic(deriveHeuristic(group)); + } } // Remove all scalar edges since they do not represent actual @@ -2913,7 +2947,7 @@ void SegmentCandidateFinder::resolveInputsInGroup(SegmentedGroup* group) { group->input_vals = IterVisitor::getInputsTo(group->inputs()); // Grab all expressions needed to produce to_visit - auto input_exprs = ExprSort::getExprs(completeFusion(), to_visit); + auto input_exprs = StmtSort::getExprs(completeFusion(), to_visit); // Insert those expressions at the beginning of the group group->exprs_.insert( @@ -3102,7 +3136,7 @@ void SegmentedFusion::annotateFP16IntermediateTensors() { } } -TORCH_CUDA_CU_API std::string toString( +std::string toString( const SegmentCandidateFinderOptions& segment_options) { std::stringstream ss; ss << "segmentation phases {\n"; diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.h b/torch/csrc/jit/codegen/cuda/fusion_segmenter.h index 61fa966348e..63124839fc1 100644 --- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.h +++ b/torch/csrc/jit/codegen/cuda/fusion_segmenter.h @@ -288,11 +288,11 @@ class TORCH_CUDA_CU_API SegmentedFusion { } Val* findAlias(Val* val) const { - Val* alias_val = nullptr; - if (complete_fusion_->io_alias_.count(val) != 0) { - alias_val = complete_fusion_->io_alias_[val]; + auto alias_it = complete_fusion_->ioAlias().find(val); + if (alias_it != complete_fusion_->ioAlias().end()) { + return alias_it->second; } - return alias_val; + return nullptr; } //! 
Make a clone of the group and convert to fusion diff --git a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp index 47c0316abda..b2d1f893ba6 100644 --- a/torch/csrc/jit/codegen/cuda/graph_fuser.cpp +++ b/torch/csrc/jit/codegen/cuda/graph_fuser.cpp @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -19,6 +21,7 @@ #include #include +#include #include #include @@ -46,6 +49,13 @@ bool usedOnlyInDtype(Value* v) { Value* broadcastSizes(at::ArrayRef sizes) { AT_ASSERT(!sizes.empty()); Graph* graph = sizes[0]->owningGraph(); + Node* insertion_point = sizes[0]->node()->next(); + for (size_t i = 1; i < sizes.size(); i++) { + if (insertion_point->isBefore(sizes[i]->node()->next())) { + insertion_point = sizes[i]->node()->next(); + } + } + WithInsertPoint guard(insertion_point); Node* broadcast_n = graph->insertNode(graph->create(prim::BroadcastSizes, sizes)); broadcast_n->output()->setType(ListType::ofInts()); @@ -66,9 +76,13 @@ Value* createConditionalConstant(Node* profile_ivalue) { auto int_list = profile_ivalue->is(Symbol::attr("profiled_bool_list")); std::vector bool_list(int_list.begin(), int_list.end()); val = IValue(bool_list); - } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_size"))) { + } else if (profile_ivalue->hasAttribute( + Symbol::attr("profiled_reduction_size"))) { // int[] - val = IValue(profile_ivalue->is(Symbol::attr("profiled_size"))); + val = IValue(profile_ivalue->is(Symbol::attr("profiled_reduction_size"))); + } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_view_size"))) { + // int[] + val = IValue(profile_ivalue->is(Symbol::attr("profiled_view_size"))); } else if (profile_ivalue->hasAttribute(Symbol::attr("profiled_bool"))) { // bool val = IValue( @@ -101,6 +115,7 @@ struct CudaGraphFuser { std::unique_ptr aliasDb_; std::shared_ptr graph_; Symbol kind_ = prim::CudaFusionGroup; + std::unordered_map fusion_value_to_runtime_shape_; // nvrtc has a limit on the number of arguments allowed in a CUDA kernel. // The specific limit is a function of constant memory size, amount available @@ -764,9 +779,11 @@ struct CudaGraphFuser { // longer valid so we rescan the new FusionGroup for more fusions... return std::make_pair(fusion_group.value()->reverseIterator(), true); } - // horizontal fusion only applies on tensor inputs + + // horizontal fusion only applies on non-scalar tensor inputs if (getHorizontalFusion() && - producer->type()->isSubtypeOf(*TensorType::get())) { + producer->type()->isSubtypeOf(*TensorType::get()) && + !is_cpu_scalar(*producer->type()->cast())) { // fusing nodes sharing inputs, this could save memory bandwidth by // reducing number of tensor read. for (const auto& u : producer->uses()) { @@ -838,6 +855,7 @@ struct CudaGraphFuser { // Builds up expressions that compute shapes of all intermediates (and // outputs) of the fusion group, based on the sizes of inputs. You should run // DCE to remove those that you end up not using. 
+ // TODO: Add shape support for view, reshape, unsqueeze, and squeeze std::unordered_map buildShapeExpressions(Node* fusion_group) { WithInsertPoint insert_guard{fusion_group->next()}; std::unordered_map shape_of; @@ -850,7 +868,9 @@ struct CudaGraphFuser { AT_ASSERT(inputs.size() == sinputs.size()); for (const auto i : c10::irange(inputs.size())) { if (inputs[i]->type()->isSubtypeOf(*TensorType::get())) { - shape_of[sinputs[i]] = graph->insert(aten::size, {inputs[i]}); + auto sinput_value = graph->insert(aten::size, {inputs[i]}); + shape_of[sinputs[i]] = sinput_value; + sinput_value->node()->moveBefore(fusion_group); } } @@ -869,6 +889,26 @@ struct CudaGraphFuser { } } + // Place all the shape expressions for intermediates in fusion + // before the CudaFusionGroup + graph->setInsertPoint(fusion_group); + + // hmmm, do I need to setInsertPoint... + const auto map_inputs = [&](Value* v) -> Value* { + // if constant ever has an input, it has to come from + // profile_ivalue dependency + if (v->node()->kind() == prim::Param && + fusion_group->input(v->offset())->node()->kind() == + prim::profile_ivalue) { + // we need to map it along profile_ivalue dependency + return fusion_group->input(v->offset()); + } else { + throw std::runtime_error( + std::string("unexpected input from node") + + v->node()->kind().toDisplayString()); + } + }; + for (Node* n : subgraph->nodes()) { // XXX: Use of shape_of.emplace is crucial to the output shape // optimization! @@ -912,21 +952,6 @@ struct CudaGraphFuser { n->input(2)->node()->kind() == prim::Constant, "only supports reduction axes and keepdim being constant"); - // hmmm, do I need to setInsertPoint... - const auto map_inputs = [&](Value* v) -> Value* { - // if constant ever has an input, it has to come from - // profile_ivalue dependency - if (v->node()->kind() == prim::Param && - fusion_group->input(v->offset())->node()->kind() == - prim::profile_ivalue) { - // we need to map it along profile_ivalue dependency - return fusion_group->input(v->offset()); - } else { - throw std::runtime_error( - std::string("unexpected input from node") + - v->node()->kind().toDisplayString()); - } - }; Node* in1_const = graph->createClone(n->input(1)->node(), map_inputs); graph->insertNode(in1_const); Node* in2_const = graph->createClone(n->input(2)->node(), map_inputs); @@ -1000,6 +1025,57 @@ struct CudaGraphFuser { } continue; } + if (n->kind() == aten::native_dropout) { + TORCH_INTERNAL_ASSERT( + shape_of.count(n->input(0)) > 0, + "buildShapeExpressions failed at accessing input shapes"); + shape_of.emplace(n->output(0), shape_of.at(n->input(0))); + shape_of.emplace(n->output(1), shape_of.at(n->input(0))); + continue; + } + if (n->kind() == prim::unsqueeze_copy) { + TORCH_INTERNAL_ASSERT( + shape_of.count(n->input(0)) > 0, + "buildShapeExpressions failed at accessing input shapes"); + TORCH_INTERNAL_ASSERT( + n->input(1)->node()->kind() == prim::Constant, + "only supports unsqueeze axes being constant"); + Node* dim_const = graph->createClone(n->input(1)->node(), map_inputs); + graph->insertNode(dim_const); + std::vector inputs = { + shape_of.at(n->input(0)), dim_const->output()}; + Node* size_node = graph->insertNode(graph->create( + Symbol::fromQualString("prim::infer_unsqueeze_size"), inputs, 1)); + Value* size = size_node->output(0); + size->setType(ListType::ofInts()); + shape_of.emplace(n->output(), size); + continue; + } + if (n->kind() == prim::squeeze_copy) { + TORCH_INTERNAL_ASSERT( + shape_of.count(n->input(0)) > 0, + "buildShapeExpressions failed at 
accessing input shapes"); + TORCH_INTERNAL_ASSERT( + n->inputs().size() == 2 || n->inputs().size() == 1, + "prim::squeeze_copy expects one or two inputs"); + std::vector inputs = {shape_of.at(n->input(0))}; + + if (n->inputs().size() == 2) { + TORCH_INTERNAL_ASSERT( + n->input(1)->node()->kind() == prim::Constant, + "only supports squeeze axes being constant"); + Node* dim_const = graph->createClone(n->input(1)->node(), map_inputs); + graph->insertNode(dim_const); + inputs.push_back(dim_const->output()); + } + Node* size_node = graph->insertNode(graph->create( + Symbol::fromQualString("prim::infer_squeeze_size"), inputs, 1)); + Value* size = size_node->output(0); + size->setType(ListType::ofInts()); + shape_of.emplace(n->output(), size); + continue; + } + auto tensor_inputs = filter(n->inputs(), [](Value* v) { return v->type()->isSubtypeOf(*TensorType::get()); }); @@ -1025,8 +1101,9 @@ struct CudaGraphFuser { // TODO: failure in buildShapeExpressions should not break fusion execution, // we can add a try/catch here to bailout from removeOutputsUsedOnlyInSize. GRAPH_DEBUG("before build shape expression: ", *graph_); - auto shape_of = buildShapeExpressions(fusion_group); + fusion_value_to_runtime_shape_ = buildShapeExpressions(fusion_group); GRAPH_DEBUG("after build shape expression: ", *graph_); + auto outputs = fusion_group->outputs().vec(); auto soutputs = subgraph->outputs().vec(); // XXX: Iterating in this order is not only good for performance reasons! @@ -1035,12 +1112,14 @@ struct CudaGraphFuser { for (int64_t i = static_cast(outputs.size()) - 1; i >= 0; --i) { auto output = outputs[i]; auto soutput = soutputs[i]; - if (usedOnlyInDtypeAndSize(output) && shape_of.count(soutput) > 0) { + if (usedOnlyInDtypeAndSize(output) && + fusion_value_to_runtime_shape_.count(soutput) > 0) { bool has_dtype = usedInDtype(output); auto uses = output->uses(); for (Use u : uses) { if (u.user->matches("aten::size(Tensor self) -> int[]")) { - u.user->output()->replaceAllUsesWith(shape_of.at(soutput)); + u.user->output()->replaceAllUsesWith( + fusion_value_to_runtime_shape_.at(soutput)); u.user->destroy(); } else if (u.user->matches("prim::dtype(Tensor a) -> int")) { continue; @@ -1286,6 +1365,55 @@ void PeepholeOptimizeShapeExpressions(Block* block) { } } +// view_sizes_runtime is the profiled-ivalue argument for view-size. +// view_sizes_constant_list is the constant list recorded during profiling runs. +Value* guardView( + Node* fusion, + std::unordered_map& fusion_value_to_runtime_size, + Node* versioning_if, + Node* view, + Value* view_sizes_runtime) { + // 1. Get self tensor sizes and view_sizes + auto self_value = view->inputs().front(); + auto self_type = self_value->type()->cast(); + auto self_sizes_constant_list = getTensorSizes(self_type); + + auto view_sizes_constant_list = + constant_as>(view->inputs().back()); + TORCH_INTERNAL_ASSERT(view_sizes_constant_list.has_value()); + + // 2. Get constraints for self tensor and view_sizes + auto constraints = analyzeViewConstraint( + self_sizes_constant_list, view_sizes_constant_list->vec()); + + // 3. Add constraints as constant to graph + auto self_tensor_constraint = fusion->owningGraph()->insertConstant( + IValue(constraints.original_constraint)); + self_tensor_constraint->node()->moveBefore(versioning_if); + auto view_sizes_constraint = + fusion->owningGraph()->insertConstant(IValue(constraints.new_constraint)); + view_sizes_constraint->node()->moveBefore(versioning_if); + + // 4. 
Create CudaFusionViewGuard using input tensor, profile_ivalue + // for view_sizes list, and constraints + TORCH_INTERNAL_ASSERT( + fusion_value_to_runtime_size.find(self_value) != + fusion_value_to_runtime_size.end(), + "Failed to find runtime size for fusion value:\t", + self_value->node()->kind().toDisplayString()); + Node* viewcheck_node = + fusion->owningGraph() + ->create( + c10::Symbol::fromQualString("prim::CudaFusionViewGuard"), + {fusion_value_to_runtime_size.at(self_value), + view_sizes_runtime, + self_tensor_constraint, + view_sizes_constraint}, + 1) + ->insertBefore(versioning_if); + return viewcheck_node->output(); +} + //! [ Note -- CudaFusionGuard implementation ] //! //! shamelessly copying code from NNC (tensorexpr_fuser) with very little @@ -1324,7 +1452,9 @@ void PeepholeOptimizeShapeExpressions(Block* block) { //! //! TODO: we also need to assert/check reduction axes and replace it with //! constants in `CudaFusionGroup` -void guardFusionGroup(Node* fusion) { +void guardFusionGroup( + Node* fusion, + std::unordered_map& fusion_value_to_runtime_size) { // Fixup types of the subgraph inputs std::vector guard_types; std::vector tensor_inputs_to_check; @@ -1375,10 +1505,12 @@ void guardFusionGroup(Node* fusion) { versioning_if->insertAfter(typecheck_node); + auto fusion_graph = fusion->g(attr::Subgraph); + std::vector check_flags = {}; + // Fill in the false block. It should contain the unoptimized // copy of the fused subgraph, unless we have conditional constants from // profiled_ivalue; - auto fusion_graph = fusion->g(attr::Subgraph); std::shared_ptr fb_graph; // resource holder; // Restore the dependency for constant introduced by profiled_ivalue within // the graph. @@ -1425,11 +1557,10 @@ void guardFusionGroup(Node* fusion) { // 2. 
REMOVE conditional constant dependency in fusion group size_t compensation = 0; - // get a constant false, which is used by `and` pattern later + // get a constant true, which is used by `and` pattern later auto const_true = fusion->owningGraph()->insertConstant(IValue(true)); const_true->node()->moveBefore(versioning_if); - std::vector check_flags = {}; for (const auto& original_offset : profiled_ivalue_indices) { size_t offset = original_offset - compensation; @@ -1457,7 +1588,7 @@ void guardFusionGroup(Node* fusion) { ->insertBefore(versioning_if) ->output(); } else if (fusion->input(offset)->node()->hasAttribute( - Symbol::attr("profiled_size"))) { + Symbol::attr("profiled_reduction_size"))) { // TODO(profile_size): check sizes here with special size comparison op // TORCH_INTERNAL_ASSERT(false, "not implemented yet"); ivalue_check = @@ -1468,6 +1599,28 @@ void guardFusionGroup(Node* fusion) { 1) ->insertBefore(versioning_if) ->output(); + } else if (fusion->input(offset)->node()->hasAttribute( + Symbol::attr("profiled_view_size"))) { + // TODO: Add support for dynamic split to view guard + + // Path from profile-ivalue to prim::view_copy operation + // profile-ivalue -> Uses: [Constant, CudaFusionGroup] + // Get argument position in CudaFusionGroup + // Get argument in subgraph for CudaFusionGroup + // CudaFusionGroup argument -> Constant List -> prim::view_copy + auto cuda_fusion_group_arg = profiled_ival->uses().back().offset; + auto subgraph_arg = fusion_graph->inputs()[cuda_fusion_group_arg]; + auto constant = subgraph_arg->uses().front().user->output(); + auto view = constant->uses().front().user; + TORCH_INTERNAL_ASSERT( + view->kind() == prim::view_copy || + view->kind() == prim::reshape_copy); + ivalue_check = guardView( + fusion, + fusion_value_to_runtime_size, + versioning_if, + view, + profiled_ival); } else { ivalue_check = fusion->owningGraph() ->create(aten::eq, {profiled_ival, const_o}, 1) @@ -1495,22 +1648,24 @@ void guardFusionGroup(Node* fusion) { fusion_graph->eraseInput(offset); compensation++; } - - if (!check_flags.empty()) { - // attaching output from CudaFusionGuard to profile ivalue checks - check_flags.emplace_back(typecheck_result); - auto graph = fusion->owningGraph(); - auto bool_list_node = - graph->insertNode(graph->createList(BoolType::get(), check_flags)); - bool_list_node->moveBefore(versioning_if); - Value* bool_list = bool_list_node->output(); - // new typecheck_result - typecheck_result = graph->insert(aten::all, {bool_list}); - typecheck_result->node()->moveBefore(versioning_if); - } // update graph in fusion node fusion->g_(attr::Subgraph, fusion_graph); - } else { + } + + if (!check_flags.empty()) { + // attaching output from CudaFusionGuard to profile ivalue checks + check_flags.emplace_back(typecheck_result); + auto graph = fusion->owningGraph(); + auto bool_list_node = + graph->insertNode(graph->createList(BoolType::get(), check_flags)); + bool_list_node->moveBefore(versioning_if); + Value* bool_list = bool_list_node->output(); + // new typecheck_result + typecheck_result = graph->insert(aten::all, {bool_list}); + typecheck_result->node()->moveBefore(versioning_if); + } + + if (profiled_ivalue_indices.empty()) { WithInsertPoint guard(false_block->return_node()); const auto subgraph_outputs = insertGraph(*fusion->owningGraph(), *fusion_graph, fusion->inputs()); @@ -1536,11 +1691,13 @@ void guardFusionGroup(Node* fusion) { } } -void guardFusionGroups(Block* block) { +void guardFusionGroups( + Block* block, + std::unordered_map& 
fusion_value_to_runtime_size) { std::vector fusions; for (Node* n : block->nodes()) { for (Block* b : n->blocks()) { - guardFusionGroups(b); + guardFusionGroups(b, fusion_value_to_runtime_size); } if (n->kind() == prim::CudaFusionGroup) { fusions.push_back(n); @@ -1550,7 +1707,7 @@ void guardFusionGroups(Block* block) { // step 1: a. add prim::CudaFusionGuard and fallback logic // b. insert guard logic of profile_ivalue with if block // c. restore conditional constant to non-constant for fallback - guardFusionGroup(fusion); + guardFusionGroup(fusion, fusion_value_to_runtime_size); } } @@ -1918,6 +2075,85 @@ void decomposeLinearOps(Block* block) { } } +// Replace 'operation' with 'operation_copy' to guard alias operations. +// Supports View, Reshape, Squeeze, and Unsqueeze +void replaceAliasOpsWithCopy(std::shared_ptr& graph, Block* block) { + static std::unordered_map op_mapping( + {{aten::view, prim::view_copy}, + {aten::reshape, prim::reshape_copy}, + {aten::squeeze, prim::squeeze_copy}, + {aten::unsqueeze, prim::unsqueeze_copy}}); + + std::vector maybe_alias_nodes; + for (Node* n : block->nodes()) { + for (Block* b : n->blocks()) { + replaceAliasOpsWithCopy(graph, b); + } + if (op_mapping.find(n->kind()) != op_mapping.end()) { + maybe_alias_nodes.push_back(n); + } + } + + auto alias_db = std::make_unique(graph); + for (Node* n : maybe_alias_nodes) { + if (!alias_db->safeToChangeAliasingRelationship( + n->input(0), n->output(0))) { + continue; + } + + WithInsertPoint guard(n); + auto op_copy = + graph->insertNode(graph->create(op_mapping[n->kind()], n->inputs(), 1)); + op_copy->output()->setType(n->output(0)->type()); + + // adding newly created value into alias_db; + alias_db->createValue(op_copy->output()); + + n->output()->replaceAllUsesWith(op_copy->output()); + n->destroy(); + } +} + +// Revert all 'op_copy' with 'op' except in CudaFusionGroup +// e.g., Any non-fused alias operation including within the prim::FallbackGraph +// Supports View, Reshape, Squeeze, and Unsqueeze +void revertAliasCopyOps(std::shared_ptr& graph, Block* block) { + static std::unordered_map op_mapping( + {{prim::view_copy, aten::view}, + {prim::reshape_copy, aten::reshape}, + {prim::squeeze_copy, aten::squeeze}, + {prim::unsqueeze_copy, aten::unsqueeze}}); + + std::vector alias_copy_ops; + for (Node* n : block->nodes()) { + // Allow alias copy ops in CudaFusionGroup + if (n->kind() == prim::CudaFusionGroup) { + continue; + } + // Revert alias copy ops within FallbackGraph + if (n->kind() == prim::FallbackGraph) { + auto subgraph = n->g(attr::Subgraph); + revertAliasCopyOps(subgraph, subgraph->block()); + } + for (Block* b : n->blocks()) { + revertAliasCopyOps(graph, b); + } + // Revert any non-fused alias copy ops + if (op_mapping.find(n->kind()) != op_mapping.end()) { + alias_copy_ops.push_back(n); + } + } + + for (Node* n : alias_copy_ops) { + WithInsertPoint guard(n); + auto reverted_op = + graph->insertNode(graph->create(op_mapping[n->kind()], n->inputs(), 1)); + reverted_op->output()->setType(n->output(0)->type()); + n->output()->replaceAllUsesWith(reverted_op->output()); + n->destroy(); + } +} + // break `conv2d` layer into `conv2d` and `add_optional`. This allows us to fuse // the binary operation without supporting gemm. // Note that we are not breaking `conv2d` layer without bias. 
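One detail worth spelling out from guardView above: beyond the constraint lists produced by analyzeViewConstraint, a profiled view is only replayable at runtime if the requested sizes still account for the input's element count, with at most one dimension left as -1 to be inferred. A standalone sketch of that baseline rule (illustrative only; the actual prim::CudaFusionViewGuard compares the recorded constraints against the runtime size lists rather than calling a helper like this):

#include <cstdint>
#include <vector>

// Returns true if view_sizes can describe a tensor with the same element
// count as input_sizes, allowing at most one -1 entry to be inferred.
bool viewSizesCompatible(
    const std::vector<int64_t>& input_sizes,
    const std::vector<int64_t>& view_sizes) {
  int64_t input_numel = 1;
  for (int64_t s : input_sizes) {
    input_numel *= s;
  }
  int64_t known_numel = 1;
  int inferred_dims = 0;
  for (int64_t s : view_sizes) {
    if (s == -1) {
      ++inferred_dims; // at most one dimension may be inferred
    } else {
      known_numel *= s;
    }
  }
  if (inferred_dims > 1) {
    return false;
  }
  if (inferred_dims == 1) {
    return known_numel != 0 && input_numel % known_numel == 0;
  }
  return known_numel == input_numel;
}

For example, an input of sizes {2, 3, 4} passes with view sizes {6, -1} (the -1 resolves to 4) but fails with {5, -1}; a guard failure at runtime is what sends execution down the unoptimized fallback branch of the versioning if.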
@@ -2030,12 +2266,16 @@ void CudaFuseGraph(std::shared_ptr& graph) { decomposeConvOps(graph->block()); GRAPH_DEBUG("After decompose decompose Conv Ops by nvfuser: ", *graph); - CudaGraphFuser(graph->block(), graph).run(); + replaceAliasOpsWithCopy(graph, graph->block()); + GRAPH_DEBUG("replace alias_op with alias_copy by nvfuser: ", *graph); + + CudaGraphFuser cgf(graph->block(), graph); + cgf.run(); GRAPH_DEBUG("After Fusion: ", *graph); // guard input types as well as conditional constants from // aten::profile_ivalue - guardFusionGroups(graph->block()); + guardFusionGroups(graph->block(), cgf.fusion_value_to_runtime_shape_); GRAPH_DEBUG("After Guard Fusion: ", *graph); // mutate `aten::_batch_norm_impl_index` and @@ -2053,6 +2293,10 @@ void CudaFuseGraph(std::shared_ptr& graph) { // optimization targeting AMP removeOutputUsedOnlyInDtype(graph->block()); GRAPH_DEBUG("After removeOutputUsedOnlyInDtype: ", *graph); + + revertAliasCopyOps(graph, graph->block()); + GRAPH_DEBUG("revert alias_copy ops by nvfuser: ", *graph); + // After FuseGraph some common subexpressions may come back EliminateCommonSubexpression(graph); // We might have emitted a fair amount of useless shape propagating code, so diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index 39176a60c53..8e151372b75 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -10,9 +10,8 @@ #include #include #include -#include -#include #include +#include #include #include #include @@ -44,9 +43,9 @@ class ContigIDs : public OptInDispatch { using OptInDispatch::handle; // Mark if ids are result of contigous merges - std::unordered_set contig_ids; + std::unordered_set contig_ids; // Given contiguous domain, return all iter domains within its history. - std::unordered_map> + std::unordered_map> within_contig_ids; const std::vector& root_domain_; const std::vector& root_contiguity_; @@ -58,7 +57,7 @@ class ContigIDs : public OptInDispatch { }); } - bool isContig(kir::IterDomain* id) { + bool isContig(IterDomain* id) { return contig_ids.find(id) != contig_ids.end(); } @@ -66,14 +65,11 @@ class ContigIDs : public OptInDispatch { void handle(Split*) override {} void handle(Merge* merge) override { - const auto gpu_lower = GpuLower::current(); - // If either input is non-contiguous so is output. const auto inner = merge->inner(); const auto outer = merge->outer(); - if ((!isContig(gpu_lower->lowerValue(inner)->as()) || - !isContig(gpu_lower->lowerValue(outer)->as()))) { + if (!isContig(inner) || !isContig(outer)) { return; } @@ -136,38 +132,34 @@ class ContigIDs : public OptInDispatch { // If we matched all inputs, the output is contiguous. Only want to keep the // top contig ID, lower ids should be placed in the "within_contig_ids" map // of top id. 
- auto kir_inner = - gpu_lower->lowerValue(merge->inner())->as(); - auto kir_outer = - gpu_lower->lowerValue(merge->outer())->as(); - auto kir_out = gpu_lower->lowerValue(merge->out())->as(); + auto out = merge->out()->as(); if (ordered_inputs.empty()) { - if (contig_ids.find(kir_inner) != contig_ids.end()) { - contig_ids.erase(kir_inner); + if (contig_ids.find(inner) != contig_ids.end()) { + contig_ids.erase(inner); } - if (contig_ids.find(kir_outer) != contig_ids.end()) { - contig_ids.erase(kir_outer); + if (contig_ids.find(outer) != contig_ids.end()) { + contig_ids.erase(outer); } - contig_ids.emplace(kir_out); + contig_ids.emplace(out); - std::unordered_set within_out; - within_out.emplace(kir_inner); - if (within_contig_ids.find(kir_inner) != within_contig_ids.end()) { - auto in_inner = within_contig_ids.at(kir_inner); + std::unordered_set within_out; + within_out.emplace(inner); + if (within_contig_ids.find(inner) != within_contig_ids.end()) { + auto in_inner = within_contig_ids.at(inner); within_out.insert(in_inner.begin(), in_inner.end()); - within_contig_ids.erase(kir_inner); + within_contig_ids.erase(inner); } - within_out.emplace(kir_outer); - if (within_contig_ids.find(kir_outer) != within_contig_ids.end()) { - auto in_outer = within_contig_ids.at(kir_outer); + within_out.emplace(outer); + if (within_contig_ids.find(outer) != within_contig_ids.end()) { + auto in_outer = within_contig_ids.at(outer); within_out.insert(in_outer.begin(), in_outer.end()); - within_contig_ids.erase(kir_outer); + within_contig_ids.erase(outer); } - within_contig_ids[kir_out] = within_out; + within_contig_ids[out] = within_out; } } @@ -195,8 +187,6 @@ class ContigIDs : public OptInDispatch { " != ", root_contiguity_.size()); - const auto gpu_lower = GpuLower::current(); - for (const auto i : c10::irange(root_domain_.size())) { // If a root domain has halo, can't use merged domain even if // both inputs are contiguous. HaloInfo is also initialized for @@ -204,32 +194,32 @@ class ContigIDs : public OptInDispatch { // RootAxisInfo. This should be safe as no rfactor tensor should // need halo. 
if (root_contiguity_[i] && - !gpu_lower->haloInfo().getRootAxisInfo(root_domain_[i]).hasHalo()) { - auto kir_root_domain_i = - gpu_lower->lowerValue(root_domain_[i])->as(); - contig_ids.emplace(kir_root_domain_i); - within_contig_ids[kir_root_domain_i] = - std::unordered_set(); + !GpuLower::current() + ->haloInfo() + .getRootAxisInfo(root_domain_[i]) + .hasHalo()) { + auto root_domain_i = root_domain_[i]->as(); + contig_ids.emplace(root_domain_i); + within_contig_ids[root_domain_i] = std::unordered_set(); is_contig_root[root_domain_[i]] = true; } else { is_contig_root[root_domain_[i]] = false; } } - auto exprs = ExprSort::getExprs(ids[0]->fusion(), {ids.begin(), ids.end()}); + auto exprs = StmtSort::getExprs(ids[0]->fusion(), {ids.begin(), ids.end()}); for (auto expr : exprs) { handle(expr); } } - const std::unordered_set contigIDs() const { + const std::unordered_set contigIDs() const { return contig_ids; } - const std:: - unordered_map> - withinContigIDs() const { + const std::unordered_map> + withinContigIDs() const { return within_contig_ids; } }; @@ -276,21 +266,18 @@ void updateHaloInfoForReference( // // ref_map: ref-to-consumer in consumer indexing; ref-to-producer in // producer indexing -std::unordered_map getReferenceHaloExtentMap( +std::unordered_map getReferenceHaloExtentMap( const ReferenceTensor& reference, const std::unordered_map& index_map_from_ref) { - const auto gpu_lower = GpuLower::current(); + const auto& halo_info = GpuLower::current()->haloInfo(); - const auto& halo_info = gpu_lower->haloInfo(); - - std::unordered_map reference_halo_extent_map; + std::unordered_map reference_halo_extent_map; // Propagate halo extents of the reference to the consumer or // producer tensor for (auto kv : index_map_from_ref) { - auto ref_id = gpu_lower->lowerValue(kv.first)->as(); - auto producer_or_consumer_id = - gpu_lower->lowerValue(kv.second)->as(); + auto ref_id = kv.first; + auto producer_or_consumer_id = kv.second; auto extent = halo_info.getExtent(ref_id); if (extent != nullptr) { reference_halo_extent_map[producer_or_consumer_id] = extent; @@ -302,7 +289,7 @@ std::unordered_map getReferenceHaloExtentMap( //! Offset of an index of a producer axis with respect to its //! corresponding consumer index -kir::Val* getProducerHaloOffset( +int getProducerHaloOffset( const TensorView* producer_tv, size_t producer_axis, const TensorView* consumer_tv) { @@ -325,41 +312,31 @@ kir::Val* getProducerHaloOffset( const auto p_pad = halo_map.getRootAxisInfo(producer_id).width(0); const auto c_pad = halo_map.getRootAxisInfo(consumer_id).width(0); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - kir::Val* offset = (p_pad->isConst() && c_pad->isConst()) - ? ir_builder.create( - p_pad->value().value() - c_pad->value().value()) - : ir_builder.subExpr(p_pad, c_pad); + auto offset = p_pad - c_pad; // If the consumer is a result of shifting the producer, adjust the // producer index per the offsets argument of the shift op. if (auto shift_op = dynamic_cast(consumer_tv->definition())) { - offset = ir_builder.subExpr( - offset, ir_builder.create(shift_op->offset(producer_axis))); + offset -= shift_op->offset(producer_axis); } return offset; } //! 
Offset producer index when necessary -kir::Val* getProducerIndexWithHalo( +Val* getProducerIndexWithHalo( const TensorView* producer_tv, size_t producer_axis, - kir::Val* producer_index, + Val* producer_index, const TensorView* consumer_tv) { const auto offset = getProducerHaloOffset(producer_tv, producer_axis, consumer_tv); - if (offset->isZeroInt()) { + if (offset == 0) { return producer_index; } - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - producer_index = ir_builder.addExpr(producer_index, offset); + producer_index = SimplifyingIrBuilder::addExpr(producer_index, offset); return producer_index; } @@ -368,58 +345,58 @@ kir::Val* getProducerIndexWithHalo( //! //! \param consumer_root_axis Position of corresponding consumer axis //! \param consumer_tv Consumer TensorView +//! \param index_map Mappings from consumer or reference to indices +//! \param use_reference_map True when index_map maps reference domains //! \param concrete_to_ref_map Mappings from concrete to reference domains -//! \param ref_index_map Mappings from reference domains to indices -kir::Val* getProducerOffsetWithGather( +Val* getProducerOffsetWithGather( size_t consumer_root_axis, const TensorView* consumer_tv, - const std::unordered_map& concrete_to_ref_map, - const std::unordered_map& ref_index_map) { + const std::unordered_map& index_map, + bool use_reference_map = false, + const std::unordered_map& concrete_to_ref_map = + {}) { const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); const auto gather_expr = dynamic_cast(consumer_tv->definition()); if (gather_expr == nullptr) { - return ir_builder.zeroVal(); + return gpu_lower->kernel()->zeroVal(); } // If the window extent is one, no specific offsetting // is necessary if (consumer_root_axis >= gather_expr->windowShape().size() || - gather_expr->windowShape()[consumer_root_axis]->isOneInt()) { - return ir_builder.zeroVal(); + gather_expr->windowShape()[consumer_root_axis] == 1) { + return gpu_lower->kernel()->zeroVal(); } // Basically, the goal is to build an expression of producer_index + // window_index, so we first need to locate the index expression // that corresponds to the window axis of this producer axis. - // Locate the root IterDomain of the reference that corresponds to the gather - // axis const auto window_axis = gather_expr->gatherAxis(consumer_root_axis); auto window_id = consumer_tv->getRootDomain().at(window_axis); - auto concrete_window_id = - gpu_lower->caIndexMap().getConcreteMappedID(window_id); - auto concrete_2_ref_it = concrete_to_ref_map.find(concrete_window_id); - TORCH_INTERNAL_ASSERT(concrete_2_ref_it != concrete_to_ref_map.end()); - IterDomain* reference_root_of_gather_axis = concrete_2_ref_it->second; - // Now that reference_root_of_gather_axis is the IterDomain for the - // window axis, take its corresponding index from the index map - auto window_idx = - ref_index_map.at(gpu_lower->lowerValue(reference_root_of_gather_axis) - ->as()); + // When index_map maps a reference tensor, find the corresponding + // reference ID of window_id. 
+ if (use_reference_map) { + auto concrete_window_id = + gpu_lower->caIndexMap().getConcreteMappedID(window_id); + auto concrete_2_ref_it = concrete_to_ref_map.find(concrete_window_id); + TORCH_INTERNAL_ASSERT(concrete_2_ref_it != concrete_to_ref_map.end()); + window_id = concrete_2_ref_it->second; + } - // Positive (or negative) padding at offset zero means the indexing - // shifted to the negative (or positive) direction. + auto window_idx = index_map.at(window_id); + + // Positive padding at offset zero means the indexing shifted to the + // negative direction. auto pad_width = gather_expr->padWidth()[consumer_root_axis][0]; // producer offset: window_index - padding - auto producer_offset = - ir_builder.subExpr(window_idx, ir_builder.create(pad_width)); + auto producer_offset = SimplifyingIrBuilder::subExpr( + window_idx, IrBuilder::create(pad_width)); return producer_offset; - ; } //! Offset a producer index of a gather expression @@ -428,13 +405,13 @@ kir::Val* getProducerOffsetWithGather( //! expression that accesses a window position that the current loop //! structure refers to. Use getGatherProducerOffset to create an //! offset Val. -kir::Val* getProducerIndexWithGather( - kir::Val* producer_index, +Val* getProducerIndexWithGather( + Val* producer_index, size_t producer_root_axis, const TensorView* producer_tv, const TensorView* consumer_tv, const std::unordered_map& concrete_to_ref_map, - const std::unordered_map& ref_index_map) { + const std::unordered_map& ref_index_map) { auto gather_op = dynamic_cast(consumer_tv->definition()); // Just return the producer index as is if this is not a gather @@ -460,22 +437,18 @@ kir::Val* getProducerIndexWithGather( ", producer_axis: ", producer_root_axis); - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); auto offset = getProducerOffsetWithGather( - consumer_axis, consumer_tv, concrete_to_ref_map, ref_index_map); - return ir_builder.addExpr(producer_index, offset); + consumer_axis, consumer_tv, ref_index_map, true, concrete_to_ref_map); + return SimplifyingIrBuilder::addExpr(producer_index, offset); } // Adjusts a global consumer index when its root domain is partially // split. Note that non-global consumer indices don't need any // adjustment. -kir::Val* getGlobalConsumerOffsetWithPartialSplit(kir::IterDomain* root_id) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto offset = gpu_lower->partialSplitMap().getStartOffset(root_id); +Val* getGlobalConsumerOffsetWithPartialSplit(IterDomain* root_id) { + auto offset = GpuLower::current()->partialSplitMap().getStartOffset(root_id); if (offset == nullptr) { - return ir_builder.zeroVal(); + return GpuLower::current()->kernel()->zeroVal(); } else { return offset; } @@ -488,13 +461,12 @@ kir::Val* getGlobalConsumerOffsetWithPartialSplit(kir::IterDomain* root_id) { // it needs to be added to the index. Also, when the producer itself // also has a non-zero split offset, that needs to be subtracted from // the index. 
-kir::Val* getProducerIndexWithPartialSplit( - kir::Val* producer_index, +Val* getProducerIndexWithPartialSplit( + Val* producer_index, IterDomain* producer_root_id, const TensorView* producer_tv, const TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); auto p2c = PairwiseRootDomainMap(producer_tv, consumer_tv) @@ -509,31 +481,29 @@ kir::Val* getProducerIndexWithPartialSplit( auto consumer_offset = gpu_lower->partialSplitMap().getStartOffset(consumer_root_id); - auto consumer_offset_kir = consumer_offset == nullptr - ? ir_builder.zeroVal() - : gpu_lower->lowerValue(consumer_offset); + consumer_offset = consumer_offset == nullptr ? gpu_lower->kernel()->zeroVal() + : consumer_offset; auto producer_offset = gpu_lower->partialSplitMap().getStartOffset(producer_root_id); - auto producer_offset_kir = producer_offset == nullptr - ? ir_builder.zeroVal() - : gpu_lower->lowerValue(producer_offset); + producer_offset = producer_offset == nullptr ? gpu_lower->kernel()->zeroVal() + : producer_offset; // If the producer is on global memory, it's always allocated // without trimming the out-of-bounds region, so the consumer offset // should be added to the index. if (producer_tv->getMemoryType() == MemoryType::Global) { - if (consumer_offset_kir->isZeroInt()) { + if (consumer_offset->isZeroInt()) { return producer_index; } else { - return ir_builder.addExpr(producer_index, consumer_offset_kir); + return IrBuilder::addExpr(producer_index, consumer_offset); } } // Non-global case. Difference of the split offsets must be // accounted. - auto diff = ir_builder.subExpr(consumer_offset_kir, producer_offset_kir); + auto diff = IrBuilder::subExpr(consumer_offset, producer_offset); kir::ExpressionEvaluator ee; auto diff_eval = ee.evaluate(diff); // We currently only allow constant offsetting @@ -543,19 +513,16 @@ kir::Val* getProducerIndexWithPartialSplit( return producer_index; } - return ir_builder.addExpr( - producer_index, ir_builder.create(diff_eval.value())); + return IrBuilder::addExpr( + producer_index, IrBuilder::create(diff_eval.value())); } } // namespace void IndexCompute::handle(Split* split) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto in_id = gpu_lower->lowerValue(split->in())->as(); - auto outer_id = gpu_lower->lowerValue(split->outer())->as(); - auto inner_id = gpu_lower->lowerValue(split->inner())->as(); + auto in_id = split->in()->as(); + auto outer_id = split->outer()->as(); + auto inner_id = split->inner()->as(); auto outer_it = index_map_.find(outer_id); auto inner_it = index_map_.find(inner_id); @@ -588,8 +555,8 @@ void IndexCompute::handle(Split* split) { } if (isZero(in_id)) { - index_map_[in_id] = ir_builder.create(0); - extent_map_[in_id] = ir_builder.create(0); + index_map_[in_id] = GpuLower::current()->kernel()->zeroVal(); + extent_map_[in_id] = GpuLower::current()->kernel()->zeroVal(); } else if (zero_merged_in && outer_zero) { index_map_[in_id] = inner_ind; extent_map_[in_id] = getExtent(inner_id); @@ -597,24 +564,21 @@ void IndexCompute::handle(Split* split) { index_map_[in_id] = outer_ind; extent_map_[in_id] = getExtent(outer_id); } else { - index_map_[in_id] = ir_builder.addExpr( - ir_builder.mulExpr(outer_ind, getExtent(inner_id)), inner_ind); + index_map_[in_id] = IrBuilder::addExpr( + IrBuilder::mulExpr(outer_ind, getExtent(inner_id)), inner_ind); // The extent should be updated only when its allocation is // partial, i.e., zero_merged_in is true. 
See PR #1270. if (zero_merged_in) { extent_map_[in_id] = - ir_builder.mulExpr(getExtent(outer_id), getExtent(inner_id)); + IrBuilder::mulExpr(getExtent(outer_id), getExtent(inner_id)); } } } void IndexCompute::handle(Merge* merge) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto out_id = gpu_lower->lowerValue(merge->out())->as(); - auto outer_id = gpu_lower->lowerValue(merge->outer())->as(); - auto inner_id = gpu_lower->lowerValue(merge->inner())->as(); + auto out_id = merge->out(); + auto outer_id = merge->outer(); + auto inner_id = merge->inner(); auto out_it = index_map_.find(out_id); if (out_it == index_map_.end()) { @@ -622,7 +586,7 @@ void IndexCompute::handle(Merge* merge) { } auto out_ind = out_it->second; - auto zero = ir_builder.zeroVal(); + auto zero = GpuLower::current()->kernel()->zeroVal(); if (isZero(out_id)) { index_map_[outer_id] = zero; @@ -643,17 +607,14 @@ void IndexCompute::handle(Merge* merge) { TORCH_INTERNAL_ASSERT(!input_ids.empty()); for (auto root_id : input_ids) { - index_map_[gpu_lower->lowerValue(root_id)->as()] = zero; + index_map_[root_id] = zero; } - index_map_[gpu_lower - ->lowerValue(*(input_ids.end() - 1)) - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - ->as()] = out_ind; + index_map_[*(input_ids.end() - 1)] = out_ind; return; } - kir::Val* inner_extent = getExtent(inner_id); + Val* inner_extent = getExtent(inner_id); // When the reference has halo extent for inner_id, that extent needs to // be used to un-merge @@ -718,8 +679,8 @@ void IndexCompute::handle(Merge* merge) { zero_merged_in_.emplace(inner_id); zero_merged_in_.emplace(outer_id); } else { - index_map_[outer_id] = ir_builder.divExpr(out_ind, inner_extent); - index_map_[inner_id] = ir_builder.modExpr(out_ind, inner_extent); + index_map_[outer_id] = IrBuilder::divExpr(out_ind, inner_extent); + index_map_[inner_id] = IrBuilder::modExpr(out_ind, inner_extent); } } @@ -739,13 +700,13 @@ void IndexCompute::handle(Expr* e) { // using TransformIter::runBackward; IndexCompute::IndexCompute( const TensorDomain* _td, - std::unordered_map initial_index_map, - std::unordered_map extent_map, - std::unordered_set zero_domains, - std::unordered_set zero_merged_in, + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in, const std::vector& root_contiguity, - std::unordered_set preferred_paths, - std::unordered_map reference_halo_extent_map) + std::unordered_set preferred_paths, + std::unordered_map reference_halo_extent_map) : td_(_td), index_map_(std::move(initial_index_map)), extent_map_(std::move(extent_map)), @@ -783,7 +744,7 @@ void IndexCompute::run() { traverseFrom(td_->fusion(), domain_vals, false); } -kir::Val* IndexCompute::getExtent(kir::IterDomain* id) { +Val* IndexCompute::getExtent(IterDomain* id) { // Pick from extent_map_ if available. 
Previously parallel // dimensions were ued (e.g., blockDim.x), however, it would result // in out-of-bounds errors when the extent of IterDomain is smaller @@ -795,11 +756,11 @@ kir::Val* IndexCompute::getExtent(kir::IterDomain* id) { } } -bool IndexCompute::hasZeroMerged(kir::IterDomain* id) const { +bool IndexCompute::hasZeroMerged(IterDomain* id) const { return zero_merged_in_.find(id) != zero_merged_in_.end() || isZero(id); } -bool IndexCompute::isZero(kir::IterDomain* id) const { +bool IndexCompute::isZero(IterDomain* id) const { return zero_domains_.find(id) != zero_domains_.end(); } @@ -807,22 +768,17 @@ IndexCompute IndexCompute::updateIndexCompute( const TensorDomain* new_td, const std::unordered_map& id_map, const std::vector& root_contiguity, - const std::unordered_map& - reference_halo_extent_map) { + const std::unordered_map& reference_halo_extent_map) { FUSER_PERF_SCOPE("GpuLower::Lower::updateIndexCompute"); - const auto gpu_lower = GpuLower::current(); - - std::unordered_map updated_index_map; - std::unordered_map updated_extent_map; - std::unordered_set updated_zero_domains; - std::unordered_set updated_zero_merged_in; + std::unordered_map updated_index_map; + std::unordered_map updated_extent_map; + std::unordered_set updated_zero_domains; + std::unordered_set updated_zero_merged_in; for (auto id_entry : id_map) { - kir::IterDomain* prev_id = - gpu_lower->lowerValue(id_entry.first)->as(); - kir::IterDomain* new_id = - gpu_lower->lowerValue(id_entry.second)->as(); + IterDomain* prev_id = id_entry.first; + IterDomain* new_id = id_entry.second; if (index_map_.find(prev_id) != index_map_.end()) { updated_index_map[new_id] = index_map_.at(prev_id); @@ -859,8 +815,8 @@ class UpdateLeafIndices : public IterVisitor { public: UpdateLeafIndices( const TensorDomain* td, - std::unordered_map initial_index_map, - std::unordered_map extent_map) + std::unordered_map initial_index_map, + std::unordered_map extent_map) : td_(td), index_map_(std::move(initial_index_map)), extent_map_(std::move(extent_map)) { @@ -870,11 +826,11 @@ class UpdateLeafIndices : public IterVisitor { traverseFrom(td_->fusion(), domain_vals, false); } - const std::unordered_map& indexMap() const { + const std::unordered_map& indexMap() const { return index_map_; } - const std::unordered_map& extentMap() const { + const std::unordered_map& extentMap() const { return extent_map_; } @@ -882,13 +838,9 @@ class UpdateLeafIndices : public IterVisitor { using IterVisitor::handle; void handle(Split* split) override { - const auto gpu_lower = GpuLower::current(); - - auto in_id = gpu_lower->lowerValue(split->in())->as(); - auto outer_id = - gpu_lower->lowerValue(split->outer())->as(); - auto inner_id = - gpu_lower->lowerValue(split->inner())->as(); + auto in_id = split->in(); + auto outer_id = split->outer(); + auto inner_id = split->inner(); // Nothing need to be done when mappings for the output axes // already exist. 
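For reference, the index maps manipulated above and in the following hunk are built from ordinary divide/modulo split arithmetic. A minimal standalone sketch with plain integers (the extents and names below are illustrative, not taken from the patch):

#include <cassert>

int main() {
  const int in_extent = 10; // illustrative extent of the domain being split
  const int factor = 4;     // illustrative split factor
  // ceilDiv(in_extent, factor): extent of the outer domain after the split.
  const int outer_extent = (in_extent + factor - 1) / factor;

  for (int in = 0; in < in_extent; ++in) {
    const int outer = in / factor; // divExpr
    const int inner = in % factor; // modExpr
    assert(outer < outer_extent);
    // Recombining, as IndexCompute::handle(Split*) does symbolically:
    // in = outer * inner_extent + inner (inner_extent == factor here).
    assert(outer * factor + inner == in);
  }
  return 0;
}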
@@ -899,22 +851,17 @@ class UpdateLeafIndices : public IterVisitor { return; } - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto factor = gpu_lower->lowerValue(split->factor()); - index_map_[inner_id] = ir_builder.modExpr(index_map_[in_id], factor); + auto factor = split->factor(); + index_map_[inner_id] = IrBuilder::modExpr(index_map_[in_id], factor); extent_map_[inner_id] = factor; - index_map_[outer_id] = ir_builder.divExpr(index_map_[in_id], factor); - extent_map_[outer_id] = ir_builder.ceilDivExpr(getExtent(in_id), factor); + index_map_[outer_id] = IrBuilder::divExpr(index_map_[in_id], factor); + extent_map_[outer_id] = IrBuilder::ceilDivExpr(getExtent(in_id), factor); } void handle(Merge* merge) override { - const auto gpu_lower = GpuLower::current(); - - auto out_id = gpu_lower->lowerValue(merge->out())->as(); - auto outer_id = - gpu_lower->lowerValue(merge->outer())->as(); - auto inner_id = - gpu_lower->lowerValue(merge->inner())->as(); + auto out_id = merge->out(); + auto outer_id = merge->outer(); + auto inner_id = merge->inner(); // Nothing need to be done when mappings for the output axes // already exist. @@ -927,17 +874,16 @@ class UpdateLeafIndices : public IterVisitor { TORCH_INTERNAL_ASSERT( index_map_.find(inner_id) != index_map_.end(), "Inner ID not found"); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - index_map_[out_id] = ir_builder.mulExpr( + index_map_[out_id] = IrBuilder::mulExpr( index_map_[inner_id], - ir_builder.mulExpr(index_map_[outer_id], getExtent(inner_id))); + IrBuilder::mulExpr(index_map_[outer_id], getExtent(inner_id))); extent_map_[out_id] = - ir_builder.mulExpr(getExtent(outer_id), getExtent(inner_id)); + IrBuilder::mulExpr(getExtent(outer_id), getExtent(inner_id)); } // return extent_map_[id] if exists, else return id->extent() - kir::Val* getExtent(kir::IterDomain* id) { + Val* getExtent(IterDomain* id) { if (extent_map_.find(id) != extent_map_.end()) { return extent_map_.at(id); } else { @@ -947,25 +893,21 @@ class UpdateLeafIndices : public IterVisitor { private: const TensorDomain* td_; - std::unordered_map index_map_; - std::unordered_map extent_map_; + std::unordered_map index_map_; + std::unordered_map extent_map_; }; // Returns halo-extended extent if id has halo. Otherwise, just // returns id->extent. 
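// For example (illustrative numbers): a root axis of extent N with a halo of
// width 1 on each side has a total halo width of 2, so the extent used for
// indexing becomes N + 2; axes without halo keep their original extent.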
-kir::Val* getHaloExtentOfRootAxis( - IterDomain* id, - kir::Val* normal_extent = nullptr) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - +Val* getHaloExtentOfRootAxis(IterDomain* id, Val* normal_extent = nullptr) { if (normal_extent == nullptr) { - normal_extent = gpu_lower->lowerValue(id->extent()); + normal_extent = id->extent(); } - const auto& halo = gpu_lower->haloInfo().getRootAxisInfo(id); + const auto& halo = GpuLower::current()->haloInfo().getRootAxisInfo(id); if (halo.hasHalo()) { - auto halo_extent = ir_builder.addExpr(normal_extent, halo.width()); + auto halo_extent = + IrBuilder::addExpr(normal_extent, IrBuilder::create(halo.width())); return halo_extent; } else { return normal_extent; @@ -976,10 +918,10 @@ kir::Val* getHaloExtentOfRootAxis( IndexSwizzle::IndexSwizzle( const TensorView* tv, - std::unordered_map initial_index_map, - std::unordered_map extent_map, - std::unordered_set zero_domains, - std::unordered_set zero_merged_in) + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in) : IndexCompute( tv->domain(), std::move(initial_index_map), @@ -996,8 +938,6 @@ void IndexSwizzle::run() { swizzle_type_ == SwizzleType::NoSwizzle || swizzle_type_ == SwizzleType::Transpose, "Invalid swizzle type"); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); if (swizzle_type_ == SwizzleType::Transpose) { // Shifts the second axis by the first axis as ((idx_1 + idx_2) % // ext). Alternatively, ((idx_1 - idx_2) & (ext - 1)) would also @@ -1013,20 +953,16 @@ void IndexSwizzle::run() { IterDomain* id_to_swizzle_i = ids_to_swizzle_.at(0); IterDomain* id_to_swizzle_j = ids_to_swizzle_.at(1); - kir::IterDomain* id_to_swizzle_i_kir = - gpu_lower->lowerValue(id_to_swizzle_i)->as(); - kir::IterDomain* id_to_swizzle_j_kir = - gpu_lower->lowerValue(id_to_swizzle_j)->as(); - if (indexMap().find(id_to_swizzle_i_kir) != indexMap().end() && - indexMap().find(id_to_swizzle_j_kir) != indexMap().end()) { - auto idx_to_swizzle_i = indexMap().at(id_to_swizzle_i_kir); - auto idx_to_swizzle_j = indexMap().at(id_to_swizzle_j_kir); + if (indexMap().find(id_to_swizzle_i) != indexMap().end() && + indexMap().find(id_to_swizzle_j) != indexMap().end()) { + auto idx_to_swizzle_i = indexMap().at(id_to_swizzle_i); + auto idx_to_swizzle_j = indexMap().at(id_to_swizzle_j); - auto swizzled_idx = ir_builder.modExpr( - ir_builder.addExpr(idx_to_swizzle_i, idx_to_swizzle_j), - id_to_swizzle_j_kir->extent()); - index_map_[id_to_swizzle_j_kir] = swizzled_idx; + auto swizzled_idx = IrBuilder::modExpr( + IrBuilder::addExpr(idx_to_swizzle_i, idx_to_swizzle_j), + id_to_swizzle_j->extent()); + index_map_[id_to_swizzle_j] = swizzled_idx; swizzled_ids_.insert(id_to_swizzle_j); IndexCompute::run(); } @@ -1055,17 +991,15 @@ namespace { // to loop indices as well as a set of loops that do not contribute to // indexing. 
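// Roughly: loops whose indices cannot affect the tensor's address within its
// allocation (for example, loops outside the allocation point of a Local or
// Shared tensor, or parallel loops already folded into the memory scope) map
// to a zero index and are recorded in the returned zero-loop set; all other
// loops map to loop->index(). When double_buffer_loop is given, its index is
// advanced by one, matching the one-iteration-ahead prefetch used for double
// buffering.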
std::pair< - std::unordered_map, + std::unordered_map, std::unordered_set> indexMapFromTV( const TensorView* tv, const std::vector& loops, - const std::pair& alloc_point, - bool as_consumer) { + kir::ForLoop* alloc_loop, + bool as_consumer, + kir::ForLoop* double_buffer_loop = nullptr) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - - auto alloc_loop = alloc_point.first; bool within_alloc = false; if (alloc_loop == nullptr) { @@ -1076,7 +1010,7 @@ indexMapFromTV( const bool is_shared = tv->getMemoryType() == MemoryType::Shared; const bool is_local = tv->getMemoryType() == MemoryType::Local; - std::unordered_map loop_to_ind_map; + std::unordered_map loop_to_ind_map; // When indexed as a producer, the parallel types of the the // producer domains may not be the same as those of the loops, but @@ -1085,17 +1019,16 @@ indexMapFromTV( // with zero isn't valid. That's only valid when there's a matching // IterDomain in the producer tensor that has the same parallel // type. - auto find_matching_parallel_domain = [tv](kir::IterDomain* id) -> bool { + auto find_matching_parallel_domain = [tv](IterDomain* id) -> bool { const auto gpu_lower = GpuLower::current(); auto it = std::find_if( tv->domain()->domain().begin(), tv->domain()->domain().end(), [&](IterDomain* tv_id) { - auto kir_tv_id = gpu_lower->lowerValue(tv_id)->as(); // Matching is done using the index and loop maps. See // validateParallelize as well. - return gpu_lower->caIndexMap().areMapped(id, kir_tv_id) || - (gpu_lower->caLoopMap().areMapped(id, kir_tv_id) && + return gpu_lower->caIndexMap().areMapped(id, tv_id) || + (gpu_lower->caLoopMap().areMapped(id, tv_id) && ir_utils::derivedFromRootCAAxes(tv, tv_id)); }); if (it == tv->domain()->domain().end()) { @@ -1103,7 +1036,7 @@ indexMapFromTV( } auto corresponding_domain = *it; - return corresponding_domain->getParallelType() == id->parallelType(); + return corresponding_domain->getParallelType() == id->getParallelType(); }; // Track domains that do not contibute to the resulting @@ -1113,7 +1046,7 @@ indexMapFromTV( std::unordered_set zero_loops; for (auto loop : loops) { - kir::Val* idx = nullptr; + Val* idx = nullptr; const auto same_parallel_type = as_consumer || find_matching_parallel_domain(loop->iter_domain()); // See also LoopNestGenerator::pushAlloc. 
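The double-buffer indexing added in this change alternates between two buffer stages: the producer path further below appends (i % 2) * alloc_size to the strided index, while the consumer path appends (1 - (i % 2)) * alloc_size. A minimal standalone sketch of that ping-pong, with an illustrative allocation size:

#include <cassert>

int main() {
  const int alloc_size = 256; // illustrative single-stage allocation size
  for (int i = 0; i < 8; ++i) {
    const int producer_offset = (i % 2) * alloc_size;       // producer path
    const int consumer_offset = (1 - (i % 2)) * alloc_size; // consumer path
    // The two offsets select different halves of the doubled allocation and
    // swap roles every iteration of the double-buffer loop.
    assert(producer_offset != consumer_offset);
  }
  return 0;
}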
@@ -1123,7 +1056,7 @@ indexMapFromTV( (loop->iter_domain()->isThread() && is_global)) { idx = loop->index(); } else { - idx = ir_builder.zeroVal(); + idx = GpuLower::current()->kernel()->zeroVal(); zero_loops.insert(loop); } } else if ( @@ -1145,7 +1078,7 @@ indexMapFromTV( // parallel type (loop->iter_domain()->isThread() && is_local && same_parallel_type) || loop->vectorize()) { - idx = ir_builder.zeroVal(); + idx = GpuLower::current()->kernel()->zeroVal(); if (!loop->vectorize()) { zero_loops.insert(loop); } @@ -1153,6 +1086,10 @@ indexMapFromTV( idx = loop->index(); } + if (loop == double_buffer_loop) { + idx = IrBuilder::addExpr(idx, GpuLower::current()->kernel()->oneVal()); + } + loop_to_ind_map[loop] = idx; if (!within_alloc && loop == alloc_loop) { @@ -1184,8 +1121,6 @@ void ensureStaticIndexing( within_alloc = true; } - const auto gpu_lower = GpuLower::current(); - for (auto loop : loops) { if (!within_alloc) { if (loop == alloc_loop) { @@ -1193,7 +1128,7 @@ void ensureStaticIndexing( } continue; } - kir::IterDomain* loop_id = loop->iter_domain(); + IterDomain* loop_id = loop->iter_domain(); if (loop->vectorize() || loop_id->isThread()) { continue; } @@ -1203,7 +1138,7 @@ void ensureStaticIndexing( auto it = std::find_if( tv->domain()->domain().begin(), tv->domain()->domain().end(), - [loop_id, gpu_lower, &id_map](IterDomain* id) { + [loop_id, &id_map](IterDomain* id) { if (id->isBroadcast() || id->isReduction() || id->isStride()) { return false; } @@ -1211,8 +1146,7 @@ void ensureStaticIndexing( if (id_replacement != id_map.end()) { id = id_replacement->second; } - auto kir_id = gpu_lower->lowerValue(id)->as(); - return gpu_lower->caLoopMap().areMapped(loop_id, kir_id); + return GpuLower::current()->caLoopMap().areMapped(loop_id, id); }); if (it != tv->domain()->domain().end()) { loop->requireUnroll(); @@ -1260,13 +1194,12 @@ std::unordered_map indexMapReferenceTo( } // namespace -std::vector Index::getGlobalProducerStridedIndices( +std::vector Index::getGlobalProducerStridedIndices( TensorView* producer_tv, const TensorView* consumer_tv, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::getGlobalProducerIndex"); const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure auto reference = IndexReferenceReplay::getReference(loops); @@ -1311,9 +1244,12 @@ std::vector Index::getGlobalProducerStridedIndices( } } + kir::ForLoop* db_loop = gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + consumer_tv, loops, true); + // Index into the reference tensor. 
Reference indexing will handle vectorized // dims where index should be set to 0 - auto ref_compute = getReferenceIndexing(loops, reference_domain); + auto ref_compute = getReferenceIndexing(loops, reference_domain, db_loop); // Forward vectorized IDs to index into producer correctly // We want p_id to be vectorized like consumer just for the indexing, then we @@ -1355,25 +1291,24 @@ std::vector Index::getGlobalProducerStridedIndices( auto root_dom = producer_tv->getMaybeRFactorDomain(); // TODO: Abstract stride logic to reuse with consumer indexing - auto zero = ir_builder.create(0); - std::vector strides(root_dom.size(), nullptr); + std::vector strides(root_dom.size(), nullptr); { int stride_i = 0; for (const auto i : c10::irange(root_dom.size())) { if (root_dom[i]->isReduction() || root_dom[i]->getIterType() == IterType::BroadcastWithoutStride) { - strides[i] = zero; + strides[i] = GpuLower::current()->kernel()->oneVal(); continue; } std::stringstream ss; ss << "T" << producer_tv->name() << ".stride[" << stride_i++ << "]"; - strides[i] = ir_builder.create(ss.str(), DataType::Int); + strides[i] = IrBuilder::create(ss.str(), DataType::Int); } } TORCH_INTERNAL_ASSERT( root_dom.size() == producer_tv->domain()->contiguity().size()); - kir::Val* cur_contig_stride = ir_builder.create(1); + Val* cur_contig_stride = GpuLower::current()->kernel()->oneVal(); for (const auto i : c10::irange(root_dom.size())) { auto dim = root_dom.size() - i - 1; if (root_dom[dim]->isReduction()) { @@ -1383,14 +1318,12 @@ std::vector Index::getGlobalProducerStridedIndices( continue; } - kir::Val* root_ind = nullptr; - auto kir_root_dom = - gpu_lower->lowerValue(root_dom[dim])->as(); - if (producer_indexing.indexMap().find(kir_root_dom) != + Val* root_ind = nullptr; + if (producer_indexing.indexMap().find(root_dom[dim]) != producer_indexing.indexMap().end()) { - root_ind = producer_indexing.indexMap().at(kir_root_dom); + root_ind = producer_indexing.indexMap().at(root_dom[dim]); } else if (root_dom[dim]->getIterType() == IterType::BroadcastWithStride) { - root_ind = zero; + root_ind = GpuLower::current()->kernel()->zeroVal(); } TORCH_INTERNAL_ASSERT( @@ -1410,12 +1343,12 @@ std::vector Index::getGlobalProducerStridedIndices( // by extent of this dimension auto root_dim_extent = getHaloExtentOfRootAxis(root_dom[dim]); cur_contig_stride = - ir_builder.mulExpr(cur_contig_stride, root_dim_extent); + IrBuilder::mulExpr(cur_contig_stride, root_dim_extent); } else { // If non contiguous dimension, keep local stride information, set cur // stride to local stride * local raw extent auto root_dim_extent = getHaloExtentOfRootAxis(root_dom[dim]); - cur_contig_stride = ir_builder.mulExpr(strides[dim], root_dim_extent); + cur_contig_stride = IrBuilder::mulExpr(strides[dim], root_dim_extent); } } @@ -1423,7 +1356,8 @@ std::vector Index::getGlobalProducerStridedIndices( loops.empty() ? nullptr : loops.back()->vectorize_shift(); // Global striding - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { // If the domain is derived from a trivial reduction, no indexing // to create. 
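The stride computation above collapses contiguous root dimensions into a single running stride, while non-contiguous dimensions keep their runtime T.stride[i] values. A minimal integer sketch of that accumulation (extents, strides, and contiguity flags are illustrative; reduction, broadcast, and halo handling are omitted):

#include <cassert>
#include <vector>

int main() {
  // Illustrative rank-3 root domain, innermost dimension last.
  const std::vector<int> extent = {4, 8, 16};
  const std::vector<bool> contig = {true, false, true};
  // Stand-ins for the runtime T.stride[i] values of the tensor.
  const std::vector<int> runtime_stride = {128, 16, 1};

  std::vector<int> stride(3, 0);
  int cur_contig_stride = 1;
  for (int dim = 2; dim >= 0; --dim) {
    if (contig[dim]) {
      // Contiguous: use the accumulated stride and fold this extent in.
      stride[dim] = cur_contig_stride;
      cur_contig_stride *= extent[dim];
    } else {
      // Non-contiguous: fall back to the runtime stride.
      stride[dim] = runtime_stride[dim];
      cur_contig_stride = runtime_stride[dim] * extent[dim];
    }
  }
  assert(stride[2] == 1 && stride[1] == 16 && stride[0] == 128);
  return 0;
}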
@@ -1434,20 +1368,17 @@ std::vector Index::getGlobalProducerStridedIndices( continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - producer_indexing.indexMap().find(kir_root_dom_i) != + producer_indexing.indexMap().find(root_dom[i]) != producer_indexing.indexMap().end(), "Couldn't find root mapping for TV", producer_tv->name(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString()); - auto root_ind = producer_indexing.indexMap().at(kir_root_dom_i); + auto root_ind = producer_indexing.indexMap().at(root_dom[i]); root_ind = getProducerIndexWithHalo(producer_tv, i, root_ind, consumer_tv); @@ -1465,9 +1396,9 @@ std::vector Index::getGlobalProducerStridedIndices( if (root_ind->isZeroInt()) { continue; } else { - auto strided_ind = ir_builder.mulExpr(root_ind, strides[i]); + auto strided_ind = IrBuilder::mulExpr(root_ind, strides[i]); if (i == root_dom.size() - 1 && vectorize_shift != nullptr) { - strided_inds[i] = ir_builder.addExpr(strided_ind, vectorize_shift); + strided_inds[i] = IrBuilder::addExpr(strided_ind, vectorize_shift); } else { strided_inds[i] = strided_ind; } @@ -1478,12 +1409,11 @@ std::vector Index::getGlobalProducerStridedIndices( } // Producer index for either shared or local memory -std::vector Index::getNonGlobalProducerStridedIndices( +std::vector Index::getNonGlobalProducerStridedIndices( TensorView* producer_tv, const TensorView* consumer_tv, const std::vector& loops) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure auto reference = IndexReferenceReplay::getReference(loops); @@ -1526,31 +1456,35 @@ std::vector Index::getNonGlobalProducerStridedIndices( } } + kir::ForLoop* consumer_db_loop = + gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + consumer_tv, loops, true); + // Find allocation point of producer relative to loop nests. P2C map is // required because producer was replayed as consumer, so we can't use the // regular compute at maps to line up its iter domains with the for loops. - auto alloc_point = - loop_utils::getAllocPoint(producer_tv, loops, p2c_alloc_map, true); - std::unordered_map loop_to_ind_map; + auto alloc_info = + loop_utils::getAllocInformation(producer_tv, loops, p2c_alloc_map, true); + std::unordered_map loop_to_ind_map; std::unordered_set zero_loops; - std::tie(loop_to_ind_map, zero_loops) = - indexMapFromTV(producer_tv, loops, alloc_point, false); + std::tie(loop_to_ind_map, zero_loops) = indexMapFromTV( + producer_tv, loops, alloc_info.init_for_loop, false, consumer_db_loop); - ensureStaticIndexing(producer_tv, alloc_point.first, loops, p2c_alloc_map); + ensureStaticIndexing( + producer_tv, alloc_info.init_for_loop, loops, p2c_alloc_map); // Map loop nests to indicies, zeroing out those not used due to locality of // memory - std::unordered_map ref_id_to_ind_map; + std::unordered_map ref_id_to_ind_map; // Track which domains are not used - std::unordered_set ref_zero_domains; + std::unordered_set ref_zero_domains; // Due to rfactor/initialization reference_domain may be bigger than loop nest // structure, ignore IterDomains that aren't present in the loop nest when // indexing reference. 
TORCH_INTERNAL_ASSERT(loops.size() <= reference_domain->nDims()); for (const auto loop_i : c10::irange(loops.size())) { - auto ref_axis = gpu_lower->lowerValue(reference_domain->axis(loop_i)) - ->as(); + auto ref_axis = reference_domain->axis(loop_i); ref_id_to_ind_map[ref_axis] = loop_to_ind_map[loops[loop_i]]; if (zero_loops.count(loops[loop_i]) > 0) { ref_zero_domains.insert(ref_axis); @@ -1677,8 +1611,7 @@ std::vector Index::getNonGlobalProducerStridedIndices( } // Already an entry for this root domain, continue - if (index_map.find(gpu_lower->lowerValue(root_id)->as()) != - index_map.end()) { + if (index_map.find(root_id) != index_map.end()) { continue; } @@ -1690,25 +1623,23 @@ std::vector Index::getNonGlobalProducerStridedIndices( } } - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { if (skip_indexing.count(root_dom[i])) { continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_i) != index_map.end(), + index_map.find(root_dom[i]) != index_map.end(), "Couldn't find root mapping for TV", producer_tv->name(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString()); - auto root_ind_i = index_map.at(kir_root_dom_i); + auto root_ind_i = index_map.at(root_dom[i]); root_ind_i = getProducerIndexWithHalo(producer_tv, i, root_ind_i, consumer_tv); @@ -1729,17 +1660,14 @@ std::vector Index::getNonGlobalProducerStridedIndices( } // Compute striding for this index. - kir::Val* stride = nullptr; + Val* stride = nullptr; for (const auto j : c10::irange(i + 1, root_dom.size())) { if (skip_indexing.count(root_dom[j])) { continue; } - auto kir_root_dom_j = - gpu_lower->lowerValue(root_dom[j])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_j) != index_map.end(), + index_map.find(root_dom[j]) != index_map.end(), "Couldn't find root mapping for TV", consumer_tv->name(), " dim: ", @@ -1747,37 +1675,49 @@ std::vector Index::getNonGlobalProducerStridedIndices( " id: ", root_dom[i]); - auto root_ext_j = extent_map.find(kir_root_dom_j) == extent_map.end() - ? kir_root_dom_j->extent() - : extent_map.at(kir_root_dom_j); + auto root_ext_j = extent_map.find(root_dom[j]) == extent_map.end() + ? 
root_dom[j]->extent() + : extent_map.at(root_dom[j]); root_ext_j = getHaloExtentOfRootAxis(root_dom[j], root_ext_j); - if (zero_domain_map.count(kir_root_dom_j) == 0) { + if (zero_domain_map.count(root_dom[j]) == 0) { if (stride == nullptr) { stride = root_ext_j; } else { - stride = ir_builder.mulExpr(stride, root_ext_j); + stride = IrBuilder::mulExpr(stride, root_ext_j); } } } if (stride != nullptr) { - strided_inds[i] = ir_builder.mulExpr(root_ind_i, stride); + strided_inds[i] = IrBuilder::mulExpr(root_ind_i, stride); } else { strided_inds[i] = root_ind_i; } } + if (producer_tv->isDoubleBuffered()) { + auto db_loop = gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + producer_tv, loops, true); + if (db_loop != nullptr) { + auto db_switch_index = + IrBuilder::modExpr(db_loop->index(), IrBuilder::create(2)); + auto original_alloc_size = + gpu_lower->doubleBufferInfo().getOriginalAllocSize(producer_tv); + auto db_strided_index = + IrBuilder::mulExpr(db_switch_index, original_alloc_size); + strided_inds.push_back(db_strided_index); + } + } return strided_inds; } -std::vector Index::getGlobalConsumerStridedIndices( +std::vector Index::getGlobalConsumerStridedIndices( const TensorView* consumer_tv, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::getGlobalConsumerIndex"); const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure auto reference = IndexReferenceReplay::getReference(loops); @@ -1813,26 +1753,27 @@ std::vector Index::getGlobalConsumerStridedIndices( auto root_dom = consumer_tv->getMaybeRFactorDomain(); // TODO: Abstract stride logic to reuse with producer indexing - auto zero = ir_builder.zeroVal(); - std::vector strides(root_dom.size(), zero); + std::vector strides( + root_dom.size(), GpuLower::current()->kernel()->oneVal()); { int stride_i = 0; for (const auto i : c10::irange(root_dom.size())) { if (root_dom[i]->isReduction() || root_dom[i]->getIterType() == IterType::BroadcastWithoutStride || root_dom[i]->isStride()) { - strides[i] = zero; + strides[i] = GpuLower::current()->kernel()->oneVal(); continue; } std::stringstream ss; ss << "T" << consumer_tv->name() << ".stride[" << stride_i++ << "]"; - strides[i] = ir_builder.create(ss.str(), DataType::Int); + strides[i] = + SimplifyingIrBuilder::create(ss.str(), DataType::Int); } } TORCH_INTERNAL_ASSERT( root_dom.size() == consumer_tv->domain()->contiguity().size()); - kir::Val* cur_contig_stride = ir_builder.oneVal(); + Val* cur_contig_stride = GpuLower::current()->kernel()->oneVal(); for (const auto i : c10::irange(root_dom.size())) { auto dim = root_dom.size() - i - 1; if (root_dom[dim]->isReduction() || root_dom[dim]->isStride()) { @@ -1842,14 +1783,12 @@ std::vector Index::getGlobalConsumerStridedIndices( continue; } - kir::Val* root_ind = nullptr; - auto kir_root_dom = - gpu_lower->lowerValue(root_dom[dim])->as(); - if (consumer_indexing.indexMap().find(kir_root_dom) != + Val* root_ind = nullptr; + if (consumer_indexing.indexMap().find(root_dom[dim]) != consumer_indexing.indexMap().end()) { - root_ind = consumer_indexing.indexMap().at(kir_root_dom); + root_ind = consumer_indexing.indexMap().at(root_dom[dim]); } else if (root_dom[dim]->getIterType() == IterType::BroadcastWithStride) { - root_ind = zero; + root_ind = GpuLower::current()->kernel()->zeroVal(); } TORCH_INTERNAL_ASSERT( @@ -1869,11 +1808,11 @@ std::vector Index::getGlobalConsumerStridedIndices( // by extent of this dimension auto 
root_dim_extent = getHaloExtentOfRootAxis(root_dom[dim]); cur_contig_stride = - ir_builder.mulExpr(cur_contig_stride, root_dim_extent); + SimplifyingIrBuilder::mulExpr(cur_contig_stride, root_dim_extent); } else { // If non contiguous dimension, keep local stride information, set cur // stride to local stride * local raw extent - cur_contig_stride = ir_builder.mulExpr( + cur_contig_stride = SimplifyingIrBuilder::mulExpr( strides[dim], getHaloExtentOfRootAxis(root_dom[dim])); } } @@ -1882,7 +1821,8 @@ std::vector Index::getGlobalConsumerStridedIndices( loops.empty() ? nullptr : loops.back()->vectorize_shift(); // Global striding - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { // See a comment in indexing to root domains in getGlobalProducerIndex. if (root_dom[i]->isReduction() || @@ -1893,71 +1833,70 @@ std::vector Index::getGlobalConsumerStridedIndices( continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - consumer_indexing.indexMap().find(kir_root_dom_i) != + consumer_indexing.indexMap().find(root_dom[i]) != consumer_indexing.indexMap().end(), "Couldn't find root mapping for TV", consumer_tv->name(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString()); - auto root_ind = consumer_indexing.indexMap().at(kir_root_dom_i); + auto root_ind = consumer_indexing.indexMap().at(root_dom[i]); - root_ind = ir_builder.addExpr( - root_ind, getGlobalConsumerOffsetWithPartialSplit(kir_root_dom_i)); + root_ind = SimplifyingIrBuilder::addExpr( + root_ind, getGlobalConsumerOffsetWithPartialSplit(root_dom[i])); if (root_ind->isZeroInt()) { continue; } else { - auto strided_ind = ir_builder.mulExpr(root_ind, strides[i]); + auto strided_ind = SimplifyingIrBuilder::mulExpr(root_ind, strides[i]); if (i == root_dom.size() - 1 && vectorize_shift != nullptr) { - strided_inds[i] = ir_builder.addExpr(strided_ind, vectorize_shift); + strided_inds[i] = + SimplifyingIrBuilder::addExpr(strided_ind, vectorize_shift); } else { strided_inds[i] = strided_ind; } } } + TORCH_INTERNAL_ASSERT( + strided_inds.size() == consumer_tv->getMaybeRFactorDomain().size()); + return strided_inds; } // Consumer index for either shared or local memory -std::vector Index::getNonGlobalConsumerStridedIndices( +std::vector Index::getNonGlobalConsumerStridedIndices( const TensorView* consumer_tv, const std::vector& loops) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // Get a reference tensor replayed as existing loop structure auto reference = IndexReferenceReplay::getReference(loops); auto reference_domain = reference.domain; auto reference_id_map = reference.concrete_to_id; - auto alloc_point = loop_utils::getAllocPoint(consumer_tv, loops); - std::unordered_map loop_to_ind_map; + auto alloc_info = loop_utils::getAllocInformation(consumer_tv, loops); + std::unordered_map loop_to_ind_map; std::unordered_set zero_loops; std::tie(loop_to_ind_map, zero_loops) = - indexMapFromTV(consumer_tv, loops, alloc_point, true); + indexMapFromTV(consumer_tv, loops, alloc_info.init_for_loop, true); - ensureStaticIndexing(consumer_tv, alloc_point.first, loops); + ensureStaticIndexing(consumer_tv, alloc_info.init_for_loop, loops); // Map loop nests to indicies, zeroing out those not used due to locality of // memory - std::unordered_map ref_id_to_ind_map; - 
std::unordered_set ref_zero_domains; + std::unordered_map ref_id_to_ind_map; + std::unordered_set ref_zero_domains; // Due to rfactor/initialization reference_domain may be bigger than loop nest // structure, ignore IterDomains that aren't present in the loop nest when // indexing reference. TORCH_INTERNAL_ASSERT(loops.size() <= reference_domain->nDims()); for (const auto loop_i : c10::irange(loops.size())) { - auto ref_axis = gpu_lower->lowerValue(reference_domain->axis(loop_i)) - ->as(); + auto ref_axis = reference_domain->axis(loop_i); ref_id_to_ind_map[ref_axis] = loop_to_ind_map[loops[loop_i]]; if (zero_loops.count(loops[loop_i]) > 0) { ref_zero_domains.insert(ref_axis); @@ -2022,7 +1961,8 @@ std::vector Index::getNonGlobalConsumerStridedIndices( // Indices should now be mapped onto IterDomains in consumer, so just grab // and use them. auto root_dom = consumer_tv->getMaybeRFactorDomain(); - std::vector strided_inds(root_dom.size(), ir_builder.zeroVal()); + std::vector strided_inds( + root_dom.size(), GpuLower::current()->kernel()->zeroVal()); for (const auto i : c10::irange(root_dom.size())) { if (root_dom[i]->isReduction() || root_dom[i]->isBroadcast() || gpu_lower->trivialReductionInfo().isDerived(root_dom[i]) || @@ -2030,25 +1970,22 @@ std::vector Index::getNonGlobalConsumerStridedIndices( continue; } - auto kir_root_dom_i = - gpu_lower->lowerValue(root_dom[i])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_i) != index_map.end(), + index_map.find(root_dom[i]) != index_map.end(), "Couldn't find root mapping for TV", consumer_tv->name(), " dim: ", i, " id: ", - kir::toString(kir_root_dom_i)); + root_dom[i]->toString()); - const auto root_ind_i = index_map.at(kir_root_dom_i); + const auto root_ind_i = index_map.at(root_dom[i]); if (root_ind_i->isZeroInt()) { continue; } // Compute striding for this index. - kir::Val* stride = nullptr; + Val* stride = nullptr; for (const auto j : c10::irange(i + 1, root_dom.size())) { if (root_dom[j]->isBroadcast() || root_dom[j]->isReduction() || gpu_lower->trivialReductionInfo().isDerived(root_dom[j]) || @@ -2056,11 +1993,8 @@ std::vector Index::getNonGlobalConsumerStridedIndices( continue; } - auto kir_root_dom_j = - gpu_lower->lowerValue(root_dom[j])->as(); - TORCH_INTERNAL_ASSERT( - index_map.find(kir_root_dom_j) != index_map.end(), + index_map.find(root_dom[j]) != index_map.end(), "Couldn't find root mapping for TV", consumer_tv->name(), " dim: ", @@ -2068,45 +2002,67 @@ std::vector Index::getNonGlobalConsumerStridedIndices( " id: ", root_dom[i]); - auto root_ext_j = extent_map.find(kir_root_dom_j) == extent_map.end() - ? kir_root_dom_j->extent() - : extent_map.at(kir_root_dom_j); + auto root_ext_j = extent_map.find(root_dom[j]) == extent_map.end() + ? root_dom[j]->extent() + : extent_map.at(root_dom[j]); root_ext_j = getHaloExtentOfRootAxis(root_dom[j], root_ext_j); - if (zero_domain_map.count(kir_root_dom_j) == 0) { + if (zero_domain_map.count(root_dom[j]) == 0) { if (stride == nullptr) { stride = root_ext_j; } else { - stride = ir_builder.mulExpr(stride, root_ext_j); + stride = IrBuilder::mulExpr(stride, root_ext_j); } } } if (stride != nullptr) { - strided_inds[i] = ir_builder.mulExpr(root_ind_i, stride); + strided_inds[i] = IrBuilder::mulExpr(root_ind_i, stride); } else { strided_inds[i] = root_ind_i; } } + // This check was originally done in getConsumerStridedIndices, but + // the number of strided index values depends on the loop where the + // consumer tensor is located. 
If it's double buffered and not in + // the prologue loop, strided_inds ends up having one more + // index, so it's just much simpler to check here before adding the + // additional index for double buffering. + TORCH_INTERNAL_ASSERT( + strided_inds.size() == consumer_tv->getMaybeRFactorDomain().size()); + + if (consumer_tv->isDoubleBuffered()) { + auto db_loop = gpu_lower->doubleBufferInfo().getDoubleBufferLoop( + consumer_tv, loops, true); + if (db_loop != nullptr) { + auto db_switch_index = IrBuilder::subExpr( + gpu_lower->kernel()->oneVal(), + IrBuilder::modExpr(db_loop->index(), IrBuilder::create(2))); + auto original_alloc_size = + gpu_lower->doubleBufferInfo().getOriginalAllocSize(consumer_tv); + auto db_strided_index = + IrBuilder::mulExpr(db_switch_index, original_alloc_size); + strided_inds.push_back(db_strided_index); + } + } + return strided_inds; } -std::vector Index::getProducerStridedIndices( +std::vector Index::getProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::Index::getProducerStridedIndices"); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - if (producer->domain()->noReductions().size() == 0) { - return std::vector( - producer->getMaybeRFactorDomain().size(), ir_builder.zeroVal()); + return std::vector( + producer->getMaybeRFactorDomain().size(), + GpuLower::current()->kernel()->zeroVal()); } - std::vector strided_indices; + std::vector strided_indices; if (producer->getMemoryType() == MemoryType::Global) { strided_indices = getGlobalProducerStridedIndices(producer, consumer, loops); @@ -2116,7 +2072,9 @@ std::vector Index::getProducerStridedIndices( } TORCH_INTERNAL_ASSERT( - strided_indices.size() == producer->getMaybeRFactorDomain().size()); + strided_indices.size() == + producer->getMaybeRFactorDomain().size() + + (producer->isDoubleBuffered() ? 
1 : 0)); return strided_indices; } @@ -2126,35 +2084,27 @@ kir::TensorIndex* Index::getProducerIndex( TensorView* producer, const TensorView* consumer, const std::vector& loops) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto strided_indices = getProducerStridedIndices(producer, consumer, loops); - return ir_builder.create(producer, strided_indices); + return IrBuilder::create(producer, strided_indices); } -std::vector Index::getConsumerStridedIndices( +std::vector Index::getConsumerStridedIndices( const TensorView* consumer, const std::vector& loops) { FUSER_PERF_SCOPE("GpuLower::Lower::Index::getConsumerStridedIndices"); - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - if (consumer->domain()->noReductions().size() == 0) { - return std::vector( - consumer->getMaybeRFactorDomain().size(), ir_builder.zeroVal()); + return std::vector( + consumer->getMaybeRFactorDomain().size(), + GpuLower::current()->kernel()->zeroVal()); } - std::vector strided_indices; + std::vector strided_indices; if (consumer->getMemoryType() == MemoryType::Global) { strided_indices = getGlobalConsumerStridedIndices(consumer, loops); } else { strided_indices = getNonGlobalConsumerStridedIndices(consumer, loops); } - TORCH_INTERNAL_ASSERT( - strided_indices.size() == consumer->getMaybeRFactorDomain().size()); - return strided_indices; } @@ -2162,11 +2112,8 @@ std::vector Index::getConsumerStridedIndices( kir::TensorIndex* Index::getConsumerIndex( const TensorView* consumer, const std::vector& loops) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto strided_indices = getConsumerStridedIndices(consumer, loops); - return ir_builder.create(consumer, strided_indices); + return IrBuilder::create(consumer, strided_indices); } namespace { @@ -2184,37 +2131,19 @@ struct PredicateDomainInfo { bool is_non_divisible_split = false; }; -// Find iteration domains in the history of reference comprised only of -// merge operations. Only return iteration domains that are subsequently fed -// into a split, or are in the provided domain. In other words, we don't want to -// return every IterDomain that's contiguous, just the one closest to the -// leaves. Predicates are not associated with physical memory so we can treat -// all of them as contiguous merges. +// Find iteration domains in the history of a consumer to predicate comprised +// only of merge operations. Only return iteration domains that are subsequently +// fed into a split, or are in the provided domain. In other words, we don't +// want to return every IterDomain that's contiguous, just the one closest to +// the leaves. Predicates are not associated with physical memory so we can +// treat all of them as contiguous merges. std::vector getPredicateContigIds( - const ReferenceTensor& reference, - TensorView* consumer_tv, - const std::unordered_map& ref_2_consumer) { + TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - std::vector reference_predicated_root_domain; - for (const auto consumer_root : consumer_tv->getRootDomain()) { - if (consumer_root->isBroadcast()) { - continue; - } - auto consumer_root_concrete = - gpu_lower->caIndexMap().getConcreteMappedID(consumer_root); - auto it = reference.concrete_to_id.find(consumer_root_concrete); - // When initializing a reduction buffer, the reduction axis - // doesn't have a loop, so the reference tensor doesn't have a - // mapped domain. 
The reduction axis can be safely ignored. - if (it == reference.concrete_to_id.end()) { - continue; - } - auto reference_root = it->second; - reference_predicated_root_domain.emplace_back(reference_root); - } + const auto& consumer_root_domain = consumer_tv->getRootDomain(); - std::vector contiguous_ids = reference_predicated_root_domain; + std::vector contiguous_ids = consumer_root_domain; if (contiguous_ids.empty()) { return std::vector(); @@ -2227,20 +2156,24 @@ std::vector getPredicateContigIds( // about halo to do correct predication, so they must be excluded. std::unordered_set excluded_ids; - for (auto reference_predicated_id : reference_predicated_root_domain) { - if (GpuLower::current() - ->haloInfo() - .getRootAxisInfo(reference_predicated_id) - .hasHalo()) { + for (auto consumer_root_id : consumer_root_domain) { + if (gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id).hasHalo()) { + excluded_ids.insert(consumer_root_id); continue; } - auto it = ref_2_consumer.find(reference_predicated_id); - if (it == ref_2_consumer.end()) { - continue; - } - auto consumer_root_id = it->second; if (consumer_root_id->maybePartial()) { - excluded_ids.insert(reference_predicated_id); + excluded_ids.insert(consumer_root_id); + continue; + } + // When consumer_root_id is a broadcast domain, do not allow contig + // predication as the merged output is not mapped with the + // reference unless the concrete domain is also a broadcast + // domain. + if (consumer_root_id->isBroadcast() && + !gpu_lower->caLoopMap() + .getConcreteMappedID(consumer_root_id) + ->isBroadcast()) { + excluded_ids.insert(consumer_root_id); continue; } // Shifted or gathered axes need to be predicated at the root domain @@ -2252,15 +2185,16 @@ std::vector getPredicateContigIds( auto consumer_root_pos = consumer_tv->domain()->rootPosOf(consumer_root_id); if ((shift_expr && shift_expr->offset(consumer_root_pos) != 0) || (gather_expr && consumer_root_pos < gather_expr->windowShape().size() && - !gather_expr->windowShape().at(consumer_root_pos)->isOneInt())) { - excluded_ids.insert(reference_predicated_id); + gather_expr->windowShape().at(consumer_root_pos) != 1)) { + excluded_ids.insert(consumer_root_id); } } // Run through iteration domain history - auto exprs = ExprSort::getExprs( + auto exprs = StmtSort::getExprs( consumer_tv->fusion(), - {reference.domain->domain().begin(), reference.domain->domain().end()}); + {consumer_tv->domain()->domain().begin(), + consumer_tv->domain()->domain().end()}); for (auto expr : exprs) { // If not a merge, output is not contiguous @@ -2296,8 +2230,7 @@ std::vector getPredicateContigIds( // reference_predicated_root_domain. 
auto contig_root_vals = IterVisitor::getInputsTo( {contig_id}, - {reference_predicated_root_domain.begin(), - reference_predicated_root_domain.end()}); + {consumer_root_domain.begin(), consumer_root_domain.end()}); auto contig_root_ids = ir_utils::filterByType(contig_root_vals); PredicateDomainInfo contig_id_info; contig_id_info.id = contig_id; @@ -2312,8 +2245,7 @@ IterDomain* getMappedReferenceDomain( IterDomain* id, const ReferenceTensor& reference) { // Partially overlaps with getPredicateContigIds() - const auto gpu_lower = GpuLower::current(); - auto concrete_id = gpu_lower->caIndexMap().getConcreteMappedID(id); + auto concrete_id = GpuLower::current()->caIndexMap().getConcreteMappedID(id); auto it = reference.concrete_to_id.find(concrete_id); if (it == reference.concrete_to_id.end()) { return nullptr; @@ -2321,9 +2253,8 @@ IterDomain* getMappedReferenceDomain( return it->second; } -std::vector getNonDivisibleReferenceDomainsToPredicate( - TensorView* consumer_tv, - const ReferenceTensor& reference) { +std::vector getNonDivisibleConsumerDomainsToPredicate( + TensorView* consumer_tv) { const auto& non_divisible_split_info = GpuLower::current()->nonDivisibleSplitInfo(); @@ -2337,11 +2268,7 @@ std::vector getNonDivisibleReferenceDomainsToPredicate( const auto& splits_to_predicate = it->second; for (auto split : splits_to_predicate) { - auto ref_id = getMappedReferenceDomain(split->in(), reference); - if (ref_id == nullptr) { - continue; - } - PredicateDomainInfo info{ref_id, {ref_id}, true}; + PredicateDomainInfo info{split->in(), {split->in()}, true}; pred_info_vec.emplace_back(info); } @@ -2352,9 +2279,8 @@ bool needsPadding(TensorView* tv) { auto shift_expr = dynamic_cast(tv->definition()); auto gather_expr = dynamic_cast(tv->definition()); - // Padding is only necessary for padded shift and - // gather - return (shift_expr != nullptr && shift_expr->pad()) || gather_expr != nullptr; + return (shift_expr != nullptr && shift_expr->hasPadding()) || + (gather_expr != nullptr && gather_expr->hasPadding()); } // Get an additional offset of a stop index when building a predicate @@ -2364,11 +2290,10 @@ bool needsPadding(TensorView* tv) { // compared with each other by just looking at the additional offsets. // // consumer_root_id: the domain for which a stop predicate is being built. -kir::Val* getUnswitchStopOffset( +int getUnswitchStopOffset( IterDomain* consumer_root_id, TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); AxisHaloInfo halo_info = gpu_lower->haloInfo().getRootAxisInfo(consumer_root_id); @@ -2376,7 +2301,7 @@ kir::Val* getUnswitchStopOffset( // If the consumer root domain to predicate does not have halo, no // adjustment is required. if (!halo_info.hasHalo()) { - return ir_builder.zeroVal(); + return 0; } // Find if this contig_id is used in the unswitched domains @@ -2400,22 +2325,14 @@ kir::Val* getUnswitchStopOffset( })) { return halo_info.width(); } else { - return ir_builder.zeroVal(); + return 0; } } -// Get offsets for the start and stop predicates. Similar to the -// gather case, but it's a little simpler as it does not (yet) -// dynamic shifting. 
-void adjustStartAndStopOffsetsForShift( - std::vector& start_offsets, - std::vector& stop_offsets, +std::pair getStartAndStopOffsetsForShift( TensorView* consumer_tv, IterDomain* consumer_id, bool padding_predicate) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - TORCH_INTERNAL_ASSERT(consumer_id != nullptr); auto shift_expr = dynamic_cast(consumer_tv->definition()); @@ -2423,105 +2340,124 @@ void adjustStartAndStopOffsetsForShift( // Adjustment is not necessary if not shift. // Even so, padding predicate does not need any adjustment. if (shift_expr == nullptr || padding_predicate) { - return; + return { + GpuLower::current()->kernel()->zeroVal(), + GpuLower::current()->kernel()->zeroVal()}; } const auto root_axis_pos = consumer_tv->domain()->rootPosOf(consumer_id); - // Assume this adjustment is done first, so start and stop offsets - // just contain zeroVal. - TORCH_INTERNAL_ASSERT( - start_offsets.size() == 1 && start_offsets[0]->isZeroInt() && - stop_offsets.size() == 1 && stop_offsets[0]->isZeroInt()); - start_offsets.clear(); - stop_offsets.clear(); + // The first or last N elements, where N is the padding width, + // correspond to the padding predicate. - // The consumer offset is zero. - auto consumer_offset = 0; - // The producer offset is based off the consumer offset. - auto producer_offset = 0; + const auto shift_offset = shift_expr->offset(root_axis_pos); + const auto pad_width = shift_expr->padWidth().at(root_axis_pos); - // When the shift operation is not padded, the start and stop positions of the - // consumer axis, i.e., consumer_id->start and - // consumer_id->stop_ofset, are adjusted accordingly, which includes - // the effect of the shift offset, so using the consumer offset is - // sufficient as the only predicate is sufficient. + int start_offset = 0; + int stop_offset = 0; - if (shift_expr->pad()) { - // Positive shift offset means shifting the input tensor to the - // positive direction, so the producer offset becomes negative. - auto shift_offset = shift_expr->offset(root_axis_pos); - producer_offset = -shift_offset; + if (shift_offset > 0) { + start_offset = -pad_width; + } else if (shift_offset < 0) { + stop_offset = pad_width; } - // Since shift doesn't allow dynamic offsets, we can statically - // choose more restrictive offsets between the producer and consumer - // offsets. The start predicate uses greater-than, so using the - // smaller offset is sufficient. Similarly, for the stop predicate, - // using the larger offset is sufficient. - auto start_offset = std::min(consumer_offset, producer_offset); - auto stop_offset = std::max(consumer_offset, producer_offset); - - start_offsets.push_back(ir_builder.create(start_offset)); - stop_offsets.push_back(ir_builder.create(stop_offset)); + return { + IrBuilder::create(start_offset), + IrBuilder::create(stop_offset)}; } -// Get offsets for the start and stop predicates. There can be two -// offsets because the shift offset is determined by a loop index. 
-void adjustStartAndStopOffsetsForGather( - std::vector& start_offsets, - std::vector& stop_offsets, +std::pair getStartAndStopOffsetsForGather( TensorView* consumer_tv, IterDomain* consumer_id, - const ReferenceTensor& reference, - const std::unordered_map& ref_start_index_map, - const std::unordered_map& ref_stop_index_map, + const std::unordered_map& ref_start_index_map, + const std::unordered_map& ref_stop_index_map, bool padding_predicate) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - TORCH_INTERNAL_ASSERT(consumer_id != nullptr); // Adjustment is not necessary if not gather. Even so, padding // predicate does not need any adjustment. if (!consumer_tv->definition()->isA() || padding_predicate) { - return; + return { + GpuLower::current()->kernel()->zeroVal(), + GpuLower::current()->kernel()->zeroVal()}; } const auto root_axis_pos = consumer_tv->domain()->rootPosOf(consumer_id); - // Assume this adjustment is done first, so start and stop offsets - // just contain zeroVal. - TORCH_INTERNAL_ASSERT( - start_offsets.size() == 1 && start_offsets[0]->isZeroInt() && - stop_offsets.size() == 1 && stop_offsets[0]->isZeroInt()); - start_offsets.clear(); - stop_offsets.clear(); - auto producer_start_offset = getProducerOffsetWithGather( - root_axis_pos, - consumer_tv, - reference.concrete_to_id, - ref_start_index_map); + root_axis_pos, consumer_tv, ref_start_index_map); auto producer_stop_offset = getProducerOffsetWithGather( - root_axis_pos, consumer_tv, reference.concrete_to_id, ref_stop_index_map); + root_axis_pos, consumer_tv, ref_stop_index_map); - // The producer and consumer accesses must be predicated as it is - // not statically determined which is more restrictive. + auto consumer_start_offset = GpuLower::current()->kernel()->zeroVal(); + auto consumer_stop_offset = GpuLower::current()->kernel()->zeroVal(); - // Consumer offsets are just zero. - start_offsets.push_back(ir_builder.zeroVal()); - stop_offsets.push_back(ir_builder.zeroVal()); - - // Adds producer offsets if they are not zero. - if (!producer_start_offset->isZeroInt()) { - start_offsets.push_back(producer_start_offset); + if (producer_start_offset->isZeroInt() && producer_stop_offset->isZeroInt()) { + return {consumer_start_offset, consumer_stop_offset}; } - if (!producer_stop_offset->isZeroInt()) { - stop_offsets.push_back(producer_stop_offset); + Val* start_offset = nullptr; + Val* stop_offset = nullptr; + + // In the normal case, take the minimum of the start and the + // maximum of the stop offsets. If there's no padding, the producer + // offset must be always larger than the consumer + // offset. So, the consumer and produce offsets can be always used + // for the start and stop offsets, respectively. 
+ const auto pad_left = + consumer_tv->definition()->as()->padWidth()[root_axis_pos][0]; + const auto pad_right = + consumer_tv->definition()->as()->padWidth()[root_axis_pos][1]; + const auto window_size = + consumer_tv->definition()->as()->windowShape()[root_axis_pos]; + + // consumer index: index + // producer index: index + window_index - pad_left + // + // consumer extent: ext + // producer extent: ext + window_size - 1 - pad_left - pad_right + // + // consumer stop pred: index < ext + // producer stop pred: index + window_index - pad_left < ext + window_size - 1 + // - pad_left - pad_right + // -> index + window_index - pad_left - (window_size - 1 - + // pad_left - pad_right) < ext + // -> index + window_index - (window_size - 1 - pad_right) < + // ext + // + // consumer start pred: index >= 0 + // producer start pred: index + window_index - pad_left >= 0 + + const auto producer_ext_adj = window_size - 1 - pad_left - pad_right; + producer_stop_offset = SimplifyingIrBuilder::subExpr( + producer_stop_offset, + SimplifyingIrBuilder::create(producer_ext_adj)); + + // As commented above, when pad_left is zero, the consumer predicate + // is always more restrictive than the producer predicate. + if (pad_left == 0) { + start_offset = consumer_start_offset; + } else { + start_offset = SimplifyingIrBuilder::minExpr( + consumer_start_offset, producer_start_offset); } + + // As commented above, when pad_right is zero, the consumer + // predicate is always more restrictive than the producer + // predicate. + if (pad_right == 0) { + stop_offset = consumer_stop_offset; + } else { + stop_offset = SimplifyingIrBuilder::maxExpr( + consumer_stop_offset, producer_stop_offset); + } + + TORCH_INTERNAL_ASSERT(start_offset != nullptr); + TORCH_INTERNAL_ASSERT(stop_offset != nullptr); + + return {start_offset, stop_offset}; } // Get the start and stop limit offsets that define the valid range to @@ -2530,18 +2466,16 @@ void adjustStartAndStopOffsetsForGather( // stop that's different from extent. Also, when IterDomain has halo, // the actual offsets of the logical start and stop positions are // shifted. -std::pair getStartAndStopLimitOffsets( +std::pair getStartAndStopLimitOffsets( IterDomain* consumer_id, bool padding_predicate, bool non_divisible_pred) { const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); TORCH_INTERNAL_ASSERT(consumer_id != nullptr); - kir::Val* start_limit = gpu_lower->lowerValue(consumer_id->start()); - kir::Val* stop_limit = - ir_builder.negExpr(gpu_lower->lowerValue(consumer_id->stopOffset())); + Val* start_limit = consumer_id->start(); + Val* stop_limit = SimplifyingIrBuilder::negExpr(consumer_id->stopOffset()); if (!non_divisible_pred) { AxisHaloInfo halo_info = gpu_lower->haloInfo().getRootAxisInfo(consumer_id); @@ -2554,12 +2488,14 @@ std::pair getStartAndStopLimitOffsets( // [0, left halo)[start_limit, stop_limit)[0, right halo) // if (!padding_predicate) { - start_limit = ir_builder.addExpr(start_limit, halo_info.width(0)); - stop_limit = ir_builder.addExpr(stop_limit, halo_info.width(0)); + start_limit = + SimplifyingIrBuilder::addExpr(start_limit, halo_info.width(0)); + stop_limit = + SimplifyingIrBuilder::addExpr(stop_limit, halo_info.width(0)); } else { // In case of the padding predicate, the whole range, including both left // and right halo regions, is computed. 
- stop_limit = ir_builder.addExpr(stop_limit, halo_info.width()); + stop_limit = SimplifyingIrBuilder::addExpr(stop_limit, halo_info.width()); } } else { // For non-divisible predicates, the index must be predicated such @@ -2568,28 +2504,26 @@ std::pair getStartAndStopLimitOffsets( // isn't a root domain. if (gpu_lower->haloInfo().hasHaloWidth(consumer_id)) { auto halo = gpu_lower->haloInfo().getHaloWidth(consumer_id); - stop_limit = ir_builder.addExpr(stop_limit, halo); + stop_limit = SimplifyingIrBuilder::addExpr(stop_limit, halo); } } return {start_limit, stop_limit}; } -// Return an index map for a predicate reference tensor. Two different +// Return an IndexCompute for a predicate reference tensor. Two different // maps are used when generating predicates for unswitched expressions // as start and stop conditions need to use different loop-to-index // mappings. -std::unordered_map getPredicateReferenceIndexing( +auto getPredicateReferenceIndexing( const std::vector& loops, const ReferenceTensor& reference, kir::ForLoop* unswitch_or_vec_loop, + IterDomain* double_buffer_axis, bool start) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - auto reference_domain = reference.domain; - std::unordered_map loop_to_ind_map; + std::unordered_map loop_to_ind_map; std::transform( loops.begin(), @@ -2606,7 +2540,7 @@ std::unordered_map getPredicateReferenceIndexing( // vectorized loop should be like this. bool vectorized_pred = - unswitch_or_vec_loop->iter_domain()->parallelType() == + unswitch_or_vec_loop->iter_domain()->getParallelType() == ParallelType::Vectorize; TORCH_INTERNAL_ASSERT( @@ -2614,12 +2548,11 @@ std::unordered_map getPredicateReferenceIndexing( "Invalid reference generated."); bool within_unswitch = false; - const auto one = ir_builder.oneVal(); for (const auto loop_i : c10::irange(loops.size())) { auto loop = loops[loop_i]; auto loop_id = loop->iter_domain(); - auto loop_pt = loop_id->parallelType(); + auto loop_pt = loop_id->getParallelType(); auto ref_id = reference_domain->axis(loop_i); if (loop == unswitch_or_vec_loop) { @@ -2668,20 +2601,21 @@ std::unordered_map getPredicateReferenceIndexing( if (loop->stop() == loop_id->extent()) { loop_to_ind_map[loop] = loop->start(); } else if (start) { - loop_to_ind_map[loop] = ir_builder.zeroVal(); + loop_to_ind_map[loop] = GpuLower::current()->kernel()->zeroVal(); } else { // Note that the parallel dimension is used rather than // loop-stop(). See the above comment. - loop_to_ind_map[loop] = ir_builder.subExpr( - gpu_lower->parallelDimensionMap().get(loop_pt), - ir_builder.create(1)); + loop_to_ind_map[loop] = SimplifyingIrBuilder::subExpr( + GpuLower::current()->parallelDimensionMap().get(loop_pt), + GpuLower::current()->kernel()->zeroVal()); } } else if (start) { - loop_to_ind_map[loop] = ir_builder.zeroVal(); + loop_to_ind_map[loop] = GpuLower::current()->kernel()->zeroVal(); } else { // Similar to the above, loop_id()->extent() is // used here instead of loop->stop(). See the above comment. 
- loop_to_ind_map[loop] = ir_builder.subExpr(loop_id->extent(), one); + loop_to_ind_map[loop] = SimplifyingIrBuilder::subExpr( + loop_id->extent(), GpuLower::current()->kernel()->oneVal()); } } @@ -2693,9 +2627,27 @@ std::unordered_map getPredicateReferenceIndexing( } } + if (double_buffer_axis != nullptr) { + auto db_loop = GpuLower::current()->doubleBufferInfo().getDoubleBufferLoop( + double_buffer_axis, loops, true); + if (db_loop != nullptr) { + auto loop_to_ind_map_it = loop_to_ind_map.find(db_loop); + TORCH_INTERNAL_ASSERT(loop_to_ind_map_it != loop_to_ind_map.end()); + auto cur_index = loop_to_ind_map_it->second; + // if cur_index is not the same as the index of db_loop, it must + // be true that that index has been modified to support + // unswitch. In that case, it is not necessary to move ahead the + // index for double buffering. + if (cur_index == db_loop->index()) { + loop_to_ind_map[db_loop] = IrBuilder::addExpr( + cur_index, GpuLower::current()->kernel()->oneVal()); + } + } + } + // Add magic zero to a loop pretty far inside in indexing - kir::IterDomain* magic_zero_loop = nullptr; - std::unordered_map ref_id_to_ind_map; + IterDomain* magic_zero_loop = nullptr; + std::unordered_map ref_id_to_ind_map; // Due to rfactor/initialization reference_domain may be bigger than loop nest // structure TORCH_INTERNAL_ASSERT(loops.size() <= reference_domain->nDims()); @@ -2703,19 +2655,19 @@ std::unordered_map getPredicateReferenceIndexing( auto loop = loops[loop_i]; auto ind = loop_to_ind_map[loops[loop_i]]; auto ref_axis = reference_domain->axis(loop_i); - auto kir_ref_axis = gpu_lower->lowerValue(ref_axis)->as(); if (Index::protectWithMagicZero(loop, ref_axis, ind)) { - magic_zero_loop = kir_ref_axis; + magic_zero_loop = ref_axis; } - ref_id_to_ind_map[kir_ref_axis] = loop_to_ind_map[loop]; + ref_id_to_ind_map[ref_axis] = loop_to_ind_map[loop]; } if (ref_id_to_ind_map.count(magic_zero_loop)) { auto& ind = ref_id_to_ind_map[magic_zero_loop]; if (!ind->isConstScalar()) { - ind = ir_builder.addExpr(ind, ir_builder.magicZeroVal()); + ind = SimplifyingIrBuilder::addExpr( + ind, GpuLower::current()->kernel()->magicZeroVal()); } } @@ -2729,7 +2681,7 @@ std::unordered_map getPredicateReferenceIndexing( ref_self_map.insert({id, id}); }); - std::unordered_map reference_halo_extent_map = + std::unordered_map reference_halo_extent_map = getReferenceHaloExtentMap(reference, ref_self_map); // Index into the reference tensor @@ -2741,64 +2693,55 @@ std::unordered_map getPredicateReferenceIndexing( {}, reference_halo_extent_map); - return index_compute.indexMap(); + return index_compute; } // Get the offsets for the start and stop predicates. The offsets // are to be added to the index. -std::pair, std::vector> getStartAndStopOffsets( +std::pair getStartAndStopOffsets( IterDomain* consumer_id, TensorView* consumer_tv, const ReferenceTensor& reference, - const std::unordered_map& ref_start_index_map, - const std::unordered_map& ref_stop_index_map, + const std::unordered_map& consumer_start_index_map, + const std::unordered_map& consumer_stop_index_map, bool padding_predicate, bool unswitch, bool non_divisible_pred) { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - // By default, the offsets for the start and stop predicates are - // just zero. - std::vector start_offsets{ir_builder.zeroVal()}; - std::vector stop_offsets{ir_builder.zeroVal()}; - - if (consumer_id == nullptr) { - return {start_offsets, stop_offsets}; + // just zero. 
All halo-related adjustments are done at root domains, + // so consumer_id is not a root domain, no adjustment is required. + if (consumer_id->definition() != nullptr && !non_divisible_pred) { + return { + GpuLower::current()->kernel()->zeroVal(), + GpuLower::current()->kernel()->zeroVal()}; } auto consumer_def = consumer_tv->definition(); + Val* start_offset = GpuLower::current()->kernel()->zeroVal(); + Val* stop_offset = GpuLower::current()->kernel()->zeroVal(); + // These adjustments are not required when predicating non-divisible splits if (!non_divisible_pred) { if (consumer_def->isA()) { - adjustStartAndStopOffsetsForShift( - start_offsets, - stop_offsets, - consumer_tv, - consumer_id, - padding_predicate); + std::tie(start_offset, stop_offset) = getStartAndStopOffsetsForShift( + consumer_tv, consumer_id, padding_predicate); } else if (consumer_def->isA()) { - adjustStartAndStopOffsetsForGather( - start_offsets, - stop_offsets, + std::tie(start_offset, stop_offset) = getStartAndStopOffsetsForGather( consumer_tv, consumer_id, - reference, - ref_start_index_map, - ref_stop_index_map, + consumer_start_index_map, + consumer_stop_index_map, padding_predicate); } // Adjustment for partial split - auto partial_split_offset = getGlobalConsumerOffsetWithPartialSplit( - gpu_lower->lowerValue(consumer_id)->as()); - for (auto& start_offset : start_offsets) { - start_offset = ir_builder.addExpr(start_offset, partial_split_offset); - } - for (auto& stop_offset : stop_offsets) { - stop_offset = ir_builder.addExpr(stop_offset, partial_split_offset); - } + auto partial_split_offset = + getGlobalConsumerOffsetWithPartialSplit(consumer_id); + start_offset = + SimplifyingIrBuilder::addExpr(start_offset, partial_split_offset); + stop_offset = + SimplifyingIrBuilder::addExpr(stop_offset, partial_split_offset); // If generating a predicate for unswitch, adjust the stop offset to // accommodate the addition of halo to the loop stop. See the @@ -2808,9 +2751,8 @@ std::pair, std::vector> getStartAndStopOffsets !padding_predicate, "Unswitch should not use the padding predicate"); auto stop_unswitch_offset = getUnswitchStopOffset(consumer_id, consumer_tv); - for (auto& stop_offset : stop_offsets) { - stop_offset = ir_builder.addExpr(stop_offset, stop_unswitch_offset); - } + stop_offset = + SimplifyingIrBuilder::addExpr(stop_offset, stop_unswitch_offset); } } @@ -2830,39 +2772,49 @@ std::pair, std::vector> getStartAndStopOffsets // index + (start_offset - start_limit) >= 0 // index + (stop_offset - stop_limit) < extent - for (auto& start_offset : start_offsets) { - start_offset = ir_builder.subExpr(start_offset, limits.first); - } - for (auto& stop_offset : stop_offsets) { - stop_offset = ir_builder.subExpr(stop_offset, limits.second); - } + start_offset = SimplifyingIrBuilder::subExpr(start_offset, limits.first); + stop_offset = SimplifyingIrBuilder::subExpr(stop_offset, limits.second); - return {start_offsets, stop_offsets}; + return {start_offset, stop_offset}; } -bool canOmitStartPredicate(kir::Val* start_offset) { +// A partial value of a start offset is returned if determined to be +// safe. Nullptr is returned if it can be omitted completely. +Val* simplifyStartOffset(Val* start_offset) { // Start predicate can be omitted when start_offset >= 0. 
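// [Illustrative note, not part of the original commit] Two small checks behind
// the simplification below. First, if start_offset is a non-negative constant
// c, then index + c >= 0 follows from index >= 0, so no predicate is needed.
// Second, if start_offset has the form min(0, rhs), then
//   index + min(0, rhs) >= 0  <=>  min(index, index + rhs) >= 0
//                             <=>  index >= 0  &&  index + rhs >= 0,
// and since the index >= 0 half is the same condition that lets a zero offset
// be omitted, only index + rhs >= 0 is left to predicate; e.g. index = 3,
// rhs = -5 gives min(3, -2) = -2 < 0, and it is the index + rhs term that
// catches it.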
- auto offset_val = start_offset->as()->value(); - return offset_val.has_value() && offset_val.value() >= 0; + auto offset_val = start_offset->as()->value(); + if (offset_val.has_value() && offset_val.value() >= 0) { + return nullptr; + } + + // start_offset may look like min(0, window_index - pad). Then, can + // remove min and leave the rhs only. + auto def = dynamic_cast(start_offset->definition()); + if (def != nullptr && def->getBinaryOpType() == BinaryOpType::Min && + def->lhs()->isZeroInt()) { + return def->rhs(); + } + + return start_offset; } bool canOmitStopPredicate( - kir::Val* stop_index, - kir::Val* stop_offset, - kir::IterDomain* kir_contig_id) { + Val* stop_index, + Val* stop_offset, + IterDomain* contig_id) { bool index_simple = stop_index->definition() == nullptr; // The definition may be just adding the magic zero, which can be // effectively considered "simple" if (!index_simple && isProtectedWithMagicZero(stop_index)) { // Make sure the lhs of stop_index is simple. - auto lhs = stop_index->definition()->as()->lhs(); + auto lhs = stop_index->definition()->as()->lhs(); if (lhs->definition() == nullptr) { index_simple = true; } } // Omit only when both the index and extent are "simple". - if (!(index_simple && kir_contig_id->extent()->definition() == nullptr)) { + if (!(index_simple && contig_id->extent()->definition() == nullptr)) { return false; } @@ -2873,33 +2825,32 @@ bool canOmitStopPredicate( // omitted if extent + halo + stop_offset < extent, i.e., halo + // stop_offset <= 0. - auto stop_offset_val = stop_offset->as()->value(); + auto stop_offset_val = stop_offset->as()->value(); - auto halo_ext = - gpu_lower->haloInfo().getRootAxisInfo(kir_contig_id).width()->value(); + auto halo_ext = gpu_lower->haloInfo().getRootAxisInfo(contig_id).width(); // If they are not compile-time constant, can't prove the // condition. - if (!stop_offset_val.has_value() || !halo_ext.has_value()) { + if (!stop_offset_val.has_value()) { return false; } - if (halo_ext.value() + stop_offset_val.value() > 0) { + if (halo_ext + stop_offset_val.value() > 0) { return false; } // When the domain is parallelized, the parallel dimension must be // exact. Otherwise, there would be extra threads/blocks that need // to be predicated out. - if (isParallelTypeThread(kir_contig_id->parallelType())) { + if (isParallelTypeThread(contig_id->getParallelType())) { if (!gpu_lower->parallelDimensionMap().isExact( - kir_contig_id->parallelType())) { + contig_id->getParallelType())) { return false; } // If the domain has halo, the loop is expanded by the halo // extent, so we can't prove the loop extent is the same as the // parallel dimension. - if (!(halo_ext.has_value() && halo_ext.value() == 0)) { + if (halo_ext != 0) { return false; } } @@ -2912,50 +2863,70 @@ bool canOmitStopPredicate( // Returns predicates and the concrete (by loop map) root domains they cover std::pair, ReferenceTensor> Index:: getReferenceRootPredicates( - const kir::TensorView* kir_consumer_tv, + TensorView* consumer_tv, const std::vector& loops, kir::ForLoop* unswitch_or_vec_loop, bool shift_padding) { FUSER_PERF_SCOPE("GpuLower::Lower::Index::getReferenceRootPredicates"); const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); + + const bool is_unswitch = unswitch_or_vec_loop != nullptr; // Nothing needs to be done when padding is not required. 
- if (shift_padding && !needsPadding(kir_consumer_tv->fuserTv())) { + if (shift_padding && !needsPadding(consumer_tv)) { return {{RootPredicateInfo::getFalseInfo()}, ReferenceTensor{}}; } - auto consumer_tv = kir_consumer_tv->fuserTv(); - // Get a reference tensor replayed as existing loop structure ReferenceTensor reference = IndexReferenceReplay::getReference(loops); // Generate halo information for reference. updateHaloInfoForReference(reference, consumer_tv); + const auto ref_2_consumer = indexMapReferenceTo( + consumer_tv, gpu_lower->caIndexMap(), reference.concrete_to_id); + + const auto reference_halo_extent_map = + getReferenceHaloExtentMap(reference, ref_2_consumer); + + auto db_axis = gpu_lower->doubleBufferInfo().getDoubleBufferAxis(consumer_tv); + // Both start and stop positions may need to be predicated. Indexing // differs when generating predicates for unswitch. // NOTE: If we could find-and-replace KIR nodes, we could just // generate one index map, clone it and replace the loop-to-index // mappings of unswitched loops for the start predicate. - const auto ref_stop_index_map = getPredicateReferenceIndexing( - loops, reference, unswitch_or_vec_loop, false); - // If not unswitch, share the same indexing map as the stop index map - const auto& ref_start_index_map = unswitch_or_vec_loop != nullptr - ? getPredicateReferenceIndexing( - loops, reference, unswitch_or_vec_loop, true) - : ref_stop_index_map; + auto ref_stop_indexing = getPredicateReferenceIndexing( + loops, reference, unswitch_or_vec_loop, db_axis, false); + const auto consumer_stop_indexing = ref_stop_indexing.updateIndexCompute( + consumer_tv->domain(), + ref_2_consumer, + std::vector(consumer_tv->getMaybeRFactorDomain().size(), false), + reference_halo_extent_map); + const auto& consumer_stop_index_map = consumer_stop_indexing.indexMap(); - auto ref_2_consumer = indexMapReferenceTo( - consumer_tv, gpu_lower->caIndexMap(), reference.concrete_to_id); + // If not unswitch, share the same indexing map as the stop index + // map + std::unordered_map consumer_start_index_map; + if (is_unswitch) { + auto ref_start_indexing = getPredicateReferenceIndexing( + loops, reference, unswitch_or_vec_loop, db_axis, true); + const auto consumer_start_indexing = ref_start_indexing.updateIndexCompute( + consumer_tv->domain(), + ref_2_consumer, + std::vector(consumer_tv->getMaybeRFactorDomain().size(), false), + reference_halo_extent_map); + consumer_start_index_map = consumer_start_indexing.indexMap(); + } else { + consumer_start_index_map = consumer_stop_index_map; + } // Get the contiguous ids we need to generate predicates for - auto contig_id_infos = - getPredicateContigIds(reference, consumer_tv, ref_2_consumer); + auto contig_id_infos = getPredicateContigIds(consumer_tv); auto non_divisible_splits = - getNonDivisibleReferenceDomainsToPredicate(consumer_tv, reference); + getNonDivisibleConsumerDomainsToPredicate(consumer_tv); contig_id_infos.insert( contig_id_infos.end(), non_divisible_splits.begin(), @@ -2972,52 +2943,22 @@ std::pair, ReferenceTensor> Index:: } auto root_ids = contig_id_entry.covered_ids; - auto kir_contig_id = - gpu_lower->lowerValue(contig_id)->as(); - const auto ref_stop_indexing_it = ref_stop_index_map.find(kir_contig_id); + const auto consumer_stop_indexing_it = + consumer_stop_index_map.find(contig_id); - // First condition below is due to broadcasts in consumers of consumer that - // are not in consumer there can be unresolved indexing in the reference - // tensor. 
This can happen when we have something like: TV3[i1o*i2, i1i] and - // TV1[i2] where tv3 and tv1 share their outer dimension. i1 will be part of - // reference tensors root domain, but when indexing into TV1 there aren't - // enough indices to resolve it. - // - // The condition also happens with Misaligned predicates, where + // First condition below happens with Misaligned predicates, where // inner-most vectorized loops are not included in the loops // parameter. Predicates involving vectorized loops are separately // generated in lower_misaligned_vectorization. // - // It can also happens with rfactored reductions. The reference - // tensor may include rfactored domains, so the contig id may be - // a root domain of the reference, not a rfactor root. Since - // there is no loop for rfactor domains, there's no indexing - // mapping for root domains. This seems safe as it can only happen - // with rfactor and rfactored tensors do not need predicates. - // // Second condition is simply to avoid predication on broadcasting axes as // it's not required. - if (ref_stop_indexing_it == ref_stop_index_map.end() || - ref_stop_indexing_it->second->isZeroInt()) { + if (consumer_stop_indexing_it == consumer_stop_index_map.end() || + consumer_stop_indexing_it->second->isZeroInt()) { continue; } - // Find a corresponding consumer root id if exists. Used to - // support shift. If a contig_id is a merged non-root domain, nothing - // is required to do for shift as shift-related domains are - // excluded from contig domains. - IterDomain* consumer_id = nullptr; - if (contig_id->definition() == nullptr || - contig_id_entry.is_non_divisible_split) { - auto it = ref_2_consumer.find(contig_id); - if (it != ref_2_consumer.end()) { - consumer_id = it->second; - } else { - continue; - } - } - RootPredicateInfo info; // Compute offsets for start and stop predicate. For non-shift, @@ -3032,53 +2973,50 @@ std::pair, ReferenceTensor> Index:: // The final predicates will look like: // (index + start_offset) >= 0 && (index + stop_offset) < extent. 
- std::tie(info.start_offsets_, info.stop_offsets_) = getStartAndStopOffsets( - consumer_id, + std::tie(info.start_offset_, info.stop_offset_) = getStartAndStopOffsets( + contig_id, consumer_tv, reference, - ref_start_index_map, - ref_stop_index_map, + consumer_start_index_map, + consumer_stop_index_map, shift_padding, unswitch_or_vec_loop != nullptr, contig_id_entry.is_non_divisible_split); - auto stop_index = ref_stop_indexing_it->second; - auto start_index = ref_start_index_map.at(kir_contig_id); + auto stop_index = consumer_stop_indexing_it->second; + auto start_index = consumer_start_index_map.at(contig_id); // Build predicates for start positions as: // start_index + start_offset >= 0 - for (auto start_offset : info.start_offsets_) { - if (canOmitStartPredicate(start_offset)) { - info.start_predicates_.push_back(ir_builder.trueVal()); - continue; - } + auto start_offset = simplifyStartOffset(info.start_offset_); + if (start_offset == nullptr) { + info.start_predicate_ = GpuLower::current()->kernel()->trueVal(); + } else { auto offsetted_start_index = - ir_builder.addExpr(start_index, start_offset); - auto pred = - ir_builder.geExpr(offsetted_start_index, ir_builder.zeroVal()) - ->as(); - info.start_predicates_.push_back(pred); + SimplifyingIrBuilder::addExpr(start_index, start_offset); + auto start_pred = + SimplifyingIrBuilder::geExpr( + offsetted_start_index, GpuLower::current()->kernel()->zeroVal()) + ->as(); + info.start_predicate_ = start_pred; } // Build predicates for stop positions as: // stop_index + stop_offset < IterDomain::extent - for (auto stop_offset : info.stop_offsets_) { - if (canOmitStopPredicate(stop_index, stop_offset, kir_contig_id)) { - info.stop_predicates_.push_back(ir_builder.trueVal()); - continue; - } - auto offsetted_stop_index = ir_builder.addExpr(stop_index, stop_offset); - auto pred = - ir_builder.ltExpr(offsetted_stop_index, kir_contig_id->extent()) - ->as(); - info.stop_predicates_.push_back(pred); + auto stop_offset = info.stop_offset_; + if (canOmitStopPredicate(stop_index, stop_offset, contig_id)) { + info.stop_predicate_ = GpuLower::current()->kernel()->trueVal(); + } else { + auto offsetted_stop_index = + SimplifyingIrBuilder::addExpr(stop_index, stop_offset); + auto stop_pred = SimplifyingIrBuilder::ltExpr( + offsetted_stop_index, contig_id->extent()) + ->as(); + info.stop_predicate_ = stop_pred; } - // Transform ids from reference to concrete and consumer domains - // (based on loop compute at map) - for (auto ref_id : contig_id_entry.covered_ids) { - info.root_ids_.insert(reference.id_to_concrete.at(ref_id)); - info.consumer_ids_.insert(ref_2_consumer.at(ref_id)); + for (auto consumer_id : contig_id_entry.covered_ids) { + info.root_ids_.insert(consumer_id); } pred_info_vec.emplace_back(info); } @@ -3089,7 +3027,7 @@ std::pair, ReferenceTensor> Index:: bool Index::protectWithMagicZero( kir::ForLoop* loop, IterDomain* reference_domain, - kir::Val* ind) { + Val* ind) { bool ref_dom_simple = (reference_domain == nullptr ? true : reference_domain->definition() != nullptr); @@ -3100,16 +3038,9 @@ bool Index::protectWithMagicZero( } RootPredicateInfo RootPredicateInfo::getFalseInfo() { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - RootPredicateInfo info; - info.start_predicates_.push_back(ir_builder.falseVal()); - info.stop_predicates_.push_back(ir_builder.falseVal()); - // These are just placeholder. When the predicate is false, the - // offset should not be used. 
- info.start_offsets_.push_back(nullptr); - info.stop_offsets_.push_back(nullptr); + info.start_predicate_ = GpuLower::current()->kernel()->falseVal(); + info.stop_predicate_ = GpuLower::current()->kernel()->falseVal(); return info; } diff --git a/torch/csrc/jit/codegen/cuda/index_compute.h b/torch/csrc/jit/codegen/cuda/index_compute.h index 83536067c19..27f1c911bde 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.h +++ b/torch/csrc/jit/codegen/cuda/index_compute.h @@ -69,30 +69,30 @@ class IndexCompute : public BackwardVisitor { void handle(Expr*) override; // return extent_map_[id] if exists, else return id->extent() - kir::Val* getExtent(kir::IterDomain* id); + Val* getExtent(IterDomain* id); //! True if a domain is not used to index - bool isZero(kir::IterDomain* id) const; + bool isZero(IterDomain* id) const; //! True if any dependent of a domain is not used to index - bool hasZeroMerged(kir::IterDomain* id) const; + bool hasZeroMerged(IterDomain* id) const; // Tensor domain we're mapping back to root const TensorDomain* td_; // NOLINT // Map we update as we propagate backward, containing all IDs in the // propagation. Initial indices are mapped with this map at tv->domain() - // and are back propagated to tv->rootDomain(). This index_map_ keeps the + // and are back propagated to tv->getRootDomain(). This index_map_ keeps the // indices at intermediate IterDomain's in that back propagation. - std::unordered_map index_map_; // NOLINT + std::unordered_map index_map_; // NOLINT // Map from IterDomain to their broadcasted extent. If a TV has I0*I1 but its // producer has B0*I1 this map will contain a mapping from the ID{B0*I1} to // the extent I0*I1. Also contains updated extents if we merge in a 0 index. // See zero_merged_in_. - std::unordered_map extent_map_; // NOLINT + std::unordered_map extent_map_; // NOLINT // Keeps track of domains that do not contribute to indexing - std::unordered_set zero_domains_; // NOLINT + std::unordered_set zero_domains_; // NOLINT // This set keeps track of IterDomain's that have had a zero index merged into // them. This happens if we do something like tv->axis(0)->split(4) then @@ -100,47 +100,46 @@ class IndexCompute : public BackwardVisitor { // indexing would be (0, i) then when we do the backward computation that zero // and i would attempt to be merged together. We handle indices like these // specially. 
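  // [Illustrative note, not part of the original commit] Concretely, for a
  // root axis of extent 8 split by 4, the root index is i_outer * 4 + i_inner.
  // If i_outer is replaced by zero because that domain does not contribute to
  // the address, the root index collapses to i_inner and the valid address
  // range is only [0, 4), not [0, 8). zero_merged_in_, together with the
  // updated extents kept in extent_map_, is how the backward merge handles
  // this case without re-expanding the index.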
- std::unordered_set zero_merged_in_; + std::unordered_set zero_merged_in_; // IDs that are a result of contiguous merges - std::unordered_set contig_ids; + std::unordered_set contig_ids; // Mentions if we should propagate an index down a particular IterDomain path // if there's an option - std::unordered_set preferred_paths_; + std::unordered_set preferred_paths_; // Map from IterDomains to halo-extended extents in corresponding // reference tensor - std::unordered_map reference_halo_extent_map_; + std::unordered_map reference_halo_extent_map_; public: - const std::unordered_map& indexMap() const { + const std::unordered_map& indexMap() const { return index_map_; } - const std::unordered_map& extentMap() const { + const std::unordered_map& extentMap() const { return extent_map_; } - const std::unordered_set& zeroDomains() const { + const std::unordered_set& zeroDomains() const { return zero_domains_; } - const std::unordered_set& zeroMergedIn() const { + const std::unordered_set& zeroMergedIn() const { return zero_merged_in_; } // Propagate back from _td using initial_index_map IndexCompute( const TensorDomain* _td, - std::unordered_map initial_index_map, - std::unordered_map _extent_map, - std::unordered_set zero_domains, - std::unordered_set _zero_merged_in, + std::unordered_map initial_index_map, + std::unordered_map _extent_map, + std::unordered_set zero_domains, + std::unordered_set _zero_merged_in, const std::vector& _root_contiguity, - std::unordered_set preferred_paths = {}, - std::unordered_map - reference_halo_extent_map = {}); + std::unordered_set preferred_paths = {}, + std::unordered_map reference_halo_extent_map = {}); // Updates index_map, extent_map, and zero_merged_in based on id_map and // returns a new IndexCompute ready to be used. @@ -148,8 +147,8 @@ class IndexCompute : public BackwardVisitor { const TensorDomain* new_td, const std::unordered_map& id_map, const std::vector& _root_contiguity, - const std::unordered_map& - reference_halo_extent_map = {}); + const std::unordered_map& reference_halo_extent_map = + {}); virtual void run(); }; @@ -159,10 +158,10 @@ class IndexSwizzle : public IndexCompute { public: IndexSwizzle( const TensorView* tv, - std::unordered_map initial_index_map, - std::unordered_map extent_map, - std::unordered_set zero_domains, - std::unordered_set zero_merged_in); + std::unordered_map initial_index_map, + std::unordered_map extent_map, + std::unordered_set zero_domains, + std::unordered_set zero_merged_in); void run() override; @@ -183,51 +182,45 @@ class RootPredicateInfo { friend class Index; public: - const auto& startPredicates() const { - return start_predicates_; + const auto& startPredicate() const { + return start_predicate_; } - auto& startPredicates() { - return start_predicates_; + auto& startPredicate() { + return start_predicate_; } - const auto& startOffsets() const { - return start_offsets_; + const auto& startOffset() const { + return start_offset_; } - const auto& stopPredicates() const { - return stop_predicates_; + const auto& stopPredicate() const { + return stop_predicate_; } - const auto& stopOffsets() const { - return stop_offsets_; + const auto& stopOffset() const { + return stop_offset_; } const auto& rootIds() const { return root_ids_; } - const auto& consumerIds() const { - return consumer_ids_; - } - //! Return a false RootPredicateInfo, i.e., both start and stop //! predicates are false. 
static RootPredicateInfo getFalseInfo(); private: - // prdicates for lower end - std::vector start_predicates_; - // prdicates for upper end - std::vector stop_predicates_; - // Offsets of the start predicate - std::vector start_offsets_; - // Offsets of the stop predicate - std::vector stop_offsets_; + // prdicate for lower end + Bool* start_predicate_ = nullptr; + // prdicate for upper end + Bool* stop_predicate_ = nullptr; + // Offset of the start predicate + Val* start_offset_ = nullptr; + // Offset of the stop predicate + Val* stop_offset_ = nullptr; // Track which roots have been handled by the generated predicates std::unordered_set root_ids_; - // Consumer IDs that correspond to root_ids_ - std::unordered_set consumer_ids_; }; // Simple interface for IndexCompute @@ -236,24 +229,24 @@ class RootPredicateInfo { class Index { private: // Producer indexing if it's in shared or local memory - static std::vector getNonGlobalProducerStridedIndices( + static std::vector getNonGlobalProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops); // Consumer indexing if it's in shared or local memory - static std::vector getNonGlobalConsumerStridedIndices( + static std::vector getNonGlobalConsumerStridedIndices( const TensorView* consumer, const std::vector& loops); // Producer if it's in global memory - static std::vector getGlobalProducerStridedIndices( + static std::vector getGlobalProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops); // Consumer indexing if it's in global memory - static std::vector getGlobalConsumerStridedIndices( + static std::vector getGlobalConsumerStridedIndices( const TensorView* consumer, const std::vector& loops); @@ -276,7 +269,7 @@ class Index { //! root domain of a producer tensor. The size of the returned //! vector is guaranteed to be equal to the number of axes of the //! indexing root domain. - static std::vector getProducerStridedIndices( + static std::vector getProducerStridedIndices( TensorView* producer, const TensorView* consumer, const std::vector& loops); @@ -285,7 +278,7 @@ class Index { //! root domain of a consumer tensor. The size of the returned //! vector is guaranteed to be equal to the number of axes of the //! indexing root domain. - static std::vector getConsumerStridedIndices( + static std::vector getConsumerStridedIndices( const TensorView* consumer, const std::vector& loops); @@ -313,7 +306,7 @@ class Index { //! vectorized loop. static std::pair, ReferenceTensor> getReferenceRootPredicates( - const kir::TensorView* kir_consumer_tv, + TensorView* consumer_tv, const std::vector& loops, kir::ForLoop* unswitch_or_vec_loop, bool padding_predicate); @@ -328,7 +321,7 @@ class Index { static bool protectWithMagicZero( kir::ForLoop* loop, IterDomain* reference_domain = nullptr, - kir::Val* ind = nullptr); + Val* ind = nullptr); }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp b/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp index fcd0a8937ed..27e5b93e94e 100644 --- a/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp +++ b/torch/csrc/jit/codegen/cuda/index_reference_replay.cpp @@ -1,11 +1,10 @@ #include #include +#include #include #include #include -#include -#include namespace torch { namespace jit { @@ -41,16 +40,12 @@ IterDomain* IndexReferenceReplay::idCopy(IterDomain* id) { // reduction. 
All we care about are the transformations, and trying to make // sure we track correctly a replaying with consistent reduction/broadcast // domains is challenging and unnecessary. - auto copied_id = - new IterDomain(id->start(), id->extent(), id->getParallelType()); + auto copied_id = IrBuilder::create( + id->container(), id->start(), id->extent(), id->getParallelType()); replayed_ids_.emplace_back(copied_id); return copied_id; } -IterDomain* IndexReferenceReplay::toFusionID(kir::IterDomain* kir_id) { - return ca_map_.toFusion(kir_id); -} - IterDomain* IndexReferenceReplay::toConcrete(IterDomain* id) { return ca_map_.getConcreteMappedID(id); } @@ -70,7 +65,8 @@ void IndexReferenceReplay::handle(Split* split) { } // Replay the provided split operation and add it to the reference DAG - new Split( + IrBuilder::create( + split->container(), ref_outer, ref_inner, ref_in, @@ -101,7 +97,7 @@ void IndexReferenceReplay::handle(Merge* merge) { } // Replay the provided merge operation and add it to the reference DAG - new Merge(ref_out, ref_outer, ref_inner); + IrBuilder::create(merge->container(), ref_out, ref_outer, ref_inner); // Mark producers and consumers ref_id_consumed_.emplace(ref_outer); @@ -149,7 +145,7 @@ TensorDomain* IndexReferenceReplay::computeReplay() { loop_structure_.begin(), loop_structure_.end(), std::back_inserter(domain_ids), - [this](kir::ForLoop* fl) { return toFusionID(fl->iter_domain()); }); + [](kir::ForLoop* fl) { return fl->iter_domain(); }); // IterVisitor based traversals don't work because we don't have all outputs. // backward traversal's traverseFrom(domain_ids) will throw "Invalid backward @@ -194,7 +190,7 @@ TensorDomain* IndexReferenceReplay::computeReplay() { // Construct a tensor that's representitive of the replayed loop structure. std::vector loops_replayed_domain; for (auto loop : loop_structure_) { - auto loop_id = toFusionID(loop->iter_domain()); + auto loop_id = loop->iter_domain(); // Map to loops with the loop map, but make sure the replayed id is actually // a leaf in the replay. auto ref_id_it = std::find_if( @@ -222,7 +218,7 @@ TensorDomain* IndexReferenceReplay::computeReplay() { loops_replayed_domain.begin(), loops_replayed_domain.end(), [](IterDomain* id) { return id->definition() != nullptr; })) { - auto domain = new TensorDomain( + auto domain = IrBuilder::create( // If there was no replay only return a domain with a root domain. loops_replayed_domain); return domain; @@ -257,8 +253,9 @@ TensorDomain* IndexReferenceReplay::computeReplay() { } // Create and return the reference. - auto domain = new TensorDomain( - {root_domain_ids.begin(), root_domain_ids.end()}, + auto domain = IrBuilder::create( + std::vector( + root_domain_ids.begin(), root_domain_ids.end()), loops_replayed_domain); return domain; } @@ -266,26 +263,30 @@ TensorDomain* IndexReferenceReplay::computeReplay() { IndexCompute getReferenceIndexing( const std::vector& loop_structure, - TensorDomain* reference_tensor) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - + TensorDomain* reference_tensor, + kir::ForLoop* double_buffer_loop) { // Create a simple index mapping from loop iter domains to their local index. // This is only applicable to global memory buffers. 
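// [Illustrative sketch, not part of the original commit] Background for the
// double_buffer_loop handling further below: with two alternating buffers,
// iteration i consumes the stage that was filled earlier while the producer
// prefetches what the next iteration needs, so the producer side is indexed
// at i + 1. A minimal host-side analogue (all names hypothetical, not part of
// the nvfuser API):
//
//   #include <array>
//   #include <cstddef>
//   #include <vector>
//
//   std::vector<float> pipeline(const std::vector<float>& in) {
//     std::array<float, 2> buf{};        // two stages, like double-buffered smem
//     std::vector<float> out(in.size());
//     if (in.empty()) return out;
//     buf[0] = in[0];                    // prologue: stage 0 is pre-filled
//     for (std::size_t i = 0; i < in.size(); ++i) {
//       if (i + 1 < in.size()) {
//         buf[(i + 1) % 2] = in[i + 1];  // producer load uses index i + 1
//       }
//       out[i] = buf[i % 2] * 2.0f;      // consumer compute uses index i
//     }
//     return out;
//   }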
- std::unordered_map initial_index_map; + std::unordered_map initial_index_map; TORCH_INTERNAL_ASSERT(loop_structure.size() <= reference_tensor->nDims()); int magic_zero_loop = -1; for (const auto loop_i : c10::irange(loop_structure.size())) { auto ref_axis = reference_tensor->axis(loop_i); - auto kir_ref_axis = gpu_lower->lowerValue(ref_axis)->as(); auto loop = loop_structure[loop_i]; auto ind = loop->index(); - ; - initial_index_map[kir_ref_axis] = ind; + initial_index_map[ref_axis] = ind; if (loop->vectorize()) { - initial_index_map[kir_ref_axis] = ir_builder.create(0); + initial_index_map[ref_axis] = GpuLower::current()->kernel()->zeroVal(); + } else if (double_buffer_loop == loop) { + // This version of getReferenceIndexing is only used for + // indexing global tensors. When indexing global producers, the + // index for a double buffered loop needs to be incremented. The + // parameter double_buffer_loop should be nullptr when indexing + // global consumers tensors. + initial_index_map[ref_axis] = + IrBuilder::addExpr(ind, GpuLower::current()->kernel()->oneVal()); } if (Index::protectWithMagicZero(loop, ref_axis, ind)) { @@ -295,10 +296,9 @@ IndexCompute getReferenceIndexing( // Add magic zero to a fairly inner most index if (magic_zero_loop >= 0) { - auto ref_id = gpu_lower->lowerValue(reference_tensor->axis(magic_zero_loop)) - ->as(); - initial_index_map[ref_id] = ir_builder.addExpr( - initial_index_map[ref_id], ir_builder.magicZeroVal()); + auto ref_id = reference_tensor->axis(magic_zero_loop); + initial_index_map[ref_id] = IrBuilder::addExpr( + initial_index_map[ref_id], FusionGuard::getCurFusion()->magicZeroVal()); } // Send to the other version of reference indexing that directly takes the @@ -310,19 +310,17 @@ IndexCompute getReferenceIndexing( IndexCompute getReferenceIndexing( const std::vector& loop_structure, TensorDomain* reference_tensor, - std::unordered_map index_map, - std::unordered_set zero_domains, + std::unordered_map index_map, + std::unordered_set zero_domains, std::unordered_set preferred_paths, - std::unordered_map halo_extent_map) { - auto gpu_lower = GpuLower::current(); - + std::unordered_map halo_extent_map) { // I thought this might be necesasry, but turns out it's not. I think it's // because of the root ordering above, however leaving it in case we find // out it is necessary in some cases. At the time of commiting, cuda-memcheck // passed without this. 
// - // std::unordered_map reference_extent_map; for (auto loop : loop_structure) { + // std::unordered_map reference_extent_map; for (auto loop : loop_structure) { // // If there's a broadcast merged in the for loop ID we want to track its // // extent // auto inputs = InputsOf::outputs( @@ -342,16 +340,6 @@ IndexCompute getReferenceIndexing( // } // } - // Convert to preferred_path to kir::IterDomain for IndexCompute - std::unordered_set kir_preferred_path; - std::transform( - preferred_paths.begin(), - preferred_paths.end(), - std::inserter(kir_preferred_path, kir_preferred_path.begin()), - [&gpu_lower](IterDomain* id) { - return gpu_lower->lowerValue(id)->as(); - }); - IndexCompute compute( reference_tensor, index_map, // NOLINT @@ -359,9 +347,9 @@ IndexCompute getReferenceIndexing( // in this function {}, zero_domains, - std::unordered_set(), + std::unordered_set(), reference_tensor->contiguity(), - kir_preferred_path, + preferred_paths, halo_extent_map); compute.run(); diff --git a/torch/csrc/jit/codegen/cuda/index_reference_replay.h b/torch/csrc/jit/codegen/cuda/index_reference_replay.h index c4626213e76..fcb8e1f94e8 100644 --- a/torch/csrc/jit/codegen/cuda/index_reference_replay.h +++ b/torch/csrc/jit/codegen/cuda/index_reference_replay.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -34,10 +34,6 @@ class IndexReferenceReplay : public OptInDispatch { // Make a new id for the reference replay based on the provided id IterDomain* idCopy(IterDomain* id); - // Use the compute at map to get the fusion IterDomain from the - // kir::IterDomain - IterDomain* toFusionID(kir::IterDomain* kir_id); - // Return the concrete entry of the non-reference id IterDomain* toConcrete(IterDomain* id); @@ -87,16 +83,17 @@ class IndexReferenceReplay : public OptInDispatch { IndexCompute getReferenceIndexing( const std::vector& loop_structure, TensorDomain* reference_domain, - std::unordered_map index_map, - std::unordered_set zero_domains, + std::unordered_map index_map, + std::unordered_set zero_domains, std::unordered_set preferred_path, - std::unordered_map halo_extent_map = {}); + std::unordered_map halo_extent_map = {}); // Short cut for global TVs. Index into the reference based on all loop indicies // in the loop structure. IndexCompute getReferenceIndexing( const std::vector& loop_structure, - TensorDomain* reference_domain); + TensorDomain* reference_domain, + kir::ForLoop* double_buffer_loop = nullptr); // When indexing there are sometimes an option to propagate an index down // multiple paths. 
This will return the IterDomains in the history of the diff --git a/torch/csrc/jit/codegen/cuda/instrumentation.cpp b/torch/csrc/jit/codegen/cuda/instrumentation.cpp index 52e16b3a7af..d227df0ab26 100644 --- a/torch/csrc/jit/codegen/cuda/instrumentation.cpp +++ b/torch/csrc/jit/codegen/cuda/instrumentation.cpp @@ -1,6 +1,6 @@ #include -#include +#include #ifdef _WIN32 #include diff --git a/torch/csrc/jit/codegen/cuda/interface.cpp b/torch/csrc/jit/codegen/cuda/interface.cpp index bd54d30811d..d21004ae154 100644 --- a/torch/csrc/jit/codegen/cuda/interface.cpp +++ b/torch/csrc/jit/codegen/cuda/interface.cpp @@ -15,13 +15,15 @@ C10_DEFINE_bool( C10_DEFINE_bool( torch_jit_nvfuser_horizontal_fusion, true, - "enable single node fusion for nvfuser"); + "enable horizontal fusion for nvfuser"); namespace torch { namespace jit { namespace fuser { namespace cuda { +static std::atomic cuda_fusion_guard_mode{true}; + bool getSingletonFusion() { return FLAGS_torch_jit_nvfuser_singleton_fusion; } @@ -42,8 +44,6 @@ bool setHorizontalFusion(bool value) { return old_value; } -static std::atomic cuda_fusion_guard_mode{true}; - std::atomic& getCudaFusionGuardMode() { return cuda_fusion_guard_mode; } @@ -329,6 +329,220 @@ RegisterOperators reg_guard({ aliasAnalysisFromSchema()), }); +// Infer dynamic axis (-1) in view_sizes given tensor_sizes +bool inferViewShape( + c10::List tensor_sizes, + c10::List view_sizes) { + int64_t dynamic_index = -1; + size_t view_size_num_elements = 1; + for (size_t idx = 0; idx < view_sizes.size(); ++idx) { + if (view_sizes[idx] == -1) { + TORCH_INTERNAL_ASSERT( + dynamic_index == -1, "Only one dimension can by inferred.") + dynamic_index = idx; + } else { + TORCH_INTERNAL_ASSERT(view_sizes[idx] > 0); + view_size_num_elements *= view_sizes[idx]; + } + } + const size_t kNumElements = std::accumulate( + tensor_sizes.begin(), tensor_sizes.end(), 1, std::multiplies<>()); + + if (kNumElements % view_size_num_elements != 0) { + return false; + } + + if (dynamic_index != -1) { + view_sizes[dynamic_index] = kNumElements / view_size_num_elements; + } + + return true; +} + +//! [ Note -- type guard logic in CudaFusionViewGuard ] +//! +//! CudaFusionViewGuard is used to guard input tensors to a `CudaFusionGroup` +//! that contains view operations, so that we would not feed inputs that +//! violate the graph defined in `GraphCache`. +//! +//! output = view(self, view-sizes) +//! +//! View Guard Inputs: +//! 1. self tensor_sizes - dynamic size List[Int] +//! 2. view_sizes - profile_ivalue List[Int] +//! 3. tensor_constraint - Constant List[Int] +//! 4. view_sizes_constraint - Constant List[Int] +//! +//! Things that we check: +//! 1. The #dimensions are the same for self tensor and its constraint +//! 2. The #dimensions are the same for view-sizes and its constraint +//! 3. Self tensor does not violate its constraint +//! a. Queue unrestricted sizes +//! b. Calculate #elements in self tensor +//! 4. view-sizes does not violate its constraint +//! a. Pop unrestricted sizes from queue +//! b. Calculate #elements in view-sizes +//! 5. The #elements is the same for self tensor and view-sizes +//! +//! Constraints: +//! A restricted axis creates a graph constraint, so its sizes is static. +//! An unrestricted axis is allowed to have a dynamic size, if it is consistent +//! between self tensor and view-sizes. It is marked with -1 in the constraint. +//! Only iterDomains with the Keep transform are dynamic. All other transforms +//! create a static constraint. +//! 
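// [Illustrative sketch, not part of the original commit] A self-contained
// restatement of the constraint check described in the note above and
// implemented by checkViewGuard just below. The concrete sizes in main() are
// hypothetical; only the [-1, -1, -1, 6] / [-1, -1, -1, 2, 3] constraints are
// taken from the example graph further down.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <queue>
#include <vector>

bool checkViewGuardSketch(
    const std::vector<int64_t>& tensor_sizes,
    const std::vector<int64_t>& view_sizes,
    const std::vector<int64_t>& tensor_constraint,
    const std::vector<int64_t>& view_sizes_constraint) {
  // 1. The number of dimensions must match the constraints
  if (tensor_constraint.size() != tensor_sizes.size() ||
      view_sizes_constraint.size() != view_sizes.size()) {
    return false;
  }
  std::queue<int64_t> dynamic_axes; // unrestricted (-1) tensor sizes, in order
  // 2/3. Static tensor axes must match; queue the unrestricted ones
  int64_t tensor_numel = 1;
  for (std::size_t i = 0; i < tensor_sizes.size(); ++i) {
    if (tensor_constraint[i] == -1) {
      dynamic_axes.push(tensor_sizes[i]);
    } else if (tensor_constraint[i] != tensor_sizes[i]) {
      return false;
    }
    tensor_numel *= tensor_sizes[i];
  }
  // 4. Static view axes must match; dynamic ones must agree with the queue
  int64_t view_numel = 1;
  for (std::size_t i = 0; i < view_sizes.size(); ++i) {
    if (view_sizes_constraint[i] == -1 && dynamic_axes.empty()) {
      return false; // malformed constraint pair
    }
    const int64_t expected = view_sizes_constraint[i] == -1
        ? dynamic_axes.front()
        : view_sizes_constraint[i];
    if (expected != view_sizes[i]) {
      return false;
    }
    view_numel *= expected;
    if (view_sizes_constraint[i] == -1) {
      dynamic_axes.pop();
    }
  }
  // 5. The view must preserve the number of elements
  return tensor_numel == view_numel;
}

int main() {
  // e.g. broadcast sizes [2, 3, 4, 6] viewed as [2, 3, 4, 2, 3]
  assert(checkViewGuardSketch(
      {2, 3, 4, 6}, {2, 3, 4, 2, 3}, {-1, -1, -1, 6}, {-1, -1, -1, 2, 3}));
  // Changing the statically constrained trailing axis (6 -> 8) must fail
  assert(!checkViewGuardSketch(
      {2, 3, 4, 8}, {2, 3, 4, 2, 3}, {-1, -1, -1, 6}, {-1, -1, -1, 2, 3}));
  return 0;
}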
+bool checkViewGuard( + c10::List tensor_sizes, + c10::List view_sizes, + c10::List tensor_constraint, + c10::List view_sizes_constraint) { + // 1: Num Dimensions Check + if (tensor_constraint.size() != tensor_sizes.size() || + view_sizes_constraint.size() != view_sizes.size()) { + return false; + } + + // If axis allows dynamic sizes, then add tensor size to this queue. + // For dynamic axes in view_sizes, check that it is consistent with + // the corresponding tensor size. + std::queue dynamic_axis_queue; + + // 2. Tensor Static Check + int64_t tensor_size_product = 1; + for (const auto idx : c10::irange(tensor_sizes.size())) { + if (tensor_constraint[idx] == -1) { + dynamic_axis_queue.push(tensor_sizes[idx]); + } else if (tensor_constraint[idx] != tensor_sizes[idx]) { + return false; + } + tensor_size_product *= tensor_sizes[idx]; + } + + // 3. View-Sizes Static Check + int64_t view_size_product = 1; + for (const auto idx : c10::irange(view_sizes.size())) { + auto dynamic_size = (view_sizes_constraint[idx] == -1) + ? dynamic_axis_queue.front() + : view_sizes_constraint[idx]; + if (dynamic_size != view_sizes[idx]) { + return false; + } + view_size_product *= dynamic_size; + if (view_sizes_constraint[idx] == -1) { + dynamic_axis_queue.pop(); + } + } + + // 4. Check view invariant + // The number of elements in the input and output tensors are the same. + return tensor_size_product == view_size_product; +} + +//! +//! CudaFusionViewGuard Example Graph: +//! +//! graph(%self : __torch__.BiasViewRelu, +//! %inputs.1 : Tensor): +//! %2 : int = prim::Constant[value=-1]() # dynamic_bvg.py:50:40 +//! %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25 +//! %4 : NoneType = prim::Constant() +//! %5 : int[] = prim::Constant[value=[2, 3]]() +//! %6 : int[] = aten::size(%inputs.1) # dynamic_bvg.py:50:25 +//! %7 : int[] = aten::slice(%6, %4, %2, %3) # dynamic_bvg.py:50:25 +//! %view_shape.1 : int[] = aten::add(%7, %5) # dynamic_bvg.py:50:25 +//! %bias : Tensor = prim::GetAttr[name="bias"](%self) +//! %10 : int[] = aten::size(%bias) +//! %11 : int[] = prim::BroadcastSizes(%6, %10) +//! %12 : bool = prim::CudaFusionGuard[types=[...]](%inputs.1, %bias) +//! %13 : int[] = prim::Constant[value=[-1, -1, -1, 6]]() +//! %14 : int[] = prim::Constant[value=[-1, -1, -1, 2, 3]]() +//! %15 : bool = prim::CudaFusionViewGuard(%11, %view_shape.1, %13, %14) +//! %16 : bool[] = prim::ListConstruct(%15, %12) +//! %17 : bool = aten::all(%16) +//! %18 : Tensor = prim::If(%17) +//! block0(): +//! %19 : Tensor = prim::CudaFusionGroup_0[cache_id=0](%inputs.1, %bias) +//! -> (%19) +//! block1(): +//! %20 : Function = prim::Constant[name="fallback_fn", fallback=1]() +//! %21 : (...) = prim::CallFunction(%20, %inputs.1, %bias, %view_shape.1) +//! %22 : Float(...) = prim::TupleUnpack(%21) +//! -> (%22) +//! return (%18) +//! with prim::CudaFusionGroup_0 = graph(%0 : Float(...), +//! %1 : Float(...)): +//! %2 : int[] = prim::Constant[value=[2, 3, 4, 2, 3]]() +//! %3 : int = prim::Constant[value=1]() # dynamic_bvg.py:50:25 +//! %o.1 : Float(...) = aten::add(%0, %1, %3) # dynamic_bvg.py:51:16 +//! %5 : Float(...) = prim::view_copy(%o.1, %2) +//! %6 : Float(...) = aten::relu(%5) # dynamic_bvg.py:53:19 +//! return (%6) +//! +RegisterOperators view_guard({ + Operator( + "prim::CudaFusionViewGuard(...) -> bool", + // prim::CudaFusionViewGuard returns a fresh Boolean type without + // aliasing. if we would ever return refined tensor, which would change + // aliasing analysis, we should update aliasdb pass. 
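  // [Illustrative note, not part of the original commit] The lambda below
  // first runs inferViewShape on the profiled view sizes: e.g. with
  // tensor_sizes = [2, 3, 4, 6] (144 elements) and view_sizes = [2, 3, 4, -1, 3],
  // the known view sizes multiply to 72, 144 % 72 == 0, and the -1 axis is
  // inferred as 144 / 72 = 2. If the division does not work out evenly
  // (say view_sizes = [5, -1], since 144 % 5 != 0), the guard returns false
  // before any constraint is checked.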
+ [](const Node* node) -> Operation { + return [](Stack& stack) { + // view_sizes_constraint - Constant List[Int] + at::ArrayRef inputs = last(stack, 4); + + // tensor_sizes is the runtime size for the self tensor + // tensor_sizes - dynamic size List[Int] + TORCH_INTERNAL_ASSERT( + inputs[0].isIntList(), "tensor_sizes needs to be Int List"); + auto tensor_sizes = inputs[0].toIntList(); + + // profiled_view_sizes is the runtime view size + // profiled_view_sizes - profile_ivalue List[Int] + TORCH_INTERNAL_ASSERT( + inputs[1].isIntList(), + "profiled_view_sizes needs to be Int list"); + auto profiled_view_sizes = inputs[1].toIntList(); + + // tensor_constraint is a constant List[Int] + // used to guard tensor_sizes + TORCH_INTERNAL_ASSERT( + inputs[2].isIntList(), + "tensor constraint needs to be Int List"); + auto tensor_constraint = inputs[2].toIntList(); + + // view_sizes_constraint is a constant List[Int] + // used to guard profiled_view_sizes + TORCH_INTERNAL_ASSERT( + inputs[3].isIntList(), + "view_sizes constraint needs to be Int List"); + auto view_sizes_constraint = inputs[3].toIntList(); + + // Drop after gather all input arguments + // If an argument is moved, it is destroyed when dropped from stack + drop(stack, 4); + + auto status = inferViewShape(tensor_sizes, profiled_view_sizes); + if (!status) { + push(stack, IValue(false)); + return; + } + + if (!fuser::cuda::getCudaFusionGuardMode()) { + push(stack, IValue(true)); + return; + } + + auto guard_status = checkViewGuard( + tensor_sizes, + profiled_view_sizes, + tensor_constraint, + view_sizes_constraint); + push(stack, IValue(guard_status)); + return; + }; + }, + aliasAnalysisFromSchema()), +}); + // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) RegisterOperators reg_add_optional({ Operator( @@ -346,6 +560,160 @@ RegisterOperators reg_add_optional({ }, aliasAnalysisFromSchema()), }); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_view_copy({ + Operator( + "prim::view_copy(Tensor self, int[] size) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "view_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, size; + pop(stack, self, size); + push(stack, at::native::view(self.toTensor(), size.toIntVector())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_reshape_copy({ + Operator( + "prim::reshape_copy(Tensor self, int[] shape) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "reshape_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, shape; + pop(stack, self, shape); + push( + stack, + at::native::reshape(self.toTensor(), shape.toIntVector())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_squeeze_copy({ + Operator( + "prim::squeeze_copy(Tensor self) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "squeeze_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self; + pop(stack, self); + 
push(stack, at::squeeze(self.toTensor())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_squeeze_dim_copy({ + Operator( + "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "squeeze_dim_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, dim; + pop(stack, self, dim); + push(stack, at::squeeze(self.toTensor(), dim.toInt())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_unsqueeze_copy({ + Operator( + "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor", + [](const Node* node) -> Operation { + return [node](Stack& stack) { + TORCH_CHECK( + node->s(attr::name) == "CudaFusionGroup", + "unsqueeze_copy is only used by nvfuser to identify non-mutating ", + "alias ops, should be restored after fusion pass!"); + IValue self, dim; + pop(stack, self, dim); + push(stack, at::unsqueeze(self.toTensor(), dim.toInt())); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_infer_unsqueeze_size({ + Operator( + "prim::infer_unsqueeze_size(int[] a, int dim) -> int[]", + [](const Node* node) -> Operation { + return [](Stack& stack) { + auto dim = pop(stack).toInt(); + auto size = pop(stack).toIntVector(); + if (dim < 0) { + dim = dim + 1 + size.size(); + } + auto it = size.begin() + dim; + size.insert(it, 1); + push(stack, IValue(size)); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_infer_squeeze_dim_size({ + Operator( + "prim::infer_squeeze_size(int[] a, int dim) -> int[]", + [](const Node* node) -> Operation { + return [](Stack& stack) { + auto dim = pop(stack).toInt(); + auto size = pop(stack).toIntVector(); + if (dim < 0) { + dim = dim + size.size(); + } + auto it = size.begin() + dim; + if (*it == 1) { + size.erase(it); + } + push(stack, IValue(size)); + }; + }, + aliasAnalysisFromSchema()), +}); + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +RegisterOperators reg_infer_squeeze_size({ + Operator( + "prim::infer_squeeze_size.dim(int[] a) -> int[]", + [](const Node* node) -> Operation { + return [](Stack& stack) { + auto size = pop(stack).toIntVector(); + + for (auto it = size.begin(); it != size.end(); it++) { + if (*it == 1) { + auto pre = it - 1; + size.erase(it); + it = pre; + } + } + push(stack, IValue(size)); + }; + }, + aliasAnalysisFromSchema()), +}); + } // namespace } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/interface.h b/torch/csrc/jit/codegen/cuda/interface.h index 1ab9e6d8008..8afa854ea5c 100644 --- a/torch/csrc/jit/codegen/cuda/interface.h +++ b/torch/csrc/jit/codegen/cuda/interface.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp index cf3d9c7a8c7..6a094c104df 100644 --- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.cpp @@ -1,8 +1,12 @@ #include #include #include +#include #include #include +#include +#include +#include #include #include @@ -20,16 +24,20 @@ namespace jit { namespace fuser { 
namespace cuda { +Statement::Statement(IrBuilderPasskey passkey) { + ir_container_ = passkey.ir_container_; +} + Statement::Statement(const Statement* src, IrCloner* ir_cloner) { - // IRCloner when cloning to a new fusion will copy the names of the original - // fusion. If we're cloning into the same fusion, we let Val and Expr get - // their names as usual by registering with the current fusion in their - // constructors, so don't overwrite that here. - if (src->fusion() != ir_cloner->fusion()) { - name_ = src->name_; - } - fusion_ = ir_cloner->fusion(); - ir_cloner->registerClone(src, this); + ir_container_ = ir_cloner->container(); +} + +void Statement::setName(IrContainerPasskey, StmtNameType name) { + name_ = name; +} + +void Statement::setName(IrBuilderPasskey, StmtNameType name) { + name_ = name; } Val* Statement::asVal() { @@ -42,23 +50,36 @@ Expr* Statement::asExpr() { return this->as(); } -void Statement::print() const { - IrPrinter ir_printer(std::cout); +std::string Statement::toString() const { + std::stringstream ss; + IrPrinter ir_printer(ss); ir_printer.handle(this); - std::cout << std::endl; + return ss.str(); +} + +std::string Statement::toInlineString() const { + std::stringstream ss; + IrPrinter ir_printer(ss); + ir_printer.print_inline(this); + return ss.str(); +} + +Fusion* Statement::fusion() const { + TORCH_INTERNAL_ASSERT( + ir_container_->isA(), "Statement does not belong to a fusion."); + return ir_container_->as(); +} + +kir::Kernel* Statement::kernel() const { + TORCH_INTERNAL_ASSERT( + ir_container_->isA(), + "Statement does not belong to a kernel."); + return ir_container_->as(); } // When we create a Val we immediately register them with the active fusion. -Val::Val(ValType _vtype, DataType _dtype, bool register_val) - : vtype_(_vtype), dtype_(_dtype) { - Fusion* fusion = FusionGuard::getCurFusion(); - TORCH_CHECK( - fusion != nullptr, "No active fusion group found when creating a Val."); - fusion_ = fusion; - if (register_val) { - name_ = fusion_->registerVal(this); - } -} +Val::Val(IrBuilderPasskey passkey, ValType _vtype, DataType _dtype) + : Statement(passkey), vtype_(_vtype), dtype_(_dtype) {} // NOTE: we don't clone the definition_ and uses_ here // since they may introduce cloning cycles. Instead, we copy @@ -71,12 +92,7 @@ Val::Val(const Val* src, IrCloner* ir_cloner) vtype_(src->vtype_), dtype_(src->dtype_), is_fusion_input_(src->is_fusion_input_), - is_fusion_output_(src->is_fusion_output_) { - // If we're "cloning" into the same fusion, register with the fusion - if (src->fusion() == ir_cloner->fusion()) { - name_ = src->fusion()->registerVal(this); - } -} + is_fusion_output_(src->is_fusion_output_) {} const std::vector& Val::uses() const { if (vtype_ == ValType::TensorView) { @@ -92,33 +108,33 @@ namespace { // Traverse definition of all values involved in constructing the provided val. // Check if all values involved are constant values, meaning the provided // val is also a constant value. 
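// [Illustrative sketch, not part of the original commit] The visitor below
// does, in effect, the following recursion: a scalar is a compile-time
// constant iff every leaf reached through its defining expressions is a
// constant literal (a NamedScalar leaf makes it non-constant). A toy
// stand-in with hypothetical types:
//
//   #include <memory>
//   #include <vector>
//
//   struct ToyVal {
//     bool is_const_leaf = false;                // e.g. a literal Int/Double/Bool
//     std::vector<std::shared_ptr<ToyVal>> definition_inputs; // empty for leaves
//   };
//
//   bool isConstScalarSketch(const ToyVal& v) {
//     if (v.definition_inputs.empty()) {
//       return v.is_const_leaf;                  // leaf: constant only if a literal
//     }
//     for (const auto& inp : v.definition_inputs) {
//       if (!isConstScalarSketch(*inp)) {
//         return false;                          // any non-constant input poisons it
//       }
//     }
//     return true;                               // all inputs constant
//   }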
-class ConstCheck : OptOutConstDispatch { +class ConstCheck : private OptOutConstDispatch { private: bool is_const_ = true; - void handle(const Bool* b) override { + void handle(const Bool* b) final { is_const_ = is_const_ && b->isConst(); } - void handle(const Double* d) override { + void handle(const Double* d) final { is_const_ = is_const_ && d->isConst(); } - void handle(const Int* i) override { + void handle(const Int* i) final { is_const_ = is_const_ && i->isConst(); } - void handle(const NamedScalar* ns) override { + void handle(const NamedScalar* ns) final { is_const_ = is_const_ && false; } - void handle(const Expr* expr) override { + void handle(const Expr* expr) final { for (auto inp : expr->inputs()) { handle(inp); } } - void handle(const Val* val) override { + void handle(const Val* val) final { if (val->definition() != nullptr) { handle(val->definition()); } else { @@ -137,15 +153,18 @@ class ConstCheck : OptOutConstDispatch { } // namespace bool Val::isConstScalar() const { - if (!isScalar()) + if (!isScalar()) { return false; + } return ConstCheck::isConst(this); } c10::optional Val::getInt() const { if (isConstScalar() && isAnInt()) { if (this->getValType() == ValType::Scalar) { - return this->as()->value(); + if (this->isA()) { + return this->as()->value(); + } } } return c10::optional(); @@ -169,7 +188,7 @@ c10::optional Val::getDataType() const { bool Val::isProducerOf(const Val* other) const { TORCH_INTERNAL_ASSERT(other != nullptr); - TORCH_INTERNAL_ASSERT(fusion() == other->fusion()); + TORCH_INTERNAL_ASSERT(container() == other->container()); if (definition() == nullptr) { return false; @@ -186,23 +205,14 @@ bool Val::isConsumerOf(const Val* other) const { // We don't register with the active fusion in Expr as this needs to be done // after inputs and outputs are registered with the Expr -Expr::Expr(ExprType type) : type_{type} { - Fusion* fusion = FusionGuard::getCurFusion(); - if (fusion == nullptr) - TORCH_CHECK(false, "No active fusion group found when creating an Expr."); - fusion_ = fusion; -} +Expr::Expr(IrBuilderPasskey passkey, ExprType etype) + : Statement(passkey), etype_{etype} {} Expr::Expr(const Expr* src, IrCloner* ir_cloner) : Statement(src, ir_cloner), - type_(src->type_), + etype_(src->etype_), inputs_(ir_cloner->clone(src->inputs_)), - outputs_(ir_cloner->clone(src->outputs_)) { - // If we're "cloning" into the same fusion, register with the fusion - if (src->fusion() == ir_cloner->fusion()) { - name_ = src->fusion()->registerExpr(this); - } -} + outputs_(ir_cloner->clone(src->outputs_)) {} bool Expr::sameAs(const Statement* other) const { if (this == other) { @@ -227,6 +237,30 @@ bool Expr::sameAs(const Statement* other) const { return true; } +kir::Predicate* Expr::predicate() const { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + return predicate_; +} + +void Expr::setPredicate(kir::Predicate* predicate) { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + predicate_ = predicate; +} + +kir::Predicate* Expr::writePredicate() const { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + return write_predicate_; +} + +void Expr::setWritePredicate(kir::Predicate* write_predicate) { + TORCH_INTERNAL_ASSERT( + container()->isA(), "Function invalid for fusion."); + write_predicate_ = write_predicate; +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h 
b/torch/csrc/jit/codegen/cuda/ir_base_nodes.h index 2e0fa0885bd..1b8444fae46 100644 --- a/torch/csrc/jit/codegen/cuda/ir_base_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_base_nodes.h @@ -1,9 +1,9 @@ #pragma once #include +#include #include #include -#include #include #include @@ -35,6 +35,8 @@ namespace jit { namespace fuser { namespace cuda { +using ValueId = int32_t; + using StmtNameType = unsigned int; constexpr StmtNameType kInvalidStmName = @@ -48,6 +50,22 @@ class UnaryOp; class BinaryOp; class IterDomain; class IrCloner; +class IrContainer; +class IrBuilderPasskey; +class IrContainerPasskey; + +namespace kir { +class Kernel; +class Predicate; +} // namespace kir + +// Passkey for container to register names with statements +class ExprPasskey { + friend class Expr; + + private: + explicit ExprPasskey() {} +}; TORCH_CUDA_CU_API void swap(Fusion& a, Fusion& b) noexcept; @@ -60,12 +78,12 @@ TORCH_CUDA_CU_API void swap(Fusion& a, Fusion& b) noexcept; //! is also important for the design to have a dispatch system for a Statment. //! Basically beinng able to succienctly traverse down the inhereitance stack of //! a Statment at runtime. This is currently implemented in dispatch.h -//! class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { friend void swap(Fusion&, Fusion&) noexcept; + friend void swap(IrContainer& a, IrContainer& b) noexcept; public: - Statement() = default; + Statement() = delete; // Cloning constructor Statement(const Statement* src, IrCloner* ir_cloner); @@ -78,7 +96,7 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { static void constDispatch(T handler, const Statement* const); template - static Statement* mutatorDispatch(T mutator, Statement*); + static void mutatorDispatch(T mutator, Statement*); // Accessor functions to types. Vals always have a DataType, Exprs never do virtual c10::optional getValType() const { @@ -106,8 +124,14 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { Expr* asExpr(); // Return the fusion this statement belongs to - Fusion* fusion() const { - return fusion_; + Fusion* fusion() const; + + // Return the kernel this statement belongs to + kir::Kernel* kernel() const; + + // Return the container this statement belongs to + IrContainer* container() const { + return ir_container_; } // Return the int that represents its name @@ -115,6 +139,13 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { return name_; } + // Set the statements' name. Typically the container will set the name, + // however if we're dealing with cloning, IrBuilder will set the name, this + // maybe should be from IrCloner, however I didn't want to add another + // passkey. + void setName(IrContainerPasskey, StmtNameType name); + void setName(IrBuilderPasskey, StmtNameType name); + virtual bool sameType(const Statement* const other) { if (isVal() && other->isVal()) return getValType().value() == other->getValType().value(); @@ -129,13 +160,17 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { return this == other; } - void print() const; + std::string toString() const; + std::string toInlineString() const; protected: + Statement(IrBuilderPasskey); + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) StmtNameType name_ = kInvalidStmName; + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) - Fusion* fusion_ = nullptr; + IrContainer* ir_container_ = nullptr; }; //! 
A Val represents a "value." These are objects, like tensors, scalars, and @@ -169,34 +204,43 @@ class TORCH_CUDA_CU_API Statement : public NonCopyable, public PolymorphicBase { //! class TORCH_CUDA_CU_API Val : public Statement { public: - // We may not want to register this value during Val's constructor. The reason - // for this is that if we register the val, then in a derived constructor try - // to throw, fusion's destructor will get called, but the pointer to this Val - // will be invalid. When fusion tries to delete this value it will cause a seg - // fault, instead of showing the thrown error. explicit Val( + IrBuilderPasskey, ValType _vtype, - DataType _dtype = DataType::Null, - bool register_val = true); + DataType _dtype = DataType::Null); Val(const Val* src, IrCloner* ir_cloner); - // TODO: why is this optional? - // + // Dispatch functions, definitions in dispatch.cpp + template + static void dispatch(T handler, Val*); + + template + static void constDispatch(T handler, const Val* const); + + template + static void mutatorDispatch(T mutator, Val*); + c10::optional getValType() const override { return vtype_; } + ValType vtype() const { + return vtype_; + } + + DataType dtype() const { + return dtype_; + } + // Throws if no DataType is found. Vals must have a DataType - // - // TODO: why is this optional? - // c10::optional getDataType() const override; bool isScalar() const { return vtype_ == ValType::Scalar || vtype_ == ValType::NamedScalar; } + // Returns if all dependencies are constant scalars bool isConstScalar() const; bool isAnInt() const { @@ -205,6 +249,11 @@ class TORCH_CUDA_CU_API Val : public Statement { c10::optional getInt() const; + // Returns if no dependencies and is a constant scalar. + virtual bool isConst() const { + return false; + } + bool isZeroInt() const; bool isOneInt() const; @@ -254,15 +303,11 @@ class TORCH_CUDA_CU_API Val : public Statement { return evaluator_index_; } - // Dispatch functions, definitions in dispatch.cpp - template - static void dispatch(T handler, Val*); - - template - static void constDispatch(T handler, const Val* const); - - template - static Statement* mutatorDispatch(T mutator, Val*); + // Following is managed by Fusion (or kirIrBuilder) and can change. + // TODO: Protect with a passkey. + void setDefinition(Expr* expr) { + definition_ = expr; + } protected: friend Fusion; @@ -272,19 +317,17 @@ class TORCH_CUDA_CU_API Val : public Statement { // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) const DataType dtype_; - // Following is managed by Fusion and can change. - void setDefinition(Expr* expr) { - definition_ = expr; - } - + // TODO: Add fusion passkey for this void setIsFusionInput(bool is_fusion_input) { is_fusion_input_ = is_fusion_input; } + // TODO: Add fusion passkey for this void setIsFusionOutput(bool is_fusion_output) { is_fusion_output_ = is_fusion_output; } + // TODO: Add fusion or container passkey for this void setUses(const std::vector& uses) { uses_ = uses; } @@ -297,6 +340,7 @@ class TORCH_CUDA_CU_API Val : public Statement { Expr* definition_ = nullptr; std::vector uses_; + // Expr evaluator idx; int evaluator_index_ = -1; }; @@ -342,15 +386,16 @@ class TORCH_CUDA_CU_API Val : public Statement { //! 
class TORCH_CUDA_CU_API Expr : public Statement { public: - explicit Expr(ExprType type); + explicit Expr(IrBuilderPasskey, ExprType type); + Expr(const Expr* src, IrCloner* ir_cloner); c10::optional getExprType() const override { - return type_; + return etype_; } - ExprType type() const { - return type_; + ExprType etype() const { + return etype_; } bool sameAs(const Statement* other) const override; @@ -380,23 +425,46 @@ class TORCH_CUDA_CU_API Expr : public Statement { static void constDispatch(T handler, const Expr* const); template - static Statement* mutatorDispatch(T mutator, Expr*); + static void mutatorDispatch(T mutator, Expr*); + + // TODO: Protect based on being in kernel container + kir::Predicate* predicate() const; + + // TODO: Protect based on being in kernel container + void setPredicate(kir::Predicate* predicate); + + // TODO: Protect based on being in kernel container + kir::Predicate* writePredicate() const; + + // TODO: Protect based on being in kernel container + void setWritePredicate(kir::Predicate* write_predicate); protected: + // TODO: Add Fusion passkey void addInput(Val* input) { TORCH_INTERNAL_ASSERT(input != nullptr); inputs_.push_back(input); } + // TODO: Add Fusion passkey void addOutput(Val* output) { TORCH_INTERNAL_ASSERT(output != nullptr); outputs_.push_back(output); } + ExprPasskey exprPasskey() { + return ExprPasskey(); + } + private: - ExprType type_ = ExprType::Invalid; + ExprType etype_ = ExprType::Invalid; std::vector inputs_; std::vector outputs_; + + kir::Predicate* predicate_ = nullptr; + + // Only used for reduction-related expressions + kir::Predicate* write_predicate_ = nullptr; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp b/torch/csrc/jit/codegen/cuda/ir_builder.cpp similarity index 50% rename from torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp rename to torch/csrc/jit/codegen/cuda/ir_builder.cpp index ce3e17d74d2..17a4e59cfb6 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_builder.cpp @@ -1,35 +1,97 @@ -#include +#include +#include +#include +#include namespace torch { namespace jit { namespace fuser { namespace cuda { -namespace kir { + +//! Clone an IR node, forwarding the arguments to the IrCloner constructor. +template +T* IrBuilder::clone(const T* src, IrCloner* ir_cloner) { + TORCH_INTERNAL_ASSERT( + ir_cloner != nullptr, + "Cannot use create when a cloner object is set. 
Use clone."); + + TORCH_INTERNAL_ASSERT( + ir_cloner->container() != nullptr, + "Cloner doesn't have a valid container to store cloned object."); + + T* dest = new T(src, ir_cloner); + const Statement* src_stmt = dynamic_cast(src); + Statement* dest_stmt = dynamic_cast(dest); + + auto dest_container = ir_cloner->container(); + auto src_container = src_stmt->container(); + + dest_container->registerStmt(IrBuilderPasskey(dest_container), dest_stmt); + + if (src_container != dest_container) { + dest_stmt->setName(IrBuilderPasskey(dest_container), src_stmt->name()); + } + + ir_cloner->registerClone(src_stmt, dest_stmt); + + return dest; +} + +#define IR_BUILDER_INSTANTIATE(T) \ + template T* IrBuilder::clone(const T* src, IrCloner* ir_cloner); + +// Vals +IR_BUILDER_INSTANTIATE(IterDomain) +IR_BUILDER_INSTANTIATE(TensorDomain) +IR_BUILDER_INSTANTIATE(TensorView) +IR_BUILDER_INSTANTIATE(Bool) +IR_BUILDER_INSTANTIATE(Double) +IR_BUILDER_INSTANTIATE(Int) +IR_BUILDER_INSTANTIATE(NamedScalar) + +// Exprs +IR_BUILDER_INSTANTIATE(Split) +IR_BUILDER_INSTANTIATE(Merge) +IR_BUILDER_INSTANTIATE(TransposeOp) +IR_BUILDER_INSTANTIATE(ShiftOp) +IR_BUILDER_INSTANTIATE(GatherOp) +IR_BUILDER_INSTANTIATE(ViewOp) +IR_BUILDER_INSTANTIATE(UnaryOp) +IR_BUILDER_INSTANTIATE(BinaryOp) +IR_BUILDER_INSTANTIATE(TernaryOp) +IR_BUILDER_INSTANTIATE(ReductionOp) +IR_BUILDER_INSTANTIATE(WelfordOp) +IR_BUILDER_INSTANTIATE(BroadcastOp) Val* IrBuilder::newResult(DataType dtype) { switch (dtype) { case DataType::Bool: - return create(c10::nullopt); + return IrBuilder::create(c10::nullopt); case DataType::Double: - return create(c10::nullopt); + return IrBuilder::create(c10::nullopt); case DataType::Int: - return create(c10::nullopt); + return IrBuilder::create(c10::nullopt); default: TORCH_CHECK(false, "Unexpected data type"); } } Val* IrBuilder::newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { - TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types"); + TORCH_CHECK( + lhs->dtype() == rhs->dtype(), + "Incompatible operand types: ", + lhs->dtype(), + " and ", + rhs->dtype()); auto result = newResult(lhs->dtype()); - create(op_type, result, lhs, rhs); + IrBuilder::create(op_type, result, lhs, rhs); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) return result; } Val* IrBuilder::newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { - auto result = create(c10::nullopt); - create(op_type, result, lhs, rhs); + auto result = IrBuilder::create(c10::nullopt); + IrBuilder::create(op_type, result, lhs, rhs); // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) return result; } @@ -37,37 +99,37 @@ Val* IrBuilder::newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs) { Val* IrBuilder::whereExpr(Val* pred, Val* lhs, Val* rhs) { TORCH_CHECK(lhs->dtype() == rhs->dtype(), "Incompatible operand types"); auto result = newResult(lhs->dtype()); - create(TernaryOpType::Where, result, pred, lhs, rhs); + IrBuilder::create(TernaryOpType::Where, result, pred, lhs, rhs); return result; } Val* IrBuilder::negExpr(Val* val) { auto result = newResult(val->dtype()); - create(UnaryOpType::Neg, result, val); + IrBuilder::create(UnaryOpType::Neg, result, val); return result; } Val* IrBuilder::notExpr(Val* val) { auto result = newResult(val->dtype()); - create(UnaryOpType::Not, result, val); + IrBuilder::create(UnaryOpType::Not, result, val); return result; } Val* IrBuilder::setExpr(Val* val) { auto result = newResult(val->dtype()); - create(UnaryOpType::Set, result, val); + IrBuilder::create(UnaryOpType::Set, 
result, val); return result; } Val* IrBuilder::setExprNamedScalar(const std::string& name, Val* val) { - auto result = create(name, val->dtype()); - create(UnaryOpType::Set, result, val); + auto result = IrBuilder::create(name, val->dtype()); + IrBuilder::create(UnaryOpType::Set, result, val); return result; } Val* IrBuilder::addressExprNamedScalar(const std::string& name, Val* val) { - auto result = create(name, DataType::Int); - create(UnaryOpType::Address, result, val); + auto result = IrBuilder::create(name, DataType::Int); + IrBuilder::create(UnaryOpType::Address, result, val); return result; } @@ -127,45 +189,10 @@ Val* IrBuilder::minExpr(Val* lhs, Val* rhs) { return newArithmeticExpr(BinaryOpType::Min, lhs, rhs); } -Int* IrBuilder::zeroVal() { - if (zero_ == nullptr) { - zero_ = create(0); - } - return zero_; -} - -Int* IrBuilder::oneVal() { - if (one_ == nullptr) { - one_ = create(1); - } - return one_; -} - -Bool* IrBuilder::falseVal() { - if (false_ == nullptr) { - false_ = create(false); - } - return false_; -} - -Bool* IrBuilder::trueVal() { - if (true_ == nullptr) { - true_ = create(true); - } - return true_; -} - -NamedScalar* IrBuilder::magicZeroVal() { - if (magic_zero_ == nullptr) { - magic_zero_ = create(kMagicZeroName, DataType::Int); - } - return magic_zero_; -} - Val* SimplifyingIrBuilder::negExpr(Val* val) { - if (auto int_val = dynamic_cast(val)) { + if (auto int_val = dynamic_cast(val)) { if (int_val->isConst()) { - return create(-int_val->value().value()); + return IrBuilder::create(-int_val->value().value()); } } return IrBuilder::negExpr(val); @@ -175,9 +202,9 @@ Val* SimplifyingIrBuilder::notExpr(Val* val) { if (auto bool_val = dynamic_cast(val)) { if (bool_val->isConst()) { if (bool_val->value().value()) { - return falseVal(); + return FusionGuard::getCurFusion()->falseVal(); } else { - return trueVal(); + return FusionGuard::getCurFusion()->trueVal(); } } } @@ -188,13 +215,13 @@ Val* SimplifyingIrBuilder::addExpr(Int* lhs, Int::ScalarType rhs) { if (rhs == 0) { return lhs; } else if (lhs == nullptr) { - return IrBuilder::create(rhs); + return IrBuilder::IrBuilder::create(rhs); } else if (lhs->isConst()) { - return IrBuilder::create(lhs->value().value() + rhs); + return IrBuilder::IrBuilder::create(lhs->value().value() + rhs); } else if (rhs > 0) { - return IrBuilder::addExpr(lhs, IrBuilder::create(rhs)); + return IrBuilder::addExpr(lhs, IrBuilder::IrBuilder::create(rhs)); } else { - return IrBuilder::subExpr(lhs, IrBuilder::create(-rhs)); + return IrBuilder::subExpr(lhs, IrBuilder::IrBuilder::create(-rhs)); } } @@ -228,6 +255,15 @@ Val* SimplifyingIrBuilder::addExpr(Val* lhs, Val* rhs) { } } +Val* SimplifyingIrBuilder::addExpr(Val* lhs, Int::ScalarType rhs) { + auto lhs_int = dynamic_cast(lhs); + if (lhs_int != nullptr) { + return addExpr(lhs_int, rhs); + } else { + return addExpr(lhs, IrBuilder::create(rhs)); + } +} + Val* SimplifyingIrBuilder::subExpr(Val* lhs, Val* rhs) { return addExpr(lhs, negExpr(rhs)); } @@ -257,9 +293,9 @@ Val* SimplifyingIrBuilder::andExpr(Val* lhs, Val* rhs) { } if (lhs_definitely_true && rhs_definitely_true) { - return trueVal(); + return FusionGuard::getCurFusion()->trueVal(); } else if (lhs_definitely_false || rhs_definitely_false) { - return falseVal(); + return FusionGuard::getCurFusion()->falseVal(); } else if (lhs_definitely_true) { return rhs; } else if (rhs_definitely_true) { @@ -269,7 +305,65 @@ Val* SimplifyingIrBuilder::andExpr(Val* lhs, Val* rhs) { return IrBuilder::andExpr(lhs, rhs); } -} // namespace kir +namespace { + 
+template +Val* minOrMaxExpr( + Int* lhs, + Int* rhs, + IrBuilderFunc ir_builder_func, + IntFunc int_func) { + if (rhs == nullptr) { + return lhs; + } else if (lhs == nullptr) { + return rhs; + } else if (lhs->isConst() && rhs->isConst()) { + return IrBuilder::create( + int_func(lhs->value().value(), rhs->value().value())); + } else { + return ir_builder_func(lhs, rhs); + } +} + +template +Val* minOrMaxExpr( + Val* lhs, + Val* rhs, + IrBuilderFunc ir_builder_func, + IntFunc int_func) { + TORCH_INTERNAL_ASSERT(lhs != nullptr || rhs != nullptr); + if (lhs == nullptr) { + return rhs; + } else if (rhs == nullptr || lhs == rhs) { + return lhs; + } + auto lhs_int = dynamic_cast(lhs); + auto rhs_int = dynamic_cast(rhs); + if (lhs_int != nullptr && rhs_int != nullptr) { + return minOrMaxExpr(lhs_int, rhs_int, ir_builder_func, int_func); + } else { + return ir_builder_func(lhs, rhs); + } +} + +} // namespace + +Val* SimplifyingIrBuilder::maxExpr(Val* lhs, Val* rhs) { + return minOrMaxExpr( + lhs, + rhs, + [](Val* lhs, Val* rhs) { return IrBuilder::maxExpr(lhs, rhs); }, + [](int64_t lhs, int64_t rhs) { return std::max(lhs, rhs); }); +} + +Val* SimplifyingIrBuilder::minExpr(Val* lhs, Val* rhs) { + return minOrMaxExpr( + lhs, + rhs, + [](Val* lhs, Val* rhs) { return IrBuilder::minExpr(lhs, rhs); }, + [](int64_t lhs, int64_t rhs) { return std::min(lhs, rhs); }); +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/ir_builder.h b/torch/csrc/jit/codegen/cuda/ir_builder.h new file mode 100644 index 00000000000..5087f2832a9 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ir_builder.h @@ -0,0 +1,127 @@ +#pragma once + +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace kir { +class Kernel; +} + +class IrCloner; + +// Passkey for builder to register properties with statements, and to call +// functions in IrContainer +class TORCH_CUDA_CU_API IrBuilderPasskey { + friend class IrBuilder; + + public: + // TODO: Collapse ir_container and Kernel once Kernel inherits from + // IrContainer + IrContainer* const ir_container_ = nullptr; + + private: + explicit IrBuilderPasskey(IrContainer* ir_container); +}; + +//! IR builder interface +class TORCH_CUDA_CU_API IrBuilder { + public: + //! Allocate a new IR node, forwarding the arguments to the appropriate + //! constructor and registering with the container + template + static T* create(Args&&... args) { + auto container = FusionGuard::getCurFusion(); + // return create(container, std::forward(args)...); + TORCH_INTERNAL_ASSERT( + container != nullptr, "Need an active container to build IR."); + T* node = new T(IrBuilderPasskey(container), std::forward(args)...); + + container->registerStmt(IrBuilderPasskey(container), node); + + return node; + } + + //! Allocate a new IR node, forwarding the arguments to the appropriate + //! constructor and registering with the container + template + static T* create(IrContainer* container, Args&&... args) { + TORCH_INTERNAL_ASSERT( + container != nullptr, "Need an active container to build IR."); + T* node = new T(IrBuilderPasskey(container), std::forward(args)...); + + container->registerStmt(IrBuilderPasskey(container), node); + + return node; + } + + //! Clone an IR node, forwarding the arguments to the IrCloner constructor. + //! Register clones with IrCloner's target container. 
+ template + static T* clone(const T* src, IrCloner* ir_cloner); + + // Unary operations + static Val* negExpr(Val* val); + static Val* notExpr(Val* val); + static Val* setExpr(Val* val); + static Val* setExprNamedScalar(const std::string& name, Val* val); + static Val* addressExprNamedScalar(const std::string& name, Val* val); + + // Binary operations + static Val* andExpr(Val* lhs, Val* rhs); + static Val* eqExpr(Val* lhs, Val* rhs); + static Val* gtExpr(Val* lhs, Val* rhs); + static Val* ltExpr(Val* lhs, Val* rhs); + static Val* leExpr(Val* lhs, Val* rhs); + static Val* geExpr(Val* lhs, Val* rhs); + static Val* addExpr(Val* lhs, Val* rhs); + static Val* subExpr(Val* lhs, Val* rhs); + static Val* mulExpr(Val* lhs, Val* rhs); + static Val* divExpr(Val* lhs, Val* rhs); + static Val* ceilDivExpr(Val* lhs, Val* rhs); + static Val* modExpr(Val* lhs, Val* rhs); + static Val* maxExpr(Val* lhs, Val* rhs); + static Val* minExpr(Val* lhs, Val* rhs); + + // Ternary operations + static Val* whereExpr(Val* pred, Val* lhs, Val* rhs); + + private: + static Val* newResult(DataType dtype); + static Val* newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs); + static Val* newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs); +}; + +//! A wrapper builder with static expression simplification +//! +//! Example: +//! - addExpr(new Int(1), new Int(2)) -> Int(3) +//! - addExpr(new Int(0), new NamedScalar("foo")) -> NamedScalar("foo") +//! +//! Designed to be used to simplify predicate and index expressions in +//! generated code. Also, the shift validation may fail without +//! this simplification. +class TORCH_CUDA_CU_API SimplifyingIrBuilder : public IrBuilder { + public: + static Val* negExpr(Val* val); + static Val* notExpr(Val* val); + + static Val* addExpr(Int* lhs, Int::ScalarType rhs); + static Val* addExpr(Val* lhs, Int::ScalarType rhs); + static Val* addExpr(Int* lhs, Int* rhs); + static Val* addExpr(Val* lhs, Val* rhs); + static Val* subExpr(Val* lhs, Val* rhs); + static Val* andExpr(Val* lhs, Val* rhs); + static Val* maxExpr(Val* lhs, Val* rhs); + static Val* minExpr(Val* lhs, Val* rhs); +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp index 7e5a9cfa8bc..8a1717e8d05 100644 --- a/torch/csrc/jit/codegen/cuda/ir_cloner.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_cloner.cpp @@ -2,12 +2,15 @@ #include #include +#include namespace torch { namespace jit { namespace fuser { namespace cuda { +IrCloner::IrCloner(IrContainer* container) : ir_container_(container) {} + Statement* IrCloner::clone(const Statement* statement) { if (statement == nullptr) { return nullptr; @@ -30,7 +33,6 @@ Statement* IrCloner::clone(const Statement* statement) { // that something went horribly wrong. 
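The IrBuilder / SimplifyingIrBuilder interface introduced above replaces direct new calls: nodes are created through IrBuilder::create<T>(...) so they are registered with the active container, and SimplifyingIrBuilder folds constant operands where it can (per the "addExpr(new Int(1), new Int(2)) -> Int(3)" example in its class comment). A rough usage sketch, assuming the nvfuser headers and namespace and an active fusion; it is not taken from the patch:

    Fusion fusion;
    FusionGuard fg(&fusion);

    auto* one = IrBuilder::create<Int>(1);   // constant, registered with `fusion`
    auto* two = IrBuilder::create<Int>(2);
    auto* sym = IrBuilder::create<Int>();    // symbolic scalar, no value yet

    // Constant operands fold to Int(3); a symbolic operand still produces a
    // real add expression in the container.
    Val* folded = SimplifyingIrBuilder::addExpr(one, two);
    Val* summed = SimplifyingIrBuilder::addExpr(sym, two);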
TORCH_INTERNAL_ASSERT(new_node != nullptr); TORCH_INTERNAL_ASSERT(clones_map_[statement] == new_node); - TORCH_INTERNAL_ASSERT(new_node->fusion() == fusion_); return new_node; } @@ -39,7 +41,6 @@ Statement* IrCloner::clone(const Statement* statement) { void IrCloner::registerClone(const Statement* src, Statement* clone) { TORCH_CHECK(src != nullptr); TORCH_CHECK(clone != nullptr); - TORCH_CHECK(clone->fusion() == fusion_); TORCH_CHECK(clones_map_.insert({src, clone}).second); } @@ -56,79 +57,79 @@ void IrCloner::handle(const Expr* e) { } void IrCloner::handle(const TensorDomain* td) { - clone_ = new TensorDomain(td, this); + clone_ = IrBuilder::clone(td, this); } void IrCloner::handle(const IterDomain* id) { - clone_ = new IterDomain(id, this); + clone_ = IrBuilder::clone(id, this); } void IrCloner::handle(const Bool* b) { - clone_ = new Bool(b, this); + clone_ = IrBuilder::clone(b, this); } void IrCloner::handle(const Double* d) { - clone_ = new Double(d, this); + clone_ = IrBuilder::clone(d, this); } void IrCloner::handle(const Int* i) { - clone_ = new Int(i, this); + clone_ = IrBuilder::clone(i, this); } void IrCloner::handle(const NamedScalar* named_scalar) { - clone_ = new NamedScalar(named_scalar, this); + clone_ = IrBuilder::clone(named_scalar, this); } void IrCloner::handle(const TensorView* tv) { - clone_ = new TensorView(tv, this); + clone_ = IrBuilder::clone(tv, this); } void IrCloner::handle(const UnaryOp* op) { - clone_ = new UnaryOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const BinaryOp* op) { - clone_ = new BinaryOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const TernaryOp* op) { - clone_ = new TernaryOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const BroadcastOp* op) { - clone_ = new BroadcastOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const ReductionOp* op) { - clone_ = new ReductionOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const WelfordOp* op) { - clone_ = new WelfordOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const TransposeOp* op) { - clone_ = new TransposeOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const ShiftOp* op) { - clone_ = new ShiftOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const GatherOp* op) { - clone_ = new GatherOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const ViewOp* op) { - clone_ = new ViewOp(op, this); + clone_ = IrBuilder::clone(op, this); } void IrCloner::handle(const Split* split) { - clone_ = new Split(split, this); + clone_ = IrBuilder::clone(split, this); } void IrCloner::handle(const Merge* merge) { - clone_ = new Merge(merge, this); + clone_ = IrBuilder::clone(merge, this); } TensorView* RecomputeTv::recompute(TensorView* tv) { @@ -141,7 +142,7 @@ TensorView* RecomputeTv::recompute(TensorView* tv) { "Cannot recompute buffers that are inputs of the fusion."); // Grab all the expressions used to generate the TensorView - auto exprs = ExprSort::getExprs(tv->fusion(), {tv}); + auto exprs = StmtSort::getExprs(tv->fusion(), {tv}, false); // Run the replicator RecomputeTv replicator(tv->fusion(), exprs); @@ -161,7 +162,7 @@ TensorView* RecomputeTv::recompute(TensorView* tv) { } RecomputeTv::RecomputeTv(Fusion* fusion, std::vector exprs) - : IrCloner(fusion) { + : IrCloner(fusion), fusion_(fusion) { // Add inputs to the clones map to prevent cloning them. 
for (const auto inp : fusion->inputs()) { clones_map_[inp] = inp; @@ -183,7 +184,7 @@ void RecomputeTv::handle(const TensorDomain* td) { // Make sure to recompute the history of the iteration domains, explicitly go // through the expressions and send them to IrCloner. auto exprs = - ExprSort::getExprs(fusion(), {td->domain().begin(), td->domain().end()}); + StmtSort::getExprs(fusion_, {td->domain().begin(), td->domain().end()}); for (auto expr : exprs) { IrCloner::handle(expr); diff --git a/torch/csrc/jit/codegen/cuda/ir_cloner.h b/torch/csrc/jit/codegen/cuda/ir_cloner.h index ac83d9edb09..1755b9e9563 100644 --- a/torch/csrc/jit/codegen/cuda/ir_cloner.h +++ b/torch/csrc/jit/codegen/cuda/ir_cloner.h @@ -1,7 +1,8 @@ #pragma once -#include +#include #include +#include #include #include @@ -11,7 +12,7 @@ namespace jit { namespace fuser { namespace cuda { -class Fusion; +class IrContainer; //! Clones nodes from an exiting Fusion //! @@ -21,10 +22,11 @@ class Fusion; //! class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { friend class Statement; + friend class IrBuilder; public: // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - explicit IrCloner(Fusion* new_fusion) : fusion_(new_fusion) {} + explicit IrCloner(IrContainer* container); Statement* clone(const Statement* statement); @@ -45,8 +47,8 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { return copy; } - Fusion* fusion() const { - return fusion_; + IrContainer* container() const { + return ir_container_; } protected: @@ -86,12 +88,15 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch { private: // The destination Fusion container - Fusion* fusion_ = nullptr; + IrContainer* ir_container_ = nullptr; // The dispatch interface doesn't allow returning values from // individual `handle()` methods, so they are storing the // result here Statement* clone_ = nullptr; + + // Builder to make all the new nodes + IrBuilder builder_; }; // Replicates all expressions used to generate the provided TensorView. 
Does not @@ -105,7 +110,9 @@ class RecomputeTv : private IrCloner { private: RecomputeTv(Fusion* fusion, std::vector exprs); - void handle(const TensorDomain*) override; + void handle(const TensorDomain*) final; + + Fusion* fusion_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/ir_container.cpp b/torch/csrc/jit/codegen/cuda/ir_container.cpp new file mode 100644 index 00000000000..e84418eb973 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ir_container.cpp @@ -0,0 +1,279 @@ +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +void swap(IrContainer& a, IrContainer& b) noexcept { + FUSER_PERF_SCOPE("Fusion swap"); + + using std::swap; + + // Swap the content + swap(a.vals_up_, b.vals_up_); + swap(a.vals_, b.vals_); + + swap(a.exprs_up_, b.exprs_up_); + swap(a.exprs_, b.exprs_); + + swap(a.raw_ptrs_, b.raw_ptrs_); + + swap(a.val_type_name_map_, b.val_type_name_map_); + swap(a.expr_name_counter_, b.expr_name_counter_); + + // Fixup the Statement::fusion_ links for a + for (auto val : a.vals_) { + val->ir_container_ = &a; + } + for (auto expr : a.exprs_) { + expr->ir_container_ = &a; + } + + // Fixup the Statement::fusion_ links for b + for (auto val : b.vals_) { + val->ir_container_ = &a; + } + for (auto expr : b.exprs_) { + expr->ir_container_ = &a; + } +} + +IrCloner IrContainer::copy(const IrContainer* from, IrContainer* to) { + to->clear(); + IrCloner ir_cloner(to); + + for (auto val : from->vals_) { + to->vals_.insert(ir_cloner.clone(val)); + } + + for (auto expr : from->exprs_) { + to->exprs_.insert(ir_cloner.clone(expr)); + } + + to->val_type_name_map_ = from->val_type_name_map_; + to->expr_name_counter_ = from->expr_name_counter_; + + return ir_cloner; +} + +IrContainer::IrContainer() = default; + +IrContainer::IrContainer(const IrContainer& other) { + FUSER_PERF_SCOPE("IrContainer copy"); + IrContainer::copy(&other, this); +} + +IrContainer::IrContainer(IrContainer&& other) noexcept { + FUSER_PERF_SCOPE("IrContainer move"); + swap(*this, other); +} + +IrContainer& IrContainer::operator=(const IrContainer& other) { + FUSER_PERF_SCOPE("IrContainer copy assign"); + IrContainer copy(other); + clear(); + swap(*this, copy); + return *this; +} + +IrContainer& IrContainer::operator=(IrContainer&& other) noexcept { + FUSER_PERF_SCOPE("IrContainer move assign"); + clear(); + swap(*this, other); + return *this; +} + +IrContainer::~IrContainer() { + clear(); +} + +//! Register the Statement with this container +void IrContainer::registerStmt(IrBuilderPasskey, Statement* stmt) { + if (stmt->isVal()) { + registerVal(stmt->asVal()); + } else { + registerExpr(stmt->asExpr()); + } +} + +//! Register the Val with this container +void IrContainer::registerVal(IrBuilderPasskey, Val* val) { + registerVal(val); +} + +//! Register expr with this container. 
+void IrContainer::registerExpr(IrBuilderPasskey, Expr* expr) { + registerExpr(expr); +} + +void IrContainer::registerExpr(ExprPasskey, Expr* expr) { + registerExpr(expr); +} + +void IrContainer::removeExpr(Expr* expr) { + TORCH_INTERNAL_ASSERT( + exprs_.find(expr) != exprs_.end(), + "Wanted to remove an expression but it doesn't exist in this container."); + auto expr_in_deque = std::find_if( + exprs_up_.begin(), + exprs_up_.end(), + [expr](std::unique_ptr& expr_up) { return expr_up.get() == expr; }); + + TORCH_INTERNAL_ASSERT( + expr_in_deque != exprs_up_.end(), + "Wanted to remove an expression but its unique ptr is missing."); + + exprs_.erase(expr); + exprs_up_.erase(expr_in_deque); + raw_ptrs_.erase((void*)expr); +} + +//! Completely remove val from the fusion, break all dependencies associated +//! with it +void IrContainer::removeVal(Val* val) { + // Don't remove shortcuts + if (val == true_val_.get() || val == false_val_.get() || + val == one_val_.get() || val == zero_val_.get() || + val == magic_zero_val_.get()) { + return; + } + + TORCH_INTERNAL_ASSERT( + vals_.find(val) != vals_.end(), + "Wanted to remove a value but it doesn't exist in this container."); + auto val_in_deque = std::find_if( + vals_up_.begin(), vals_up_.end(), [val](std::unique_ptr& val_up) { + return val_up.get() == val; + }); + + TORCH_INTERNAL_ASSERT( + val_in_deque != vals_up_.end(), + "Wanted to remove a value but its unique ptr is missing."); + + vals_.erase(val); + vals_up_.erase(val_in_deque); + raw_ptrs_.erase((void*)val); +} + +//! Register the Val with this container +void IrContainer::registerVal(Val* val) { + if (inContainer(val)) { + return; + } + + vals_up_.emplace_back(std::unique_ptr(val)); + vals_.emplace(vals_up_.back().get()); + val->setName(IrContainerPasskey(), getValName(vals_up_.back()->vtype())); + raw_ptrs_.emplace((void*)vals_up_.back().get()); +} + +//! Register expr with this container. 
+void IrContainer::registerExpr(Expr* expr) { + if (inContainer(expr)) { + return; + } + exprs_up_.emplace_back(std::unique_ptr(expr)); + exprs_.emplace(exprs_up_.back().get()); + expr->setName(IrContainerPasskey(), getExprName()); + raw_ptrs_.emplace((void*)exprs_up_.back().get()); +} + +void IrContainer::clear() noexcept { + FUSER_PERF_SCOPE("IrContainer clear"); + vals_.clear(); + vals_up_.clear(); + exprs_.clear(); + exprs_up_.clear(); + raw_ptrs_.clear(); + + val_type_name_map_.clear(); + expr_name_counter_ = 0; +} + +bool IrContainer::inContainer(const Statement* stmt) const { + const void* const_void = (const void*)(stmt); + void* nonconst_void = const_cast(const_void); // NOLINT + if (raw_ptrs_.find(nonconst_void) == raw_ptrs_.end()) { + return false; + } + + TORCH_INTERNAL_ASSERT( + stmt->container() == this, + "Container claims to own stmt, but stmt disagrees."); + + Statement* nonconst_stmt = const_cast(stmt); // NOLINT + if (stmt->isExpr()) { + TORCH_INTERNAL_ASSERT( + exprs_.find(nonconst_stmt->as()) != exprs_.end(), + "Somehow container claims to and not to own an Expr."); + } + if (stmt->isVal()) { + TORCH_INTERNAL_ASSERT( + vals_.find(nonconst_stmt->as()) != vals_.end(), + "Somehow container claims to and not to own an Val."); + } + + return true; +} + +// Shortcuts for frequently used vals +Int* IrContainer::zeroVal() { + if (!zero_val_) { + auto zero_val = IrBuilder::create(this, 0); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == zero_val); + zero_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return zero_val_.get(); +} + +Int* IrContainer::oneVal() { + if (!one_val_) { + auto one_val = IrBuilder::create(this, 1); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == one_val); + one_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return one_val_.get(); +} + +Bool* IrContainer::falseVal() { + if (!false_val_) { + auto false_val = IrBuilder::create(this, false); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == false_val); + false_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return false_val_.get(); +} + +Bool* IrContainer::trueVal() { + if (!true_val_) { + auto true_val = IrBuilder::create(this, true); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == true_val); + true_val_ = std::unique_ptr(vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return true_val_.get(); +} + +NamedScalar* IrContainer::magicZeroVal() { + if (!magic_zero_val_) { + auto magic_zero = + IrBuilder::create(kMagicZeroName, DataType::Int); + TORCH_INTERNAL_ASSERT(vals_up_.back().get() == magic_zero); + magic_zero_val_ = std::unique_ptr( + vals_up_.back().release()->as()); + vals_up_.pop_back(); + } + return magic_zero_val_.get(); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_container.h b/torch/csrc/jit/codegen/cuda/ir_container.h new file mode 100644 index 00000000000..fb1aaeaf383 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ir_container.h @@ -0,0 +1,174 @@ +#pragma once + +#include + +#include +#include + +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class IrBuilderPasskey; +class ExprPasskey; +class OptOutMutator; + +class Int; +class Bool; +class NamedScalar; + +// Passkey for container to register names with statements +class IrContainerPasskey { + friend class IrContainer; + + private: + explicit IrContainerPasskey() {} +}; 
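The *Passkey classes added in this patch (IrBuilderPasskey, IrContainerPasskey, ExprPasskey) follow the C++ passkey idiom: a method stays public, but its key argument can only be constructed by a designated friend, so only that friend can actually call it. A minimal generic sketch of the idiom with hypothetical names (not the actual nvfuser classes):

    class Builder;

    // Only Builder can construct a key, so only Builder can call
    // Widget::setName even though setName itself is public.
    class BuilderKey {
      friend class Builder;

     private:
      BuilderKey() = default;
    };

    class Widget {
     public:
      void setName(BuilderKey, int name) { name_ = name; }

     private:
      int name_ = -1;
    };

    class Builder {
     public:
      static void nameWidget(Widget& w, int name) { w.setName(BuilderKey{}, name); }
    };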
+ +class TORCH_CUDA_CU_API IrContainer : public PolymorphicBase { + public: + IrContainer(); + + IrContainer(const IrContainer& other); + IrContainer(IrContainer&& other) noexcept; + + IrContainer& operator=(const IrContainer& other); + IrContainer& operator=(IrContainer&& other) noexcept; + + virtual ~IrContainer(); + + bool inContainer(const Statement* stmt) const; + + void assertInContainer(const Statement* stmt, const std::string& msg) const { + TORCH_CHECK( + inContainer(stmt), msg, " it was not found in the active container."); + } + + //! Return in insertion order + const std::deque deterministic_vals() const noexcept { + std::deque vals_deque; + std::transform( + vals_up_.begin(), + vals_up_.end(), + std::back_inserter(vals_deque), + [](const std::unique_ptr& val_up) { return val_up.get(); }); + return vals_deque; + } + + //! Register the Statement with this container + virtual void registerStmt(IrBuilderPasskey, Statement* stmt); + + //! Register the Val with this container + virtual void registerVal(IrBuilderPasskey, Val* val); + + //! Register expr with this container. + virtual void registerExpr(IrBuilderPasskey, Expr* expr); + + //! Allow expr's to register themselves with a container, this is only used + //! for broadcastOp so it can register itself in its constructor so root maps + //! can be built. + virtual void registerExpr(ExprPasskey, Expr* expr); + + //! Return the set of Exprs registered with this fusion. Warning: This will + //! return exprs outside inputs/outputs, so can be unsafe for use with + //! segmented fusions. + const std::unordered_set& unordered_exprs() const noexcept { + return exprs_; + } + + //! Return the set of Vals registered with this fusion + const std::unordered_set& vals() const noexcept { + return vals_; + } + + // Shortcuts for frequently used vals + Int* zeroVal(); + Int* oneVal(); + Bool* falseVal(); + Bool* trueVal(); + NamedScalar* magicZeroVal(); + + protected: + static IrCloner copy(const IrContainer* from, IrContainer* to); + + friend void swap(IrContainer& a, IrContainer& b) noexcept; + + // Let mutator remove Exprs. + friend OptOutMutator; + + virtual void removeExpr(Expr* expr); + + //! Completely remove val from the fusion, break all dependencies associated + //! with it + virtual void removeVal(Val* val); + + //! Register the Val with this container + virtual void registerVal(Val* val); + + //! Register expr with this container. + virtual void registerExpr(Expr* expr); + + StmtNameType getValName(ValType vtype) { + if (val_type_name_map_.find(vtype) == val_type_name_map_.end()) { + val_type_name_map_[vtype] = 0; + } + return val_type_name_map_[vtype]++; + } + + StmtNameType getExprName() { + return expr_name_counter_++; + } + + void clear() noexcept; + + // Deque of unique pointer is the memory owning data structure + std::deque> vals_up_; + + // A convenient set to return when we just need an unordered set to do + // something like check if a Val is in this container + std::unordered_set vals_; + + // Deque of unique pointer is the memory owning data structure + std::deque> exprs_up_; + + // A convenient set to return when we just need an unordered set to do + // something like check if an Expr is in this container + std::unordered_set exprs_; + + // Used to implement a generic "inContainer" that can be passed an invalid + // pointer. Specifically a pointer to a Statement owned by another container + // that has been freed. 
We can't check normally with the unordered_sets we + // already have because it would require a const_cast from a constant + // expr/val, or a dynamic cast from a Statement. + std::unordered_set raw_ptrs_; + + // Values names counters + std::unordered_map val_type_name_map_; + + // Expression names counter + StmtNameType expr_name_counter_ = 0; + + // Manually store some persistent, frequently used nodes. It's very + // challenging to do this anything but manually as detecting when a container + // may or may not have one of these vals is tricky. Specifically because if + // the container doesn't own it, it's hard to understand from the outside if + // the node may have been removed then re-registered. It could also be tricky + // to know when we're using a different container as in FusionCopy_test + // demonstrates deleting then creating containers can result in the same + // pointer for the container. + std::unique_ptr true_val_; + std::unique_ptr false_val_; + std::unique_ptr one_val_; + std::unique_ptr zero_val_; + std::unique_ptr magic_zero_val_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp index 5ca8d54aaa9..7511fbd4d6d 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -303,13 +304,13 @@ void IrGraphGenerator::generateScheduleGraph() { // Maybe not the best way to handle the root domain, but should be okay addArc( tv, - new TensorDomain(tv->getRootDomain()), + IrBuilder::create(tv->getRootDomain()), "[style=dashed, color=green, arrowhead=none]"); if (tv->domain()->hasRFactor()) addArc( tv, - new TensorDomain(tv->domain()->getRFactorDomain()), + IrBuilder::create(tv->domain()->getRFactorDomain()), "[style=dashed, color=green, arrowhead=none]"); } } diff --git a/torch/csrc/jit/codegen/cuda/ir_graphviz.h b/torch/csrc/jit/codegen/cuda/ir_graphviz.h index 1144d95eb15..f9b3adf703d 100644 --- a/torch/csrc/jit/codegen/cuda/ir_graphviz.h +++ b/torch/csrc/jit/codegen/cuda/ir_graphviz.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h index 02c319d3665..28478c64d91 100644 --- a/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_interface_nodes.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -19,6 +19,9 @@ namespace cuda { class WelfordResult; class ViewTransform; +class IrCloner; +class IrBuilderPasskey; + //! A Bool value //! //! This value can be a symbolic value (defined after the kernel @@ -26,17 +29,18 @@ class ViewTransform; //! 
class TORCH_CUDA_CU_API Bool : public Val { public: - Bool() : Val(ValType::Scalar, DataType::Bool), maybe_value_{c10::nullopt} {} + Bool(IrBuilderPasskey passkey); - explicit Bool(bool value) - : Val(ValType::Scalar, DataType::Bool), maybe_value_{value} {} + explicit Bool(IrBuilderPasskey passkey, bool value); + + explicit Bool(IrBuilderPasskey passkey, c10::optional value); Bool(const Bool* src, IrCloner* ir_cloner); bool isSymbolic() const { return !(maybe_value_.has_value()); } - bool isConst() const { + bool isConst() const final { return maybe_value_.has_value(); } c10::optional value() const { @@ -56,18 +60,18 @@ class TORCH_CUDA_CU_API Double : public Val { public: using ScalarType = double; - Double() - : Val(ValType::Scalar, DataType::Double), maybe_value_{c10::nullopt} {} + Double(IrBuilderPasskey passkey); - explicit Double(ScalarType value) - : Val(ValType::Scalar, DataType::Double), maybe_value_{value} {} + explicit Double(IrBuilderPasskey passkey, ScalarType value); + + explicit Double(IrBuilderPasskey passkey, c10::optional value); Double(const Double* src, IrCloner* ir_cloner); bool isSymbolic() const { return !(maybe_value_.has_value()); } - bool isConst() const { + bool isConst() const final { return maybe_value_.has_value(); } c10::optional value() const { @@ -86,17 +90,18 @@ class TORCH_CUDA_CU_API Int : public Val { public: using ScalarType = int64_t; - Int() : Val(ValType::Scalar, DataType::Int), maybe_value_{c10::nullopt} {} + Int(IrBuilderPasskey passkey); - explicit Int(ScalarType value) - : Val(ValType::Scalar, DataType::Int), maybe_value_{value} {} + explicit Int(IrBuilderPasskey passkey, ScalarType value); + + explicit Int(IrBuilderPasskey passkey, c10::optional value); Int(const Int* src, IrCloner* ir_cloner); bool isSymbolic() const { return !(maybe_value_.has_value()); } - bool isConst() const { + bool isConst() const final { return maybe_value_.has_value(); } c10::optional value() const { @@ -152,14 +157,18 @@ class TVDomainGuard; class TORCH_CUDA_CU_API TensorView : public Val { public: TensorView( + IrBuilderPasskey passkey, TensorDomain* domain, DataType dtype, MemoryType mtype = MemoryType::Local); - explicit TensorView(const std::shared_ptr& tensor_type); + explicit TensorView( + IrBuilderPasskey passkey, + const std::shared_ptr& tensor_type); - explicit TensorView(const std::shared_ptr& jit_value) - : TensorView(jit_value->type()->cast()) {} + explicit TensorView( + IrBuilderPasskey passkey, + const std::shared_ptr& jit_value); TensorView(const TensorView* src, IrCloner* ir_cloner); @@ -187,6 +196,16 @@ class TORCH_CUDA_CU_API TensorView : public Val { //! trivial reductions bool hasAnyReduction() const; + //! Returns true if this tensor is zero dimensional, + //! i.e. a wrapped scalar or an empty placeholder. + bool isZeroDim() const { + return nDims() == 0; + } + + //! Returns true if this tensor does not contain + //! any value. + bool isEmptyTensor() const; + c10::optional getReductionAxis() const; const std::vector& getRootDomain() const; @@ -210,6 +229,24 @@ class TORCH_CUDA_CU_API TensorView : public Val { size_t nDims() const; + // sets cpu_scalar_ value, which is special handling for CPU based zero-dim + // tensors (i.e. CPU Tensors that only have one value). This is only used if + // on an input value, otherwise ignored. 
This is important as special handling + // because these "scalars" should be type promoted as a tensor, but we want to + // avoid explicit copying of the data, so we want to pass the data value as a + // standard kernel argument value. + void setCpuScalar(bool is_cpu_scalar); + + // returns cpu_scalar_ value, which is special handling for CPU based zero-dim + // tensors (i.e. CPU Tensors that only have one value). This is only used if + // on an input value, otherwise ignored. This is important as special handling + // because these "scalars" should be type promoted as a tensor, but we want to + // avoid explicit copying of the data, so we want to pass the data value as a + // standard kernel argument value. + bool isCpuScalar() const { + return cpu_scalar_; + } + // Returns the position that this tensor is produced at relative to its axes. unsigned int getComputeAtPosition() const { return compute_at_pos_; @@ -356,6 +393,13 @@ class TORCH_CUDA_CU_API TensorView : public Val { return axes_to_swizzle_; } + // Apply double buffering transformation + void doubleBuffer(); + + bool isDoubleBuffered() const { + return is_double_buffered_; + } + friend TORCH_CUDA_CU_API TransformPropagator; friend TORCH_CUDA_CU_API TransformReplay; friend TORCH_CUDA_CU_API OptOutMutator; @@ -393,6 +437,14 @@ class TORCH_CUDA_CU_API TensorView : public Val { MemoryType memory_type_ = MemoryType::Local; SwizzleType swizzle_type_ = SwizzleType::NoSwizzle; std::vector axes_to_swizzle_; + bool is_double_buffered_ = false; + // special handling for CPU based zero-dim tensors (i.e. CPU Tensors that only + // have one value). This is only used if on an input value, otherwise ignored. + // This is important as special handling because these "scalars" should be + // type promoted as a tensor, but we want to avoid explicit copying of the + // data, so we want to pass the data value as a standard kernel argument + // value. + bool cpu_scalar_ = false; }; //! A simple TensorView builder diff --git a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h index 8fd4475d2dd..bb494148be2 100644 --- a/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h +++ b/torch/csrc/jit/codegen/cuda/ir_internal_nodes.h @@ -1,10 +1,11 @@ #pragma once -#include +#include #include #include #include +#include //! Nodes in here should generally not be used by users. They should be behind //! the scenes and users shouldn't have to be aware of what they do to use the @@ -20,6 +21,8 @@ namespace fuser { namespace cuda { class ViewTransform; +class Scope; +class IrCloner; //! Returns true if both v1 and v2 are scalars, are the same type of scalars, //! and dispatches to the inherited Val type's `->sameAs` call. e.g. if both @@ -34,7 +37,7 @@ bool areEqualScalars(Val* v1, Val* v2); //! 4) split/merge class TORCH_CUDA_CU_API UnaryOp : public Expr { public: - UnaryOp(UnaryOpType type, Val* out, Val* in); + UnaryOp(IrBuilderPasskey, UnaryOpType type, Val* out, Val* in); UnaryOp(const UnaryOp* src, IrCloner* ir_cloner); @@ -63,7 +66,7 @@ class TORCH_CUDA_CU_API UnaryOp : public Expr { //! 2) LT (A < B) class TORCH_CUDA_CU_API BinaryOp : public Expr { public: - BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs); + BinaryOp(IrBuilderPasskey, BinaryOpType type, Val* out, Val* lhs, Val* rhs); BinaryOp(const BinaryOp* src, IrCloner* ir_cloner); @@ -97,7 +100,11 @@ class TORCH_CUDA_CU_API BroadcastOp : public Expr { //! \param out The output tensor //! \param in The input tensor //! 
\param is_broadcast_dims True when output dim is a new broadcast domain - BroadcastOp(Val* out, Val* in, std::vector is_broadcast_dims); + BroadcastOp( + IrBuilderPasskey, + Val* out, + Val* in, + std::vector is_broadcast_dims); BroadcastOp(const BroadcastOp* src, IrCloner* ir_cloner); @@ -138,7 +145,12 @@ class TORCH_CUDA_CU_API BroadcastOp : public Expr { //! non-reduction/non-broadcast dimensions. class TORCH_CUDA_CU_API ReductionOp : public Expr { public: - ReductionOp(BinaryOpType reduction_op_type, Val* init, Val* out, Val* in); + ReductionOp( + IrBuilderPasskey, + BinaryOpType reduction_op_type, + Val* init, + Val* out, + Val* in); ReductionOp(const ReductionOp* src, IrCloner* ir_cloner); @@ -169,6 +181,7 @@ class TORCH_CUDA_CU_API ReductionOp : public Expr { class TORCH_CUDA_CU_API WelfordOp : public Expr { public: WelfordOp( + IrBuilderPasskey, Val* out_avg, Val* out_var, Val* out_N, @@ -189,10 +202,6 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { return in_avg_; } - Val* init() const { - return init_avg_; - } - bool sameAs(const Statement* const other) const override; // Welford Accessors @@ -255,7 +264,11 @@ class TORCH_CUDA_CU_API WelfordOp : public Expr { class TORCH_CUDA_CU_API TransposeOp : public Expr { public: - TransposeOp(TensorView* out, TensorView* in, std::vector new2old); + TransposeOp( + IrBuilderPasskey, + TensorView* out, + TensorView* in, + std::vector new2old); TransposeOp(const TransposeOp* src, IrCloner* ir_cloner); @@ -279,7 +292,13 @@ class TORCH_CUDA_CU_API TransposeOp : public Expr { class TORCH_CUDA_CU_API TernaryOp : public Expr { public: - TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3); + TernaryOp( + IrBuilderPasskey, + TernaryOpType type, + Val* out, + Val* in1, + Val* in2, + Val* in3); TernaryOp(const TernaryOp* src, IrCloner* ir_cloner); @@ -317,7 +336,12 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { //! \param out //! \param in //! \param offsets - ShiftOp(Val* out, Val* in, std::vector offsets, bool pad); + ShiftOp( + IrBuilderPasskey, + Val* out, + Val* in, + std::vector offsets, + std::vector pad_width); ShiftOp(const ShiftOp* src, IrCloner* ir_cloner); @@ -336,8 +360,14 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { return offsets_; } - bool pad() const { - return pad_; + const std::vector& padWidth() const { + return pad_width_; + } + + bool hasPadding() const { + return std::any_of(pad_width_.begin(), pad_width_.end(), [](const auto p) { + return p > 0; + }); } bool sameAs(const Statement* other) const override; @@ -349,17 +379,18 @@ class TORCH_CUDA_CU_API ShiftOp : public Expr { //! offsets_. The sign of each value indicates the direction of //! shifting. const std::vector offsets_; - const bool pad_; + const std::vector pad_width_; }; //! Gather a window around each element. class TORCH_CUDA_CU_API GatherOp : public Expr { public: GatherOp( + IrBuilderPasskey, Val* out, Val* in, - std::vector window_shape, - std::vector> pad_width); + std::vector window_shape, + std::vector> pad_width); GatherOp(const GatherOp* src, IrCloner* ir_cloner); @@ -381,20 +412,26 @@ class TORCH_CUDA_CU_API GatherOp : public Expr { return pad_width_; } + bool hasPadding() const { + return std::any_of(pad_width_.begin(), pad_width_.end(), [](const auto& p) { + return p[0] > 0 || p[1] > 0; + }); + } + bool sameAs(const Statement* other) const override; private: Val* const out_ = nullptr; Val* const in_ = nullptr; //! Shape of a window gathered for each element. - std::vector window_shape_; + std::vector window_shape_; //! 
The size of zero-padding of each axis. - std::vector> pad_width_; + std::vector> pad_width_; }; class TORCH_CUDA_CU_API ViewOp : public Expr { public: - ViewOp(TensorView* out, TensorView* in); + ViewOp(IrBuilderPasskey, TensorView* out, TensorView* in); ViewOp(const ViewOp* src, IrCloner* ir_cloner); @@ -422,6 +459,7 @@ class IndexReferenceReplay; class TORCH_CUDA_CU_API IterDomain : public Val { public: IterDomain( + IrBuilderPasskey, Val* start, Val* extent, ParallelType parallel_type = ParallelType::Serial, @@ -429,6 +467,7 @@ class TORCH_CUDA_CU_API IterDomain : public Val { bool is_rfactor_domain = false); IterDomain( + IrBuilderPasskey, Val* start, Val* extent, Val* stop_offset, @@ -441,20 +480,7 @@ class TORCH_CUDA_CU_API IterDomain : public Val { bool sameAs(const Statement* other) const override; // Returns a new IterDomain matching properties of this - // TODO: parallel_method->getParallelType - IterDomain* clone() const { - auto cloned = new IterDomain( - start(), - extent(), - stopOffset(), - getParallelType(), - getIterType(), - isRFactorProduct()); - - cloned->is_padded_dimension_ = is_padded_dimension_; - cloned->padded_to_size_ = padded_to_size_; - return cloned; - } + IterDomain* clone() const; //! Clone a vector domains static std::vector clone( @@ -631,6 +657,11 @@ class TORCH_CUDA_CU_API IterDomain : public Val { //! domain. std::pair stridedSplit(int factor); + // TODO: Remove + bool isSimple() const { + return definition() == nullptr; + } + protected: friend TensorDomain; friend ReplayTransformations; @@ -647,6 +678,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val { bool is_rfactor_domain_ = false; bool is_padded_dimension_ = false; c10::optional padded_to_size_ = c10::nullopt; + + // TODO: Remove only used in kernel IR because IterDomains don't maintain + // definitions of split/merge. + bool is_simple_ = true; }; //! TensorDomain holds a vector of IterDomains. It holds an IterDomain for every @@ -666,15 +701,18 @@ class TORCH_CUDA_CU_API IterDomain : public Val { class TORCH_CUDA_CU_API TensorDomain : public Val { public: explicit TensorDomain( + IrBuilderPasskey, std::vector root_domain, std::vector contiguity = std::vector()); TensorDomain( + IrBuilderPasskey, std::vector root_domain, std::vector domain, std::vector contiguity = std::vector()); TensorDomain( + IrBuilderPasskey, std::vector root_domain, std::vector rfactor_domain, std::vector domain, @@ -718,6 +756,8 @@ class TORCH_CUDA_CU_API TensorDomain : public Val { bool hasReduction() const; bool hasBlockReduction() const; bool hasGridReduction() const; + bool hasBlockBroadcast() const; + bool hasGridBroadcast() const; bool hasBroadcast() const; bool hasRFactor() const; bool hasVectorize() const; @@ -821,6 +861,7 @@ class TORCH_CUDA_CU_API Split : public Expr { // start_offset and stop_offset are distance from the left end and // right ends, respectively. Split( + IrBuilderPasskey, IterDomain* outer, IterDomain* inner, IterDomain* in, @@ -881,12 +922,13 @@ class TORCH_CUDA_CU_API Split : public Expr { //! dictate which will be traversed first (inner). Both IterDomains must be of //! the same iter or reduction type, as well as the same parallelization //! strategy if there is one -//! -//! \todo Should this be a unary op type? -//! 
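Split and Merge record how scheduling transforms IterDomains rather than data. As a rough illustration of the extent arithmetic they imply when driven through the TensorView scheduling API (a sketch of typical usage; makeSymbolicTensor is assumed to be the usual test helper, and N stands for the symbolic extent):

    TensorView* tv = makeSymbolicTensor(1);  // one IterDomain of extent N
    tv->split(0, 4);   // axis 0 becomes [ceilDiv(N, 4), 4]; recorded as a Split
    tv->merge(0, 1);   // the two axes fuse back into [ceilDiv(N, 4) * 4]; a Merge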
class TORCH_CUDA_CU_API Merge : public Expr { public: - Merge(IterDomain* out, IterDomain* outer, IterDomain* inner); + Merge( + IrBuilderPasskey, + IterDomain* out, + IterDomain* outer, + IterDomain* inner); Merge(const Merge* src, IrCloner* ir_cloner); @@ -918,9 +960,7 @@ class TORCH_CUDA_CU_API Merge : public Expr { //! class TORCH_CUDA_CU_API NamedScalar : public Val { public: - // NOLINTNEXTLINE(modernize-pass-by-value) - NamedScalar(std::string name, DataType dtype) - : Val(ValType::NamedScalar, dtype), name_(name) {} + NamedScalar(IrBuilderPasskey passkey, std::string name, DataType dtype); NamedScalar(const NamedScalar* src, IrCloner* ir_cloner); @@ -931,9 +971,11 @@ class TORCH_CUDA_CU_API NamedScalar : public Val { bool sameAs(const Statement* other) const override; //! Return the named scalar extent of a parallel dimension (e.g. blockDim.x) + //! WARNING: Only works with Fusion container at the moment static NamedScalar* getParallelDim(ParallelType p_type); //! Return the named scalar index of a parallel dimension (e.g. threadIdx.x) + //! WARNING: Only works with Fusion container at the moment static NamedScalar* getParallelIndex(ParallelType p_type); //! Return the parallel type of this NamedScalar if it is an extent of a diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp index a553c59fc2b..8c0e1022308 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -14,6 +15,23 @@ namespace jit { namespace fuser { namespace cuda { +namespace { +const char* boolLiteral(bool value) { + return value ? "true" : "false"; +} + +std::string varName(const Val* val) { + std::stringstream value_name; + if (val == nullptr) { + value_name << "$nullptr"; + } else { + value_name << val->name(); + } + return value_name.str(); +} + +} // namespace + // Make sure we can inline something, before we attempt to. 
static void checkInlineable(const Expr* expr) { for (auto input : expr->inputs()) { @@ -49,6 +67,70 @@ void IrPrinter::handle(Fusion* fusion) { } } +void IrPrinter::handle(const kir::Kernel* kernel) { + TORCH_CHECK(kernel != nullptr); + + // kernel declaration + os_ << "\nKERNEL ("; + for (auto in : kernel->inputs()) { + handle(in); + if (in != kernel->inputs().back()) { + os_ << ", "; + } + } + os_ << ") -> ("; + for (auto out : kernel->outputs()) { + handle(out); + if (out != kernel->outputs().back()) { + os_ << ", "; + } + } + os_ << ") :\n"; + + // kernel body + indent_size_++; + for (auto expr : kernel->topLevelExprs()) { + handle(expr); + } + indent_size_--; + os_ << "END.\n\n"; +} + +void IrPrinter::handle(kir::Kernel& kernel) { + handle(&kernel); +} + +void IrPrinter::handleScope(const kir::Scope& scope) { + // Save the uses of the parent scope + indent_size_++; + for (auto expr : scope.exprs()) { + handle(expr); + } + indent_size_--; +} + +void IrPrinter::handle(const IterDomain* id) { + os_ << id->getIterType(); + os_ << id->getParallelType(); + os_ << varName(id); + os_ << "{"; + if (!id->start()->isZeroInt()) { + print_inline(id->start()); + os_ << " : "; + } + if (id->stop() != id->extent()) { + print_inline(id->stop()); + os_ << " : "; + } + print_inline(id->extent()); + os_ << "}"; + if (id->isRFactorProduct()) + os_ << "rf"; + if (id->hasPaddingToMultipleOfWarp()) { + os_ << "_p"; + } +} + void IrPrinter::handle(const TensorDomain* td) { if (td->nDims() == 0) { os_ << "[ 0 ]"; @@ -65,9 +147,9 @@ void IrPrinter::handle(const TensorDomain* td) { void IrPrinter::handle(const TensorView* tv) { if (tv->nDims() == 0) { - os_ << typePrefix(tv->getDataType().value()) << tv->name(); + os_ << typePrefix(tv->getDataType().value()) << varName(tv); } else { - os_ << "T" << tv->name(); + os_ << "T" << varName(tv); switch (tv->getMemoryType()) { case MemoryType::Global: os_ << "_g"; @@ -94,28 +176,6 @@ void IrPrinter::handle(const TensorView* tv) { } } -void IrPrinter::handle(const IterDomain* id) { - os_ << id->getIterType(); - os_ << id->getParallelType(); - os_ << id->name(); - os_ << "{"; - if (!id->start()->isZeroInt()) { - print_inline(id->start()); - os_ << " : "; - } - if (id->stop() != id->extent()) { - print_inline(id->stop()); - os_ << " : "; - } - print_inline(id->extent()); - os_ << "}"; - if (id->isRFactorProduct()) - os_ << "rf"; - if (id->hasPaddingToMultipleOfWarp()) { - os_ << "_p"; - } -} - void IrPrinter::handle(const Bool* b) { if (print_inline_ && b->definition() != nullptr) { os_ << "( "; @@ -124,10 +184,9 @@ void IrPrinter::handle(const Bool* b) { return; } - if (b->isSymbolic()) { - os_ << "b" << b->name(); - } else { - os_ << "bool(" << *(b->value()) << ")"; + os_ << "b" << varName(b); + if (b->isConst()) { + os_ << "(" << (b->value().value() ? "true" : "false") << ")"; } } @@ -140,7 +199,7 @@ void IrPrinter::handle(const Double* d) { } if (d->isSymbolic()) { - os_ << "d" << d->name(); + os_ << "d" << varName(d); } else { os_ << "double(" << std::setprecision( @@ -160,30 +219,20 @@ void IrPrinter::handle(const Int* i) { } if (i->isSymbolic()) { - os_ << "i" << i->name(); + os_ << "i" << varName(i); } else { os_ << *(i->value()); } } -void IrPrinter::handle(const NamedScalar* i) { - os_ << i->name(); -} - -static bool isTV(const Val* val) { - return val->getValType().value() == ValType::TensorView; -} - -// Check if we're a TensorView op that we can generate code for. 
-static bool isTVOp(const Expr* expr) { - return expr->outputs().size() == 1 && isTV(expr->outputs().front()); +void IrPrinter::handle(const NamedScalar* ns) { + os_ << ns->name(); } void IrPrinter::handle(const UnaryOp* uop) { - bool istvop = isTVOp(uop); + bool istvop = ir_utils::isTvOp(uop); if (!print_inline_) { - indent(); - os_ << uop->out(); + indent() << uop->out(); if (istvop) { os_ << "\n"; indent_size_++; @@ -230,10 +279,9 @@ void IrPrinter::handle(const UnaryOp* uop) { } void IrPrinter::handle(const BinaryOp* bop) { - bool istvop = isTVOp(bop); + bool istvop = ir_utils::isTvOp(bop); if (!print_inline_) { - indent(); - os_ << bop->out(); + indent() << bop->out(); // tensor operations tend to be long, break them up into multiple lines if (istvop) { @@ -286,7 +334,7 @@ void IrPrinter::handle(const BinaryOp* bop) { } void IrPrinter::handle(const TernaryOp* top) { - bool istvop = isTVOp(top); + bool istvop = ir_utils::isTvOp(top); if (!print_inline_) { indent(); os_ << top->out(); @@ -327,18 +375,16 @@ void IrPrinter::handle(const TernaryOp* top) { } void IrPrinter::handle(const ReductionOp* rop) { - indent(); - os_ << rop->out() << " = reduction( " << rop->in() - << ", op = " << rop->getReductionOpType() - << ", initial value = " << rop->init() << " )\n"; + indent() << rop->out() << " = reduction( " << rop->in() + << ", op = " << rop->getReductionOpType() + << ", initial value = " << rop->init() << " )\n"; } void IrPrinter::handle(const WelfordOp* wop) { - indent(); - os_ << wop->outAvg() << "(Avg),\n" - << wop->outVar() << "(Var),\n" - << wop->outN() << "(Count)" - << "\n = Welford ( "; + indent() << wop->outAvg() << "(Avg),\n" + << wop->outVar() << "(Var),\n" + << wop->outN() << "(Count)" + << "\n = Welford ( "; if (wop->singleValue()) { os_ << wop->inAvg() << "(Avg), "; } else { @@ -353,47 +399,7 @@ void IrPrinter::handle(const WelfordOp* wop) { } void IrPrinter::handle(const BroadcastOp* bop) { - indent(); - os_ << bop->out() << " = broadcast( " << bop->in() << " )\n"; -} - -void IrPrinter::handle(const TransposeOp* top) { - indent(); - os_ << top->out() << " = transpose( " << top->in() << " )\n"; -} - -void IrPrinter::handle(const ShiftOp* sop) { - indent(); - os_ << sop->out() << " = shift( " << sop->in() << ", {" << sop->offsets() - << "}, padding = " << (sop->pad() ? 
"true" : "false") << " )\n"; -} - -void IrPrinter::handle(const GatherOp* op) { - indent(); - os_ << op->out() << " = gather( " << op->in() << ", {"; - bool no_comma = true; - for (const auto& s : op->windowShape()) { - if (!no_comma) { - os_ << ", "; - } - os_ << s; - no_comma = false; - } - os_ << "}, {"; - no_comma = true; - for (const auto& pad : op->padWidth()) { - if (!no_comma) { - os_ << ", "; - } - os_ << "{" << pad[0] << ", " << pad[1] << "}"; - no_comma = false; - } - os_ << "} )\n"; -} - -void IrPrinter::handle(const ViewOp* top) { - indent(); - os_ << top->out() << " = view( " << top->in() << " )\n"; + indent() << bop->out() << " = broadcast( " << bop->in() << " )\n"; } void IrPrinter::handle(const Split* s) { @@ -424,6 +430,221 @@ void IrPrinter::handle(const Merge* m) { os_ << "\n"; } +void IrPrinter::handle(const TransposeOp* top) { + indent() << top->out() << " = transpose( " << top->in() << " )\n"; +} + +void IrPrinter::handle(const ShiftOp* sop) { + indent() << sop->out() << " = shift( " << sop->in() << ", {" << sop->offsets() + << "}, {" << sop->padWidth() << "} )\n"; +} + +void IrPrinter::handle(const GatherOp* op) { + indent() << op->out() << " = gather( " << op->in() << ", {"; + bool no_comma = true; + for (const auto& s : op->windowShape()) { + if (!no_comma) { + os_ << ", "; + } + os_ << s; + no_comma = false; + } + os_ << "}, {"; + no_comma = true; + for (const auto& pad : op->padWidth()) { + if (!no_comma) { + os_ << ", "; + } + os_ << "{" << pad[0] << ", " << pad[1] << "}"; + no_comma = false; + } + os_ << "} )\n"; +} + +void IrPrinter::handle(const ViewOp* top) { + indent() << top->out() << " = view( " << top->in() << " )\n"; +} + +void IrPrinter::handle(const kir::Predicate* node) { + switch (node->predicate_type()) { + case PredicateType::Inline: { + os_ << "Inline_Predicate"; + break; + } + case PredicateType::Manual: { + os_ << node->value(); + break; + } + case PredicateType::Misaligned: { + os_ << "Misaligned_Predicate"; + break; + } + case PredicateType::Padding: { + os_ << "Padding_Predicate"; + break; + } + case PredicateType::Shift: { + os_ << "Shift_Predicate"; + break; + } + case PredicateType::Unswitch: { + os_ << "Unswitch_Predicate"; + break; + } + case PredicateType::Vectorize: { + os_ << "Vectorize_Predicate"; + break; + } + default: + break; + } +} + +void IrPrinter::handle(const kir::TensorIndex* ti) { + os_ << "T" << varName(ti); + switch (ti->view()->getMemoryType()) { + case MemoryType::Global: + os_ << "_g"; + break; + case MemoryType::Shared: + os_ << "_s"; + break; + case MemoryType::Local: + os_ << "_l"; + break; + } + os_ << "["; + for (auto index : ti->indices()) { + print_inline(index); + if (index != ti->indices().back()) { + os_ << ", "; + } + } + os_ << "]"; + os_ << " view( T" << varName(ti->view()) << " )"; +} + +void IrPrinter::handle(const kir::Allocate* node) { + indent(); + handle(node->buffer()); + os_ << " = ALLOCATE(" + << "mem_type=" << node->memoryType() << ", " + << "size="; + print_inline(node->size()); + os_ << ", " + << "zero_init=" << boolLiteral(node->zeroInit()) << ")\n"; + if (node->alias() != nullptr) { + indent() << kTab << ".alias="; + handle(node->alias()->buffer()); + os_ << "\n"; + } +} + +void IrPrinter::handle(const kir::Sync* node) { + indent() << "SYNC(war_hazard=" << boolLiteral(node->isWarHazardSync()) + << ")\n"; +} + +void IrPrinter::handle(const kir::ForLoop* node) { + indent() << "FOR "; + handle(node->index()); + os_ << " in "; + handle(node->iter_domain()); + os_ << ":\n"; + 
handleScope(node->body()); +} + +void IrPrinter::handle(const kir::IfThenElse* node) { + indent() << "IF "; + handle(node->predicate()); + os_ << ":\n"; + handleScope(node->thenBody()); + if (node->hasElse()) { + indent() << "ELSE:\n"; + handleScope(node->elseBody()); + } +} + +void IrPrinter::handle(const kir::GridBroadcast* node) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} + +void IrPrinter::handle(const kir::GridReduction* node) { + const auto* reduction_op = node->reduction_op(); + indent(); + handle(reduction_op->out()); + os_ << " = " + << "GRID_REDUCTION(op='" << reduction_op->getReductionOpType() << "'" + << ", in="; + handle(reduction_op->in()); + os_ << ", init="; + handle(reduction_op->init()); + os_ << ", pred="; + handle(reduction_op->predicate()); + os_ << ")\n"; + indent() << kTab << ".reduction_buffer="; + handle(node->reduction_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".grid_pred="; + handle(node->predicate()); + os_ << "\n"; +} + +void IrPrinter::handle(const kir::GridWelford* node) { + const auto* welford_op = node->welford_op(); + indent(); + handle(welford_op->outVar()); + os_ << ","; + handle(welford_op->outAvg()); + os_ << ","; + handle(welford_op->outN()); + os_ << " = " + << "GRID_WELFORD(" + << "inAvg="; + handle(welford_op->inAvg()); + if (!welford_op->inN()->isOneInt()) { + indent() << ", inVar="; + handle(welford_op->inVar()); + } + indent() << ", inN="; + handle(welford_op->inN()); + if (!welford_op->initN()->isZeroInt()) { + indent() << ", initVar="; + handle(welford_op->initVar()); + os_ << " initAvg="; + handle(welford_op->initAvg()); + os_ << " initN="; + handle(welford_op->initN()); + } + indent() << ", pred="; + handle(welford_op->predicate()); + os_ << ")\n"; + indent() << kTab << ".var_buffer="; + handle(node->var_buffer()->buffer()); + os_ << ".avg_buffer="; + handle(node->avg_buffer()->buffer()); + os_ << ".n_buffer="; + handle(node->N_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".sync_buffer="; + handle(node->sync_buffer()->buffer()); + os_ << "\n"; + indent() << kTab << ".grid_pred="; + handle(node->predicate()); + os_ << "\n"; +} + +void IrPrinter::handle(const kir::InitMagicZero* node) { + indent() << "NVFUSER_DEFINE_MAGIC_ZERO\n"; +} + +void IrPrinter::handle(const kir::UpdateMagicZero* node) { + indent() << "NVFUSER_UPDATE_MAGIC_ZERO\n"; +} + void IrTransformPrinter::handle(Fusion* f) { auto all_vals = f->usedMathVals(); @@ -450,7 +671,7 @@ void IrTransformPrinter::printTransforms(TensorView* tv) { os() << ")\n"; for (auto exp : all_exp) { - os() << " "; + os() << " "; IrPrinter::handle(exp); } } diff --git a/torch/csrc/jit/codegen/cuda/ir_iostream.h b/torch/csrc/jit/codegen/cuda/ir_iostream.h index c080c3f8f99..f8c07886114 100644 --- a/torch/csrc/jit/codegen/cuda/ir_iostream.h +++ b/torch/csrc/jit/codegen/cuda/ir_iostream.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -13,21 +13,30 @@ namespace jit { namespace fuser { namespace cuda { +class Fusion; +namespace kir { +class Kernel; +class Scope; +} // namespace kir + //! Define pretty printing functions for IR nodes //! //! This class is intended for debug printing, so it attempts //! to handle invalid states as well. //! 
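//! A minimal usage sketch (illustrative only; `fusion` and `kernel` are assumed
//! to be valid Fusion* / const kir::Kernel* values available to the caller):
//!
//!   std::stringstream ss;
//!   IrPrinter printer(ss);
//!   printer.handle(fusion);   // print fusion IR
//!   printer.handle(kernel);   // print lowered kernel IR
//!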
class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch { + static constexpr char const* kTab = " "; + public: explicit IrPrinter(std::ostream& os) : os_(os) {} // Indent the generated code - void indent() { + std::ostream& indent() { for (const auto i : c10::irange(indent_size_)) { (void)i; // Suppress unused variable warning os_ << " "; } + return os_; } void resetIndent() { @@ -38,6 +47,8 @@ class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch { return print_inline_; } + using OptInConstDispatch::handle; + virtual void handle(Fusion* f); // handle calls some non const fusion ops, @@ -52,30 +63,50 @@ class TORCH_CUDA_CU_API IrPrinter : public OptInConstDispatch { handle(&f); } - void handle(const Statement* s) override; - void handle(const Val* v) override; - void handle(const Expr* e) override; + virtual void handle(const kir::Kernel* kernel); + virtual void handle(kir::Kernel& kernel); - void handle(const TensorDomain*) override; - void handle(const TensorView*) override; - void handle(const IterDomain*) override; + void handleScope(const kir::Scope& scope); - void handle(const Bool*) override; - void handle(const Double*) override; - void handle(const Int*) override; - void handle(const NamedScalar*) override; + void handle(const Statement* s) final; + void handle(const Val* v) final; + void handle(const Expr* e) final; - void handle(const UnaryOp*) override; - void handle(const BinaryOp*) override; - void handle(const TernaryOp*) override; - void handle(const ReductionOp*) override; - void handle(const WelfordOp*) override; - void handle(const BroadcastOp*) override; - void handle(const TransposeOp*) override; - void handle(const ShiftOp*) override; - void handle(const GatherOp*) override; - void handle(const ViewOp*) override; + void handle(const IterDomain*) final; + void handle(const TensorDomain*) final; + void handle(const TensorView*) final; + void handle(const Bool*) final; + void handle(const Double*) final; + void handle(const Int*) final; + void handle(const NamedScalar*) final; + + void handle(const UnaryOp*) final; + void handle(const BinaryOp*) final; + void handle(const TernaryOp*) final; + void handle(const ReductionOp*) final; + void handle(const WelfordOp*) final; + void handle(const BroadcastOp*) final; + void handle(const TransposeOp*) final; + void handle(const ShiftOp*) final; + void handle(const GatherOp*) final; + void handle(const ViewOp*) final; + + void handle(const kir::Predicate*) final; + void handle(const kir::TensorIndex*) final; + + void handle(const kir::GridBroadcast*) final; + void handle(const kir::GridReduction*) final; + void handle(const kir::GridWelford*) final; + void handle(const kir::ForLoop*) final; + void handle(const kir::IfThenElse*) final; + void handle(const kir::Allocate*) final; + void handle(const kir::Sync*) final; + void handle(const kir::InitMagicZero*) final; + void handle(const kir::UpdateMagicZero*) final; + + // IR math printer overrides these to prevent them from printing, keep + // override void handle(const Split*) override; void handle(const Merge*) override; diff --git a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp index 1465a88bef3..884b6a6e0ec 100644 --- a/torch/csrc/jit/codegen/cuda/ir_nodes.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_nodes.cpp @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -38,19 +40,19 @@ class ScalarCheck : OptInConstDispatch { } private: - void handle(const Bool* b) override { + void 
handle(const Bool* b) final { same_ = v1_->as()->sameAs(v2_->as()); } - void handle(const Double* d) override { + void handle(const Double* d) final { same_ = v1_->as()->sameAs(v2_->as()); } - void handle(const Int* i) override { + void handle(const Int* i) final { same_ = v1_->as()->sameAs(v2_->as()); } - void handle(const NamedScalar* ns) override { + void handle(const NamedScalar* ns) final { same_ = v1_->as()->sameAs(v2_->as()); } @@ -70,6 +72,16 @@ bool areEqualScalars(Val* v1, Val* v2) { return ScalarCheck::sameAs(v1, v2); } +Bool::Bool(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::Bool), + maybe_value_{c10::nullopt} {} + +Bool::Bool(IrBuilderPasskey passkey, bool value) + : Val(passkey, ValType::Scalar, DataType::Bool), maybe_value_{value} {} + +Bool::Bool(IrBuilderPasskey passkey, c10::optional value) + : Val(passkey, ValType::Scalar, DataType::Bool), maybe_value_{value} {} + Bool::Bool(const Bool* src, IrCloner* ir_cloner) : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} @@ -87,6 +99,16 @@ bool Bool::sameAs(const Statement* other) const { return false; } +Double::Double(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::Double), + maybe_value_{c10::nullopt} {} + +Double::Double(IrBuilderPasskey passkey, ScalarType value) + : Val(passkey, ValType::Scalar, DataType::Double), maybe_value_{value} {} + +Double::Double(IrBuilderPasskey passkey, c10::optional value) + : Val(passkey, ValType::Scalar, DataType::Double), maybe_value_{value} {} + Double::Double(const Double* src, IrCloner* ir_cloner) : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} @@ -103,6 +125,16 @@ bool Double::sameAs(const Statement* other) const { return false; } +Int::Int(IrBuilderPasskey passkey) + : Val(passkey, ValType::Scalar, DataType::Int), + maybe_value_{c10::nullopt} {} + +Int::Int(IrBuilderPasskey passkey, ScalarType value) + : Val(passkey, ValType::Scalar, DataType::Int), maybe_value_{value} {} + +Int::Int(IrBuilderPasskey passkey, c10::optional value) + : Val(passkey, ValType::Scalar, DataType::Int), maybe_value_{value} {} + Int::Int(const Int* src, IrCloner* ir_cloner) : Val(src, ir_cloner), maybe_value_(src->maybe_value_) {} @@ -120,11 +152,13 @@ bool Int::sameAs(const Statement* other) const { return false; } -UnaryOp::UnaryOp(UnaryOpType type, Val* out, Val* in) - : Expr(ExprType::UnaryOp), unary_op_type_{type}, out_{out}, in_{in} { +UnaryOp::UnaryOp(IrBuilderPasskey passkey, UnaryOpType type, Val* out, Val* in) + : Expr(passkey, ExprType::UnaryOp), + unary_op_type_{type}, + out_{out}, + in_{in} { addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } UnaryOp::UnaryOp(const UnaryOp* src, IrCloner* ir_cloner) @@ -146,8 +180,13 @@ bool UnaryOp::sameAs(const Statement* other) const { return Expr::sameAs(other); } -BinaryOp::BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs) - : Expr(ExprType::BinaryOp), +BinaryOp::BinaryOp( + IrBuilderPasskey passkey, + BinaryOpType type, + Val* out, + Val* lhs, + Val* rhs) + : Expr(passkey, ExprType::BinaryOp), binary_op_type_{type}, out_{out}, lhs_{lhs}, @@ -155,7 +194,6 @@ BinaryOp::BinaryOp(BinaryOpType type, Val* out, Val* lhs, Val* rhs) addOutput(out); addInput(lhs); addInput(rhs); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } BinaryOp::BinaryOp(const BinaryOp* src, IrCloner* ir_cloner) @@ -178,8 +216,14 @@ bool BinaryOp::sameAs(const Statement* other) const { return Expr::sameAs(other); } -TernaryOp::TernaryOp(TernaryOpType type, Val* out, Val* in1, 
Val* in2, Val* in3) - : Expr(ExprType::TernaryOp), +TernaryOp::TernaryOp( + IrBuilderPasskey passkey, + TernaryOpType type, + Val* out, + Val* in1, + Val* in2, + Val* in3) + : Expr(passkey, ExprType::TernaryOp), ternary_op_type_{type}, out_{out}, in1_{in1}, @@ -189,7 +233,6 @@ TernaryOp::TernaryOp(TernaryOpType type, Val* out, Val* in1, Val* in2, Val* in3) addInput(in1); addInput(in2); addInput(in3); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } TernaryOp::TernaryOp(const TernaryOp* src, IrCloner* ir_cloner) @@ -213,8 +256,12 @@ bool TernaryOp::sameAs(const Statement* other) const { return Expr::sameAs(other); } -BroadcastOp::BroadcastOp(Val* out, Val* in, std::vector is_broadcast_dims) - : Expr(ExprType::BroadcastOp), +BroadcastOp::BroadcastOp( + IrBuilderPasskey passkey, + Val* out, + Val* in, + std::vector is_broadcast_dims) + : Expr(passkey, ExprType::BroadcastOp), out_(out), in_(in), is_broadcast_dims_(std::move(is_broadcast_dims)) { @@ -226,12 +273,18 @@ BroadcastOp::BroadcastOp(Val* out, Val* in, std::vector is_broadcast_dims) auto in_type = in->getValType().value(); TORCH_INTERNAL_ASSERT( - out_type == ValType::TensorView && in_type == ValType::TensorView, + (out_type == ValType::TensorView && in_type == ValType::TensorView) || + (out_type == ValType::TensorIndex && in_type == ValType::TensorIndex), "Cannot braodcast a non-tensor object."); addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); + + if (!out->isA() || !in->isA()) { + return; + } + + passkey.ir_container_->registerExpr(exprPasskey(), this); // This is a generic check that root dims of a consumer and producer match. // Maybe we shouldn't relegate it to this constructor. @@ -294,37 +347,44 @@ bool BroadcastOp::sameAs(const Statement* other) const { } ReductionOp::ReductionOp( + IrBuilderPasskey passkey, BinaryOpType reduction_op_type, Val* init, Val* out, Val* in) - : Expr(ExprType::ReductionOp), + : Expr(passkey, ExprType::ReductionOp), reduction_op_type_(reduction_op_type), init_(init), out_(out), in_(in) { - TORCH_CHECK(out->getValType().value() == ValType::TensorView); + TORCH_CHECK( + out->getValType().value() == ValType::TensorView || + out->getValType().value() == ValType::TensorIndex); TORCH_INTERNAL_ASSERT( - in->getValType() == ValType::TensorView && - out->getValType() == ValType::TensorView, + (in->getValType() == ValType::TensorView && + out->getValType() == ValType::TensorView) || + (in->getValType() == ValType::TensorIndex && + out->getValType() == ValType::TensorIndex), "Reduction operation was created that does not have tensor inputs and outputs."); - TORCH_INTERNAL_ASSERT( - TensorDomain::noReductions(in->as()->getMaybeRFactorDomain()) - .size() == out->as()->getRootDomain().size(), - "Reduction operation created with mismatched domains."); - + if (in->isA()) { + TORCH_INTERNAL_ASSERT( + TensorDomain::noReductions( + in->as()->getMaybeRFactorDomain()) + .size() == out->as()->getRootDomain().size(), + "Reduction operation created with mismatched domains."); + } TORCH_INTERNAL_ASSERT( init->isConstScalar(), "Tried to create a reduction operation whith an initial value that isn't a constant."); addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } WelfordOp::WelfordOp( + IrBuilderPasskey passkey, Val* out_avg, Val* out_var, Val* out_N, @@ -334,7 +394,7 @@ WelfordOp::WelfordOp( Val* in_avg, Val* in_var, Val* in_N) - : Expr(ExprType::WelfordOp), + : Expr(passkey, ExprType::WelfordOp), out_avg_(out_avg), out_var_(out_var), 
out_N_(out_N), @@ -345,9 +405,15 @@ WelfordOp::WelfordOp( in_var_(in_var), in_N_(in_N) { // Check output type - TORCH_INTERNAL_ASSERT(out_avg->getValType().value() == ValType::TensorView); - TORCH_INTERNAL_ASSERT(out_var->getValType().value() == ValType::TensorView); - TORCH_INTERNAL_ASSERT(out_N->getValType().value() == ValType::TensorView); + TORCH_INTERNAL_ASSERT( + out_avg->getValType().value() == ValType::TensorView || + out_avg->getValType().value() == ValType::TensorIndex); + TORCH_INTERNAL_ASSERT( + out_var->getValType().value() == ValType::TensorView || + out_var->getValType().value() == ValType::TensorIndex); + TORCH_INTERNAL_ASSERT( + out_N->getValType().value() == ValType::TensorView || + out_N->getValType().value() == ValType::TensorIndex); // check initial value TORCH_INTERNAL_ASSERT(init_N->getValType().value() == ValType::Scalar); @@ -356,22 +422,32 @@ WelfordOp::WelfordOp( // initial value with a count of 1 is un-common enough that I'll push // the responsibility of creating all-zero var tensors to the user TORCH_INTERNAL_ASSERT( - init_avg && init_avg->getValType().value() == ValType::TensorView); + init_avg && + (init_avg->getValType().value() == ValType::TensorView || + init_avg->getValType().value() == ValType::TensorIndex)); TORCH_INTERNAL_ASSERT( - init_var && init_var->getValType().value() == ValType::TensorView); + init_var && + (init_var->getValType().value() == ValType::TensorView || + init_var->getValType().value() == ValType::TensorIndex)); } TORCH_INTERNAL_ASSERT( - in_avg && in_avg->getValType().value() == ValType::TensorView); + in_avg && + (in_avg->getValType().value() == ValType::TensorView || + in_avg->getValType().value() == ValType::TensorIndex), + in_avg->getValType().value()); // check input TORCH_INTERNAL_ASSERT( in_N->getValType().value() == ValType::Scalar || - in_N->getValType().value() == ValType::TensorView); + in_N->getValType().value() == ValType::TensorView || + in_N->getValType().value() == ValType::TensorIndex); if (!in_N->isOneInt()) { // when input is only one value, only the value is required through avg // input the var part is implicitly 0 and codegen will handle that. 
TORCH_INTERNAL_ASSERT( - in_var && in_var->getValType().value() == ValType::TensorView); + in_var && + (in_var->getValType().value() == ValType::TensorView || + in_var->getValType().value() == ValType::TensorIndex)); } addOutput(out_avg); @@ -384,8 +460,6 @@ WelfordOp::WelfordOp( addInput(in_var); } addInput(in_N); - - name_ = FusionGuard::getCurFusion()->registerExpr(this); } WelfordOp::WelfordOp(const WelfordOp* src, IrCloner* ir_cloner) @@ -444,10 +518,11 @@ bool ReductionOp::sameAs(const Statement* other) const { } TransposeOp::TransposeOp( + IrBuilderPasskey passkey, TensorView* out, TensorView* in, std::vector new2old) - : Expr(ExprType::TransposeOp), + : Expr(passkey, ExprType::TransposeOp), out_(out), in_(in), new2old_(std::move(new2old)) { @@ -481,7 +556,6 @@ TransposeOp::TransposeOp( addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } TransposeOp::TransposeOp(const TransposeOp* src, IrCloner* ir_cloner) @@ -490,12 +564,17 @@ TransposeOp::TransposeOp(const TransposeOp* src, IrCloner* ir_cloner) in_(ir_cloner->clone(src->in_)), new2old_(src->new2old_) {} -ShiftOp::ShiftOp(Val* out, Val* in, std::vector offsets, bool pad) - : Expr(ExprType::ShiftOp), +ShiftOp::ShiftOp( + IrBuilderPasskey passkey, + Val* out, + Val* in, + std::vector offsets, + std::vector pad_width) + : Expr(passkey, ExprType::ShiftOp), out_(out), in_(in), offsets_(std::move(offsets)), - pad_(pad) { + pad_width_(std::move(pad_width)) { // clang-tidy complains about out_ that it may be null. TORCH_INTERNAL_ASSERT(out_ != nullptr); TORCH_INTERNAL_ASSERT(in_ != nullptr); @@ -514,9 +593,15 @@ ShiftOp::ShiftOp(Val* out, Val* in, std::vector offsets, bool pad) "Invalid offset vector: ", offsets_); + TORCH_INTERNAL_ASSERT( + pad_width_.size() == + TensorDomain::noReductions(in_->as()->getRootDomain()) + .size(), + "Invalid padding width vector: ", + pad_width_); + addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } ShiftOp::ShiftOp(const ShiftOp* src, IrCloner* ir_cloner) @@ -524,7 +609,7 @@ ShiftOp::ShiftOp(const ShiftOp* src, IrCloner* ir_cloner) out_(ir_cloner->clone(src->out_)), in_(ir_cloner->clone(src->in_)), offsets_(src->offsets_), - pad_(src->pad_) {} + pad_width_(src->pad_width_) {} bool ShiftOp::sameAs(const Statement* other) const { if (this == other) { @@ -541,11 +626,12 @@ bool ShiftOp::sameAs(const Statement* other) const { } GatherOp::GatherOp( + IrBuilderPasskey passkey, Val* out, Val* in, - std::vector window_shape, - std::vector> pad_width) - : Expr(ExprType::GatherOp), + std::vector window_shape, + std::vector> pad_width) + : Expr(passkey, ExprType::GatherOp), out_(out), in_(in), window_shape_(std::move(window_shape)), @@ -578,28 +664,14 @@ GatherOp::GatherOp( addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } GatherOp::GatherOp(const GatherOp* src, IrCloner* ir_cloner) : Expr(src, ir_cloner), out_(ir_cloner->clone(src->out_)), - in_(ir_cloner->clone(src->in_)) { - std::transform( - src->window_shape_.begin(), - src->window_shape_.end(), - std::back_inserter(window_shape_), - [&ir_cloner](const auto& x) { return ir_cloner->clone(x); }); - for (const auto& pad : src->pad_width_) { - std::vector pad_clone; - std::transform( - pad.begin(), - pad.end(), - std::back_inserter(pad_clone), - [&ir_cloner](const auto& x) { return ir_cloner->clone(x); }); - pad_width_.push_back(pad_clone); - } -} + in_(ir_cloner->clone(src->in_)), + window_shape_(src->window_shape_), + pad_width_(src->pad_width_) 
{} bool GatherOp::sameAs(const Statement* other) const { if (this == other) { @@ -609,23 +681,10 @@ bool GatherOp::sameAs(const Statement* other) const { return false; } const auto other_op = other->as(); - if (windowShape().size() != other_op->windowShape().size()) { + if (windowShape() != other_op->windowShape() || + padWidth() != other_op->padWidth()) { return false; } - for (const auto i : c10::irange(windowShape().size())) { - if (!windowShape()[i]->sameAs(other_op->windowShape()[i])) { - return false; - } - } - if (padWidth().size() != other_op->padWidth().size()) { - return false; - } - for (const auto i : c10::irange(padWidth().size())) { - if (!padWidth()[i][0]->sameAs(other_op->padWidth()[i][0]) || - !padWidth()[i][1]->sameAs(other_op->padWidth()[i][1])) { - return false; - } - } return Expr::sameAs(other); } @@ -638,11 +697,10 @@ int GatherOp::gatherAxis(int axis) const { return int(windowShape().size()) + axis; } -ViewOp::ViewOp(TensorView* out, TensorView* in) - : Expr(ExprType::ViewOp), out_(out), in_(in) { +ViewOp::ViewOp(IrBuilderPasskey passkey, TensorView* out, TensorView* in) + : Expr(passkey, ExprType::ViewOp), out_(out), in_(in) { addOutput(out); addInput(in); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } ViewOp::ViewOp(const ViewOp* src, IrCloner* ir_cloner) @@ -651,12 +709,14 @@ ViewOp::ViewOp(const ViewOp* src, IrCloner* ir_cloner) in_(ir_cloner->clone(src->in_)) {} IterDomain::IterDomain( + IrBuilderPasskey passkey, Val* start, Val* extent, ParallelType parallel_type, IterType iter_type, bool is_rfactor_domain) : IterDomain( + passkey, start, extent, nullptr, @@ -665,16 +725,19 @@ IterDomain::IterDomain( is_rfactor_domain) {} IterDomain::IterDomain( + IrBuilderPasskey passkey, Val* start, Val* extent, Val* stop_offset, ParallelType parallel_type, IterType iter_type, bool is_rfactor_domain) - : Val(ValType::IterDomain, DataType::Int, false), + : Val(passkey, ValType::IterDomain, DataType::Int), start_(start), extent_(extent), - stop_offset_(stop_offset == nullptr ? new Int(0) : stop_offset), + stop_offset_( + stop_offset == nullptr ? 
passkey.ir_container_->zeroVal() + : stop_offset), parallel_type_(parallel_type), iter_type_(iter_type), is_rfactor_domain_(is_rfactor_domain) { @@ -693,8 +756,6 @@ IterDomain::IterDomain( "Cannot create an iter domain with a start that is not an int but received ", start, " ."); - - name_ = fusion_->registerVal(this); } IterDomain::IterDomain(const IterDomain* src, IrCloner* ir_cloner) @@ -729,6 +790,22 @@ bool IterDomain::sameAs(const Statement* other) const { return is_same; } +// Returns a new IterDomain matching properties of this +IterDomain* IterDomain::clone() const { + auto cloned = IrBuilder::create( + ir_container_, + start(), + extent(), + stopOffset(), + getParallelType(), + getIterType(), + isRFactorProduct()); + + cloned->is_padded_dimension_ = is_padded_dimension_; + cloned->padded_to_size_ = padded_to_size_; + return cloned; +} + std::vector IterDomain::clone( const std::vector& domains) { std::vector cloned_domains; @@ -781,14 +858,15 @@ IterDomain* IterDomain::merge(IterDomain* outer, IterDomain* inner) { itype = IterType::Iteration; } - IterDomain* merged_id = new IterDomain( - new Int(0), + IterDomain* merged_id = IrBuilder::create( + outer->container(), + outer->container()->zeroVal(), merged_id_size->as(), outer->getParallelType(), itype, outer->isRFactorProduct() || inner->isRFactorProduct()); - new Merge(merged_id, outer, inner); + IrBuilder::create(outer->container(), merged_id, outer, inner); return merged_id; } @@ -811,7 +889,8 @@ std::pair IterDomain::split( if (factor->getValType() == ValType::Scalar) { TORCH_CHECK( factor->isConstScalar() || - FusionGuard::getCurFusion()->hasInput(factor), + (FusionGuard::getCurFusion() == factor->fusion() && + factor->isFusionInput()), factor, " is not a constant nor an input. It must be one or the other to be used in a split.", " If you want a symbolic split based on a thread dimension please use IterDomain::split(IterDomain*, ParallelType);"); @@ -832,24 +911,33 @@ std::pair IterDomain::split( in->definition() == nullptr, "Partial split is only allowed with root domains"); } - // outer loop IterDomain - IterDomain* ido = new IterDomain( - new Int(0), + IterDomain* ido = IrBuilder::create( + in->container(), + in->container()->zeroVal(), inner_split ? remainder->as() : factor, in->getParallelType(), in->getIterType(), in->isRFactorProduct()); // inner loop IterDomain - IterDomain* idi = new IterDomain( - new Int(0), + IterDomain* idi = IrBuilder::create( + in->container(), + in->container()->zeroVal(), inner_split ? 
factor : remainder->as(), in->getParallelType(), in->getIterType(), in->isRFactorProduct()); - new Split(ido, idi, in, factor, inner_split, start_offset, stop_offset); + IrBuilder::create( + in->container(), + ido, + idi, + in, + factor, + inner_split, + start_offset, + stop_offset); return {ido, idi}; } @@ -864,7 +952,9 @@ std::pair IterDomain::split( } std::pair IterDomain::stridedSplit(int factor) { - auto split_out = IterDomain::split(this, new Int(factor), true); + // Use partial split so that only valid values are retained + auto split_out = IterDomain::split( + this, IrBuilder::create(container(), factor), true, true); split_out.second->iter_type_ = IterType::Stride; split_out.first->is_rfactor_domain_ = true; @@ -907,9 +997,10 @@ Val* IterDomain::stop() const { } TensorDomain::TensorDomain( + IrBuilderPasskey passkey, std::vector root_domain, std::vector contiguity) - : Val(ValType::TensorDomain, DataType::Null, false), + : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), contiguity_( contiguity.empty() ? std::vector(root_domain_.size(), false) @@ -925,14 +1016,14 @@ TensorDomain::TensorDomain( has_nontrivial_reduction_ = false; domain_ = root_domain_; resetDomains(); - name_ = fusion_->registerVal(this); } TensorDomain::TensorDomain( + IrBuilderPasskey passkey, std::vector root_domain, std::vector domain, std::vector contiguity) - : Val(ValType::TensorDomain, DataType::Null, false), + : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), domain_(std::move(domain)), contiguity_( @@ -963,15 +1054,15 @@ TensorDomain::TensorDomain( // Just due to clang-tidy, correct value set in resetDomains has_nontrivial_reduction_ = false; resetDomains(); - name_ = fusion_->registerVal(this); } TensorDomain::TensorDomain( + IrBuilderPasskey passkey, std::vector root_domain, std::vector rfactor_domain, std::vector domain, std::vector contiguity) - : Val(ValType::TensorDomain, DataType::Null, false), + : Val(passkey, ValType::TensorDomain, DataType::Null), root_domain_(std::move(root_domain)), domain_(std::move(domain)), rfactor_domain_(std::move(rfactor_domain)), @@ -1013,7 +1104,6 @@ TensorDomain::TensorDomain( // Just due to clang-tidy, correct value set in resetDomains has_nontrivial_reduction_ = false; resetDomains(); - name_ = fusion_->registerVal(this); } TensorDomain::TensorDomain(const TensorDomain* src, IrCloner* ir_cloner) @@ -1026,6 +1116,30 @@ TensorDomain::TensorDomain(const TensorDomain* src, IrCloner* ir_cloner) contiguity_(src->contiguity()), has_nontrivial_reduction_(src->has_nontrivial_reduction_) {} +namespace { +std::vector lowerIterDomains( + const std::vector& domains) { + std::vector lowered_domains; + lowered_domains.reserve(domains.size()); + for (const auto iter_domain : domains) { + lowered_domains.push_back(iter_domain); + } + return lowered_domains; +}; +} // namespace + +bool TensorDomain::hasBlockBroadcast() const { + return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { + return id->isBroadcast() && id->isThreadDim(); + }); +} + +bool TensorDomain::hasGridBroadcast() const { + return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { + return id->isBroadcast() && id->isBlockDim(); + }); +} + bool TensorDomain::operator==(const TensorDomain& other) const { // Checks equality of each class field. 
Should not be necessary to // check no_bcast_domain_ and no_reduction_domain_ as they are just @@ -1389,6 +1503,7 @@ std::pair TensorDomain::rFactor( } Split::Split( + IrBuilderPasskey passkey, IterDomain* outer, IterDomain* inner, IterDomain* in, @@ -1396,14 +1511,18 @@ Split::Split( bool inner_split, Val* start_offset, Val* stop_offset) - : Expr(ExprType::Split), + : Expr(passkey, ExprType::Split), outer_{outer}, inner_{inner}, in_{in}, factor_{factor}, inner_split_{inner_split}, - start_offset_{start_offset != nullptr ? start_offset : new Int(0)}, - stop_offset_{stop_offset != nullptr ? stop_offset : new Int(0)} { + start_offset_{ + start_offset != nullptr ? start_offset + : passkey.ir_container_->zeroVal()}, + stop_offset_{ + stop_offset != nullptr ? stop_offset + : passkey.ir_container_->zeroVal()} { TORCH_INTERNAL_ASSERT( factor_->isAnInt(), "Attempted to create a Split node with a non-integer factor."); @@ -1412,7 +1531,6 @@ Split::Split( addInput(in); // TODO add factor as an input, need to check Split::Split during validation // and need to check BestEffortReplay::findFirstMismatchedID addInput(factor); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } Split::Split(const Split* src, IrCloner* ir_cloner) @@ -1453,12 +1571,15 @@ bool Split::sameAs(const Statement* other) const { stopOffset()->sameAs(other->as()->stopOffset()); } -Merge::Merge(IterDomain* out, IterDomain* outer, IterDomain* inner) - : Expr(ExprType::Merge), out_{out}, outer_{outer}, inner_{inner} { +Merge::Merge( + IrBuilderPasskey passkey, + IterDomain* out, + IterDomain* outer, + IterDomain* inner) + : Expr(passkey, ExprType::Merge), out_{out}, outer_{outer}, inner_{inner} { addOutput(out); addInput(outer); addInput(inner); - name_ = FusionGuard::getCurFusion()->registerExpr(this); } Merge::Merge(const Merge* src, IrCloner* ir_cloner) @@ -1477,6 +1598,12 @@ bool Merge::sameAs(const Statement* other) const { return Expr::sameAs(other); } +NamedScalar::NamedScalar( + IrBuilderPasskey passkey, + std::string name, + DataType dtype) + : Val(passkey, ValType::NamedScalar, dtype), name_(std::move(name)) {} + NamedScalar::NamedScalar(const NamedScalar* src, IrCloner* ir_cloner) : Val(src, ir_cloner), name_(src->name_) {} @@ -1495,13 +1622,15 @@ NamedScalar* NamedScalar::getParallelDim(ParallelType p_type) { isParallelTypeThread(p_type), "Cannot get parallel dim of non thread type, received: ", p_type); + TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() != nullptr); std::string parallel_dim = stringifyThreadSize(p_type); - return new NamedScalar(parallel_dim, DataType::Int); + return IrBuilder::create(parallel_dim, DataType::Int); } NamedScalar* NamedScalar::getParallelIndex(ParallelType p_type) { + TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() != nullptr); std::string parallel_ind = stringifyThread(p_type); - return new NamedScalar(parallel_ind, DataType::Int); + return IrBuilder::create(parallel_ind, DataType::Int); } c10::optional NamedScalar::getParallelDim() const { diff --git a/torch/csrc/jit/codegen/cuda/ir_printer.h b/torch/csrc/jit/codegen/cuda/ir_printer.h index a2c14386147..91d07b76b80 100644 --- a/torch/csrc/jit/codegen/cuda/ir_printer.h +++ b/torch/csrc/jit/codegen/cuda/ir_printer.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.cpp b/torch/csrc/jit/codegen/cuda/ir_utils.cpp index 5bf05b0f516..004cfa23dff 100644 --- a/torch/csrc/jit/codegen/cuda/ir_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/ir_utils.cpp @@ -1,5 +1,6 @@ 
#include #include +#include #include #include @@ -140,7 +141,8 @@ struct SubstituteInExpr : public OptInDispatch { reference_->sameAs(unary_expr->in()) ? substitute_ : unary_expr->in(); auto out = reference_->sameAs(unary_expr->out()) ? substitute_ : unary_expr->out(); - expr_ = new UnaryOp(unary_expr->getUnaryOpType(), out, in); + expr_ = IrBuilder::create( + unary_expr->container(), unary_expr->getUnaryOpType(), out, in); } void handle(BinaryOp* binary_expr) final { @@ -151,7 +153,12 @@ struct SubstituteInExpr : public OptInDispatch { auto out = reference_->sameAs(binary_expr->out()) ? substitute_ : binary_expr->out(); - expr_ = new BinaryOp(binary_expr->getBinaryOpType(), out, lhs, rhs); + expr_ = IrBuilder::create( + binary_expr->container(), + binary_expr->getBinaryOpType(), + out, + lhs, + rhs); } void handle(TernaryOp* ternary_expr) final { @@ -163,7 +170,13 @@ struct SubstituteInExpr : public OptInDispatch { : ternary_expr->in3(); auto out = reference_->sameAs(ternary_expr->out()) ? substitute_ : ternary_expr->out(); - expr_ = new TernaryOp(ternary_expr->getTernaryOpType(), out, in1, in2, in3); + expr_ = IrBuilder::create( + ternary_expr->container(), + ternary_expr->getTernaryOpType(), + out, + in1, + in2, + in3); } void handle(ReductionOp* reduction_expr) final { @@ -176,8 +189,12 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(reduction_expr->in()) ? substitute_ : reduction_expr->in(); - expr_ = - new ReductionOp(reduction_expr->getReductionOpType(), init, out, in); + expr_ = IrBuilder::create( + reduction_expr->container(), + reduction_expr->getReductionOpType(), + init, + out, + in); } void handle(BroadcastOp* broadcast_expr) final { @@ -187,7 +204,11 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(broadcast_expr->in()) ? substitute_ : broadcast_expr->in(); - expr_ = new BroadcastOp(out, in, broadcast_expr->getBroadcastDimFlags()); + expr_ = IrBuilder::create( + broadcast_expr->container(), + out, + in, + broadcast_expr->getBroadcastDimFlags()); } void handle(TransposeOp* transpose_expr) final { @@ -201,7 +222,8 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(transpose_expr->in()) ? substitute_->as() : transpose_expr->in(); - expr_ = new TransposeOp(out, in, transpose_expr->new2old()); + expr_ = IrBuilder::create( + transpose_expr->container(), out, in, transpose_expr->new2old()); } void handle(ShiftOp* shift_expr) final { @@ -210,7 +232,12 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(shift_expr->in()) ? substitute_ : shift_expr->in(); - expr_ = new ShiftOp(out, in, shift_expr->offsets(), shift_expr->pad()); + expr_ = IrBuilder::create( + shift_expr->container(), + out, + in, + shift_expr->offsets(), + shift_expr->padWidth()); } void handle(GatherOp* gather_expr) final { @@ -219,8 +246,12 @@ struct SubstituteInExpr : public OptInDispatch { auto in = reference_->sameAs(gather_expr->in()) ? substitute_ : gather_expr->in(); - expr_ = new GatherOp( - out, in, gather_expr->windowShape(), gather_expr->padWidth()); + expr_ = IrBuilder::create( + gather_expr->container(), + out, + in, + gather_expr->windowShape(), + gather_expr->padWidth()); } void handle(ViewOp* view_expr) final { @@ -234,7 +265,7 @@ struct SubstituteInExpr : public OptInDispatch { auto out = reference_->sameAs(view_expr->out()) ? 
substitute_->as() : view_expr->out(); - expr_ = new ViewOp(out, in); + expr_ = IrBuilder::create(view_expr->container(), out, in); } void handle(WelfordOp* welford_expr) final { @@ -268,7 +299,8 @@ struct SubstituteInExpr : public OptInDispatch { welford_expr->initN() && reference_->sameAs(welford_expr->initN()) ? substitute_ : welford_expr->initN(); - expr_ = new WelfordOp( + expr_ = IrBuilder::create( + welford_expr->container(), out_avg, out_var, out_N, @@ -402,13 +434,31 @@ std::vector allTvs(Fusion* fusion) { return uniqueEntries({used_tvs.begin(), used_tvs.end()}); } -std::vector historyOf(TensorDomain* td) { - return ExprSort::getExprs( - td->fusion(), {td->domain().begin(), td->domain().end()}); -} - -std::vector historyOf(TensorView* tv) { - return historyOf(tv->domain()); +std::vector getReductionOps(Fusion* fusion) { + std::vector red_ops; + for (auto expr : fusion->exprs()) { + const Val* out_val = nullptr; + if (expr->isA()) { + out_val = expr->as()->out(); + } else if (expr->isA()) { + out_val = expr->as()->outAvg(); + } else { + continue; + } + if (out_val == nullptr || !out_val->isA()) { + continue; + } + auto out_tv = out_val->as(); + if (std::any_of( + out_tv->getRootDomain().begin(), + out_tv->getRootDomain().end(), + [](IterDomain* id) { + return id->isReduction() && !id->isTrivialReduction(); + })) { + red_ops.push_back(expr); + } + } + return red_ops; } } // namespace ir_utils diff --git a/torch/csrc/jit/codegen/cuda/ir_utils.h b/torch/csrc/jit/codegen/cuda/ir_utils.h index c8dc2e6f679..1bf3f27ec0b 100644 --- a/torch/csrc/jit/codegen/cuda/ir_utils.h +++ b/torch/csrc/jit/codegen/cuda/ir_utils.h @@ -110,6 +110,9 @@ auto filterByType(InputIt first, InputIt last) { return FilteredView(first, last); } +template +auto filterByType(const ContainerType&& inputs) = delete; + template auto filterByType(const ContainerType& inputs) { return filterByType(inputs.cbegin(), inputs.cend()); @@ -175,11 +178,7 @@ TORCH_CUDA_CU_API std::vector outputTvsOf( // returns all tensor views in fusion that are used between outputs and inputs. TORCH_CUDA_CU_API std::vector allTvs(Fusion* fusion); -// Returns the history of expressions applied to the domains of td -TORCH_CUDA_CU_API std::vector historyOf(TensorDomain* td); - -// Returns the history of expressions applied to the domains of tv -TORCH_CUDA_CU_API std::vector historyOf(TensorView* tv); +TORCH_CUDA_CU_API std::vector getReductionOps(Fusion* fusion); } // namespace ir_utils } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp index 344df98f5a7..894b40f79e3 100644 --- a/torch/csrc/jit/codegen/cuda/iter_visitor.cpp +++ b/torch/csrc/jit/codegen/cuda/iter_visitor.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace torch { @@ -31,21 +32,94 @@ void remove_visited( } } +// Return all dependencies of a node including members of the node. 
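// For example, a TensorView's members include its TensorDomain, the
// TensorDomain's members are its IterDomains, and an IterDomain's members are
// the scalar Vals used for its start, extent, and stop offset (see the
// handle() overloads below).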
+class RecursiveDependencies : public OptInDispatch { + public: + static std::vector next(Statement* stmt) { + RecursiveDependencies find_next(stmt); + return find_next.next_stmts_; + } + + private: + RecursiveDependencies() = default; + + RecursiveDependencies(Statement* stmt) { + handle(stmt); + } + + using OptInDispatch::handle; + + void handle(Expr* expr) final { + FusionGuard::getCurFusion()->assertInContainer( + expr, + "IterVisitor.cpp::RecursiveDependencies::handle(Expr*) Cannot traverse expr, "); + next_stmts_.insert( + next_stmts_.end(), expr->inputs().begin(), expr->inputs().end()); + } + + void handle(Val* val) final { + FusionGuard::getCurFusion()->assertInContainer( + val, + "IterVisitor.cpp::RecursiveDependencies::handle(Val*) Cannot traverse val, "); + OptInDispatch::handle(val); + } + + void simpleVal(Val* val) { + if (val->definition() == nullptr) { + return; + } + next_stmts_.push_back(val->definition()); + } + + void handle(Bool* stmt) final { + simpleVal(stmt); + } + + void handle(Double* stmt) final { + simpleVal(stmt); + } + + void handle(Int* stmt) final { + simpleVal(stmt); + } + + void handle(NamedScalar* stmt) final { + simpleVal(stmt); + } + + void handle(IterDomain* stmt) final { + next_stmts_.push_back(stmt->start()); + next_stmts_.push_back(stmt->extent()); + next_stmts_.push_back(stmt->stopOffset()); + simpleVal(stmt); + } + + void handle(TensorDomain* stmt) final { + next_stmts_.insert( + next_stmts_.end(), stmt->domain().begin(), stmt->domain().end()); + simpleVal(stmt); + } + + void handle(TensorView* tv) final { + next_stmts_.push_back(tv->domain()); + simpleVal(tv); + } + + std::vector next_stmts_; +}; + } // namespace std::vector IterVisitor::next(Statement* stmt) { if (stmt->isVal()) { return next(stmt->as()); - } else if (stmt->isExpr()) { - return next(stmt->as()); } else { - TORCH_INTERNAL_ASSERT( - false, "IterVisitor could not detect type in next_dispatch."); + return next(stmt->as()); } } std::vector IterVisitor::next(Val* v) { - FusionGuard::getCurFusion()->assertInFusion(v, "Cannot traverse val, "); + FusionGuard::getCurFusion()->assertInContainer(v, "Cannot traverse val, "); if (v->definition() != nullptr) { return {v->definition()}; } @@ -53,7 +127,8 @@ std::vector IterVisitor::next(Val* v) { } std::vector IterVisitor::next(Expr* expr) { - FusionGuard::getCurFusion()->assertInFusion(expr, "Cannot traverse expr, "); + FusionGuard::getCurFusion()->assertInContainer( + expr, "Cannot traverse expr, "); std::vector next_stmts{ expr->inputs().begin(), expr->inputs().end()}; return next_stmts; @@ -93,7 +168,8 @@ void IterVisitor::handle(Val* v) { void IterVisitor::traverseFrom( Fusion* fusion, const std::vector& from, - bool traverseAllPaths) { + bool traverseAllPaths, + bool traverseIntoMembers) { FusionGuard fg(fusion); std::unordered_set visited; @@ -137,7 +213,8 @@ void IterVisitor::traverseFrom( } else { // We're not ready to process this node, so add all its inputs to be // checked Visit input nodes. - auto next_stmts = next(stmt); + auto next_stmts = + traverseIntoMembers ? RecursiveDependencies::next(stmt) : next(stmt); // We may want to retraverse nodes, in that case revisit everything! if (!traverseAllPaths) { // If we don't want to retraverse, remove nodes we already visisted. 
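// Illustrative sketch of the traverseIntoMembers path (RecordingVisitor and the
// surrounding setup are assumptions for the example, not part of the API):
//
//   class RecordingVisitor : public IterVisitor {
//    public:
//     std::vector<Statement*> visited;
//
//    protected:
//     void handle(Statement* stmt) override {
//       visited.push_back(stmt);
//     }
//   };
//
//   RecordingVisitor visitor;
//   visitor.traverseFrom(
//       fusion,
//       fusion->getTerminatingOutputs(),
//       /*traverseAllPaths=*/false,
//       /*traverseIntoMembers=*/true);
//
// With traverseIntoMembers=true, RecursiveDependencies::next() is used instead
// of IterVisitor::next(), so `visited` also contains TensorDomains, IterDomains,
// and the scalar Vals backing their starts and extents.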
@@ -308,7 +385,7 @@ void BackwardVisitor::traverseFrom( auto vals = AllVals::get(fusion, from); - auto exprs = ExprSort::getExprs(fusion, from); + auto exprs = StmtSort::getExprs(fusion, from); { size_t pos = 0; @@ -704,22 +781,41 @@ std::unordered_set DependencyCheck::getAllDependentVals( return DependentVals::getAllDependentVals(of); } -void ExprSort::handle(Expr* expr) { - exprs.push_back(expr); +void StmtSort::handle(Statement* stmt) { + stmts.push_back(stmt); } -std::vector ExprSort::getExprs(Fusion* fusion) { - ExprSort es; - es.traverse(fusion); - return es.exprs; +std::vector StmtSort::getExprs(Fusion* fusion, bool traverse_members) { + auto terminating_outputs = fusion->getTerminatingOutputs(); + return StmtSort::getExprs(fusion, terminating_outputs, traverse_members); } -std::vector ExprSort::getExprs( +std::vector StmtSort::getExprs( Fusion* fusion, - const std::vector& from) { - ExprSort es; - es.traverseFrom(fusion, from, false); - return es.exprs; + const std::vector& from, + bool traverse_members) { + StmtSort es; + es.traverseFrom(fusion, from, false, traverse_members); + auto stmts = StmtSort::getStmts(fusion, from, traverse_members); + auto filter = ir_utils::filterByType(stmts.begin(), stmts.end()); + std::vector exprs(filter.begin(), filter.end()); + return exprs; +} + +std::vector StmtSort::getStmts( + Fusion* fusion, + bool traverse_members) { + auto terminating_outputs = fusion->getTerminatingOutputs(); + return StmtSort::getStmts(fusion, terminating_outputs, traverse_members); +} + +std::vector StmtSort::getStmts( + Fusion* fusion, + const std::vector& from, + bool traverse_members) { + StmtSort es; + es.traverseFrom(fusion, from, false, traverse_members); + return es.stmts; } void InputsOf::handle(Val* v) { diff --git a/torch/csrc/jit/codegen/cuda/iter_visitor.h b/torch/csrc/jit/codegen/cuda/iter_visitor.h index d4aa56ea2fe..2447933d737 100644 --- a/torch/csrc/jit/codegen/cuda/iter_visitor.h +++ b/torch/csrc/jit/codegen/cuda/iter_visitor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -83,18 +83,21 @@ class TORCH_CUDA_CU_API IterVisitor : public OptOutDispatch { void traverseHelper(Fusion* fusion, bool traverse_all_paths = false); public: - // Starts at nodes provided in from, traverses from these nodes to inputs. - // Calls handle on all Statement*s in topological sorted order. - // traverseAllPaths = false only call handle on each Statement* once - // traverseAllPaths = true traverses all paths from nodes in from to inputs. - // Handle on a Statement* for every path from "from" nodes, to inputs. - // to argument allows specification of nodes to stop at if we want to stop - // beffore we hit all leaf nodes. This can be helpful when we want to traverse - // from TensorView::domain(), to the rfactor domain, instead of root domain. + //! Starts at nodes provided in from, traverses from these nodes to inputs. + //! Calls handle on all Statement*s in topological sorted order. + //! \param traverseAllPaths = false only call handle on each Statement* once + //! traverseAllPaths = true traverses all paths from nodes in from to + //! inputs. Calls handle on a Statement* for every path from "from" nodes, + //! to inputs. + //! \param traverseIntoMembers = When hitting nodes like TensorView, + //! TensorDomain, or IterDomain where there are members of the nodes that are + //! Val's a value of "true" will also traverse into those member Val's, a + //! value of "false" will not traverse into the members. 
void traverseFrom( Fusion* fusion, const std::vector& from, - bool traverseAllPaths = false); + bool traverseAllPaths = false, + bool traverseIntoMembers = false); // Iterates from terminating outputs registered with the fusion. Terminating // means value is not used to generate any other value used in producing @@ -246,18 +249,40 @@ class TORCH_CUDA_CU_API DependencyCheck { // Expr sort will take a fusion and return a topologically sorted list of // expressions. -class ExprSort : public IterVisitor { +class StmtSort : public IterVisitor { protected: - std::vector exprs; + std::vector stmts; - void handle(Expr* expr) override; + void handle(Statement* stmt) override; public: - static std::vector getExprs(Fusion* fusion); - + // If traverse_members it will also extract all member nodes in the sorted + // expr list in the fusion. i.e. all expressions on IterDomains, extents, etc static std::vector getExprs( Fusion* fusion, - const std::vector& from); + bool traverse_members = false); + + // If traverse_members it will also extract all member nodes in the sorted + // expr list in the fusion. i.e. all expressions on IterDomains, extents, etc + static std::vector getExprs( + Fusion* fusion, + const std::vector& from, + bool traverse_members = false); + + // If traverse_members it will also extract all member nodes in the sorted + // statement list in the fusion. i.e. all IterDomains, extents, and associated + // expressions of them + static std::vector getStmts( + Fusion* fusion, + bool traverse_members = false); + + // If traverse_members it will also extract all member nodes in the sorted + // expr list in the fusion. i.e. all IterDomains, extents, and associated + // expressions of them + static std::vector getStmts( + Fusion* fusion, + const std::vector& from, + bool traverse_members = false); }; class InputsOf : public IterVisitor { diff --git a/torch/csrc/jit/codegen/cuda/kernel.cpp b/torch/csrc/jit/codegen/cuda/kernel.cpp index d3ef9eeb95d..b9062f5bc45 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel.cpp @@ -1,7 +1,8 @@ #include +#include #include #include -#include +#include #include #include @@ -11,22 +12,24 @@ namespace torch { namespace jit { namespace fuser { namespace cuda { + +IrBuilderPasskey::IrBuilderPasskey(IrContainer* ir_container) + : ir_container_(ir_container) {} + namespace kir { namespace { //! Scan all primary expressions in the Kernel IR and build //! 
lists of specialized nodes and other interesting information -class KernelIrScanner : private kir::IrVisitor { +class KernelIrScanner : private IrVisitor { public: explicit KernelIrScanner(const Kernel* kernel) { - for (const auto& ir_node : kernel->irNodes()) { - ir_node->accept(this); - } + IrVisitor::handle(kernel->topLevelExprs()); const auto gpu_lower = GpuLower::current(); for (auto split : gpu_lower->nonDivisibleSplitInfo().splitsToValidate()) { - auto extent = gpu_lower->lowerValue(split->in()->extent()); - auto factor = gpu_lower->lowerValue(split->factor()); + auto extent = split->in()->extent(); + auto factor = split->factor(); summary_.splits_to_validate.emplace_back(extent, factor); } } @@ -36,7 +39,17 @@ class KernelIrScanner : private kir::IrVisitor { } private: - void visit(const kir::Sync* sync) final { + using IrVisitor::handle; + void handle(Expr* expr) final { + IrVisitor::handle(expr); + for (auto inp : expr->inputs()) { + handle(inp); + } + for (auto out : expr->outputs()) { + handle(out); + } + } + void handle(Sync* sync) final { // TODO: Move to a dedicated validation pass // which is not on the common execution/compilation path if (sync->isWarHazardSync()) { @@ -44,7 +57,7 @@ class KernelIrScanner : private kir::IrVisitor { } } - void visit(const kir::Allocate* allocate) final { + void handle(Allocate* allocate) final { switch (allocate->memoryType()) { case MemoryType::Global: summary_.global_allocations.push_back(allocate); @@ -65,28 +78,23 @@ class KernelIrScanner : private kir::IrVisitor { } } - void visit(const kir::UnaryOp* unary_op) final { - if (unary_op->operation() == UnaryOpType::RandLike) { + void handle(UnaryOp* unary_op) final { + if (unary_op->getUnaryOpType() == UnaryOpType::RandLike) { // This kernel is using random numbers summary_.is_stochastic = true; } } - void visit(const kir::TensorIndex* tensor_index) final { + void handle(TensorIndex* tensor_index) final { const auto tv = tensor_index->view(); const auto domain = tv->domain(); - // Do we have any reductions? summary_.has_block_reductions = summary_.has_block_reductions || domain->hasBlockReduction(); - // Do we have block broadcasts? 
- summary_.has_block_broadcasts = - summary_.has_block_broadcasts || domain->hasBlockBroadcast(); - // Update the largest smem data type if (domain->hasBlockReduction() || domain->hasGridReduction() || - tv->memoryType() == MemoryType::Shared) { + tv->getMemoryType() == MemoryType::Shared) { const auto data_type = tv->dtype(); const size_t type_size = dataTypeSize(data_type); if (type_size > max_smem_type_size_) { @@ -94,38 +102,50 @@ class KernelIrScanner : private kir::IrVisitor { summary_.largest_smem_data_type = data_type; } } - - // Update Welford - if (tensor_index->definition() != nullptr && - tensor_index->definition()->isA()) { - summary_.has_welford = true; - summary_.has_block_welford = - summary_.has_block_welford || domain->hasBlockReduction(); - summary_.has_grid_welford = - summary_.has_grid_welford || domain->hasGridReduction(); - } } - void visit(const kir::GridWelford* grid_welford) final { - const auto dom = grid_welford->welford_op() - ->out() - ->as() - ->view() - ->domain(); + void handle(WelfordOp* welford_op) final { + summary_.has_welford = true; + TORCH_INTERNAL_ASSERT(welford_op->outAvg()->isA()); + auto out_dom = welford_op->outAvg()->as()->view()->domain(); + summary_.has_block_welford = + summary_.has_block_welford || out_dom->hasBlockReduction(); + } + + void handle(GridWelford* grid_welford) final { + summary_.has_welford = true; + summary_.has_grid_welford = true; + const auto dom = + grid_welford->welford_op()->out()->as()->view()->domain(); updateGridReductionInLoop(dom); } - void visit(const kir::GridReduction* grid_reduction) final { + void handle(GridReduction* grid_reduction) final { + summary_.has_grid_reductions = true; const auto dom = grid_reduction->reduction_op() ->out() - ->as() + ->as() ->view() ->domain(); updateGridReductionInLoop(dom); } - void visit(const kir::GridBroadcast*) final { + void handle(GridBroadcast* grid_broadcast) final { summary_.has_cooperative_grid_reduction = true; + handle(grid_broadcast->broadcast_op()); + } + + void handle(BroadcastOp* bop) final { + const ParallelTypeBitmap parallel_types = + GpuLower::current()->threadPredMap().getParallelBroadcastDomains( + bop->out()->as()->view()); + summary_.broadcast_parallel_types.emplace(bop, parallel_types); + // Do we have block broadcasts? + summary_.has_block_broadcasts = + summary_.has_block_broadcasts || parallel_types.hasTID(); + // Do we have grid broadcasts? + summary_.has_grid_broadcasts = + summary_.has_grid_broadcasts || parallel_types.hasBID(); } private: @@ -136,10 +156,9 @@ class KernelIrScanner : private kir::IrVisitor { void updateGridReductionInLoop(TensorDomain* dom) { summary_.has_grid_reductions = true; - const auto gpu_lower = GpuLower::current(); for (const auto i : c10::irange(dom->nDims())) { - const auto id = - gpu_lower->caParallelMap().getConcreteMappedID(dom->domain()[i]); + const auto id = GpuLower::current()->caParallelMap().getConcreteMappedID( + dom->domain()[i]); summary_.has_cooperative_grid_reduction = summary_.has_cooperative_grid_reduction || @@ -169,7 +188,7 @@ class KernelIrScanner : private kir::IrVisitor { //! MemoryType::Global for tensors parallelized with blockIdx), it is //! assumed that allocation is properly extended for the iteration //! count. 
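// Illustrative sketch (not part of this commit) of the property this pass
// enforces, expressed with the usual TensorView scheduling calls; the tensor
// name `tv` is hypothetical.
//
//   tv->axis(-1)->parallelize(ParallelType::TIDx);
//   // If the TIDx loop's iteration count can exceed the launch dimension,
//   // the buffer must be visible across threads, so:
//   tv->setMemoryType(MemoryType::Shared);  // Local memory would trip the
//                                           // assertion in this pass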
-class ValidateAllocation : private kir::IrVisitor { +class ValidateAllocation : private OptOutConstDispatch { public: static void validate(const Kernel* kernel) { ValidateAllocation validate_allocation(kernel); @@ -178,14 +197,14 @@ class ValidateAllocation : private kir::IrVisitor { private: explicit ValidateAllocation(const Kernel* kernel) { live_allocations_.emplace_back(std::vector()); - for (const auto& ir_node : kernel->topLevelExprs()) { - ir_node->accept(this); + for (const auto& expr : kernel->topLevelExprs()) { + OptOutConstDispatch::handle(expr); } live_allocations_.pop_back(); TORCH_INTERNAL_ASSERT(live_allocations_.empty()); } - void visit(const kir::Allocate* allocate) final { + void handle(const Allocate* allocate) final { TORCH_INTERNAL_ASSERT(!live_allocations_.empty()); live_allocations_.back().push_back(allocate); } @@ -195,53 +214,52 @@ class ValidateAllocation : private kir::IrVisitor { // during in the allocation lowering if it's thread-parallel and not // allocated on shared or global memories, or if it's block-parallel // ando not allocated on global memory. - void validate(const kir::ForLoop* for_loop) { + void validate(const ForLoop* for_loop) { const auto loop_id = for_loop->iter_domain(); - const auto gpu_lower = GpuLower::current(); for (const auto& allocations : live_allocations_) { for (const auto& allocate : allocations) { - const auto tv = dynamic_cast(allocate->buffer()); + const auto tv = dynamic_cast(allocate->buffer()); if (tv == nullptr) { continue; } for (const auto& axis : tv->domain()->domain()) { - if (!gpu_lower->caParallelMap().areMapped(loop_id, axis)) { + if (!GpuLower::current()->caParallelMap().areMapped(loop_id, axis)) { continue; } - if (isParallelTypeThreadDim(loop_id->parallelType())) { + if (isParallelTypeThreadDim(loop_id->getParallelType())) { TORCH_INTERNAL_ASSERT( - tv->memoryType() == MemoryType::Shared || - tv->memoryType() == MemoryType::Global, + tv->getMemoryType() == MemoryType::Shared || + tv->getMemoryType() == MemoryType::Global, "Tensor t", tv->name(), " must be allocated on SMEM or GMEM."); - } else if (isParallelTypeBlockDim(loop_id->parallelType())) { - TORCH_INTERNAL_ASSERT(tv->memoryType() == MemoryType::Global); + } else if (isParallelTypeBlockDim(loop_id->getParallelType())) { + TORCH_INTERNAL_ASSERT(tv->getMemoryType() == MemoryType::Global); } } } } } - void visit(const kir::ForLoop* for_loop) final { + void handle(const ForLoop* for_loop) final { if (for_loop->stop() != for_loop->iter_domain()->extent() && - isParallelTypeThread(for_loop->iter_domain()->parallelType())) { + isParallelTypeThread(for_loop->iter_domain()->getParallelType())) { validate(for_loop); } live_allocations_.emplace_back(std::vector()); for (const auto& expr : for_loop->body().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } live_allocations_.pop_back(); } - void visit(const kir::IfThenElse* ite) final { + void handle(const IfThenElse* ite) final { for (const auto& expr : ite->thenBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } for (const auto& expr : ite->elseBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } @@ -252,11 +270,9 @@ class ValidateAllocation : private kir::IrVisitor { } // namespace // TODO(kir): Kernel IR validation -void Kernel::finalize(std::vector top_level_exprs) { - TORCH_CHECK(top_level_exprs_.empty()); +void Kernel::finalize(std::vector top_level_exprs) { + TORCH_INTERNAL_ASSERT(top_level_exprs_.empty()); top_level_exprs_ = 
std::move(top_level_exprs); - predicate_map_ = std::make_unique( - GpuLower::current()->threadPredMap()); warp_padded_parallel_info_ = GpuLower::current()->getWarpPaddedParallelInfo(); ValidateAllocation::validate(this); analyze(); @@ -270,8 +286,63 @@ void Kernel::analyze() { } void Kernel::print() const { - kir::IrPrinter ir_printer(std::cout); - ir_printer.printKernel(this); + IrPrinter ir_printer(std::cout); + ir_printer.handle(this); +} + +//! Register the Val with this fusion +void Kernel::registerVal(Val* val) { + if (inContainer(val)) { + return; + } + if (val->kernel()) { + TORCH_CHECK( + val->kernel() == this, + val->toString(), + " was not found in the active kernel."); + } + + Fusion::registerVal(val); +} + +//! Register expr with this fusion. +//! When we register an expression, we want to update the dependency tracking +//! of Vals. We add expr to our general expr_set_, +void Kernel::registerExpr(Expr* expr) { + if (inContainer(expr)) { + return; + } + + if (expr->kernel()) { + TORCH_CHECK( + expr->kernel() == this, + expr->toString(), + " was not found in the active kernel."); + } + + for (Val* input : expr->inputs()) { + TORCH_INTERNAL_ASSERT( + inContainer(input), + "Input\n", + input->toString(), + " to expr,\n", + expr->toString(), + ",\n is invalid because it is not in the same kernel."); + } + + for (Val* output : expr->outputs()) { + TORCH_INTERNAL_ASSERT( + inContainer(output), + "Output\n", + output->toString(), + " to expr,\n", + expr->toString(), + ",\n is invalid because it is not in the same kernel."); + } + + // Register expr is explicitly non-SSA when coming from a kernel. This is + // detected inside Fusion::registerExpr + Fusion::registerExpr(expr); } } // namespace kir diff --git a/torch/csrc/jit/codegen/cuda/kernel.h b/torch/csrc/jit/codegen/cuda/kernel.h index b273324e1e2..0c8bbdef9df 100644 --- a/torch/csrc/jit/codegen/cuda/kernel.h +++ b/torch/csrc/jit/codegen/cuda/kernel.h @@ -1,12 +1,15 @@ #pragma once -#include -#include -#include +#include + +#include +#include +#include #include #include #include +#include #include #include @@ -47,6 +50,9 @@ struct KernelSummary { //! Do we have any block broadcasts? bool has_block_broadcasts = false; + //! Do we have any grid broadcasts? + bool has_grid_broadcasts = false; + //! Do we have any welford op? bool has_welford = false; @@ -67,87 +73,47 @@ struct KernelSummary { std::vector dynamic_lmem_allocations; //! ceilDiv extents that must be divisible - std::vector> splits_to_validate; + std::vector> splits_to_validate; + + //! Effective ParallelTypes of broadcast ops + std::unordered_map + broadcast_parallel_types; }; //! Container for a lowered Kernel IR //! -//! TODO(kir): currently, it is just pointing to nodes owned -//! by a Fusion object. The goal is to have the Kernel object -//! own the Kernel IR nodes -//! // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API Kernel final : public NonCopyable { +class TORCH_CUDA_CU_API Kernel final : public Fusion { public: - Kernel() = default; + // Kernel starts by grabbing all the nodes from the provided fusion. + // Kernel is not SSA, if a definition is not set, we should update it, but + // not remove previous definition if it is set. This is primarily because when + // we do something like generate an initialization statement for a reduction + // TV, we may want to continue to do fusion like analysis on the original + // expression. 
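// Illustrative sketch (not part of this commit): with Kernel now deriving
// from Fusion, lowering can copy a scheduled fusion wholesale and seal it
// later. `lowered_exprs` stands in for the top-level expressions produced by
// the lowering passes and is hypothetical here.
//
//   Fusion fusion;
//   // ... build and schedule the fusion IR ...
//   kir::Kernel kernel(&fusion);      // copies all Vals/Exprs from `fusion`
//   kernel.finalize(lowered_exprs);   // validates allocations and builds the
//                                     // KernelSummary via analyze()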
+ Kernel(Fusion* fusion) : Fusion(*fusion) {} + + Kernel() = delete; + + // No move or copy semantics + Kernel(const Kernel&) = delete; + Kernel& operator=(const Kernel&) = delete; //! Finalize a kernel definition //! //! At this point we have a complete kernel definition and we can //! run analysis passes to build a KernelSummary //! - void finalize(std::vector top_level_exprs); + void finalize(std::vector top_level_exprs); - //! Register input as an input of the kernel - void addInput(Val* input) { - inputs_.push_back(input); - input_set_.insert(input); - } - - //! Register output as an output of the kernel - void addOutput(Val* output) { - outputs_.push_back(output); - output_set_.insert(output); - } - - const auto& inputs() const { - return inputs_; - } - - const auto& outputs() const { - return outputs_; - } - - bool isInput(Val* val) const { - return input_set_.find(val) != input_set_.end(); - } - - bool isOutput(Val* val) const { - return output_set_.find(val) != output_set_.end(); - } - - const auto& topLevelExprs() const { + const std::vector& topLevelExprs() const { return top_level_exprs_; } - const auto& irNodes() const { - return ir_nodes_; - } - const KernelSummary& summary() const { return summary_; } - const ThreadPredicateMap& predicateMap() const { - return *predicate_map_; - } - - //! Register a new Kernel IR node - //! - //! \note This is a specialized helper for kir::IrBuilder, not - //! intendted for general use - //! - void registerIrNode(kir::Passkey passkey, std::unique_ptr node) { - TORCH_CHECK(passkey.kernel == this); - ir_nodes_.push_back(std::move(node)); - } - - //! Allocates a new value identifier - kir::ValueId newValueId(kir::Passkey passkey) { - TORCH_CHECK(passkey.kernel == this); - return next_value_id_++; - } - //! Checks if parallel type is padded bool isParallelTypePadded(ParallelType ptype) const { return ptype == ParallelType::TIDx && @@ -161,32 +127,26 @@ class TORCH_CUDA_CU_API Kernel final : public NonCopyable { //! Debug dump of the Kernel IR void print() const; + protected: + //! Register the Val with this fusion + void registerVal(Val* val) override; + + //! Register expr with this fusion. + //! When we register an expression, we want to update the dependency tracking + //! of Vals. 
We add expr to our general expr_set_, + void registerExpr(Expr* expr) override; + private: // Analyze the kernel IR and caches the summary of interesting data void analyze(); private: - // Kernel IR nodes - std::vector> ir_nodes_; - // Top level statements - std::vector top_level_exprs_; - - // Kernel inputs and outputs - std::vector inputs_; - std::vector outputs_; - std::unordered_set input_set_; - std::unordered_set output_set_; - - // Used to allocate unique value IDs - kir::ValueId next_value_id_ = 1; + std::vector top_level_exprs_; // Summary of interesting kernel data KernelSummary summary_; - // Predicate map - // TODO(kir): consider a simpler, kernel IR based version - std::unique_ptr predicate_map_; WarpPaddedParallelInfo warp_padded_parallel_info_; }; diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp index 39350876bd2..c1c113dbbc4 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.cpp @@ -7,6 +7,7 @@ #include #include +#include namespace torch { namespace jit { @@ -25,6 +26,10 @@ int getCommonDeviceCUDA(const at::ArrayRef& inputs) { continue; } const auto& device = input.toTensor().device(); + // skip cpu scalar tensor as they'll be promoted to scalar later + if (device.is_cpu() && is_cpu_scalar(input.toTensor())) { + continue; + } TORCH_CHECK(device.is_cuda(), "nvfuser only supports cuda device"); auto cur_index = device.index(); if (index != -1 && index != cur_index) { @@ -202,9 +207,9 @@ FusionKernelRuntime* FusionExecutorCache::getKernelRuntimeFor( } // Access kernels associated with the common device id - auto dev_id = getCommonDeviceCUDA(inputs); - TORCH_INTERNAL_ASSERT(dev_id >= 0); - auto& kernel_runtimes = kernel_runtimes_[dev_id]; + auto device_index = getCommonDeviceCUDA(inputs); + TORCH_CHECK(device_index >= 0, "device is not coherent for fusion inputs"); + auto& kernel_runtimes = kernel_runtimes_[device_index]; // Check for re-use hit case // a kernel runtime is re-usable if all the compiled @@ -277,14 +282,6 @@ FusionKernelRuntime::FusionKernelRuntime( } else { auto complete_fusion_heuristic = maybe_complete_fusion_heuristic.value(); - // Translate welfords if apply - if (fusion_copy->hasWelford()) { - bool translated = SegmentCandidateFinder::TranslateWelfordInFusion( - fusion_copy.get(), inputs); - if (translated) { - complete_fusion_heuristic = ScheduleHeuristic::Persistent; - } - } // Take ownership of the transformed fusion single_kernel_fusion_ = std::move(fusion_copy); @@ -358,7 +355,7 @@ std::vector FusionKernelRuntime::runKernelWithInput( launch_params = scheduler_entry->pointwiseParams().lparams; } executors_[group_id].compileFusion( - fusion_to_run.get(), options, inputs, launch_params); + fusion_to_run.get(), inputs, launch_params, options); } else { // Load launch params for reduction and normalization kernels if (scheduler_entry->hasReductionParam()) { @@ -453,6 +450,7 @@ std::vector FusionKernelRuntime::runWithInput( " inputs but expecting ", segmented_fusion_->inputs().size()); + c10::Device device(c10::DeviceType::CUDA, 0); int extent_index_ = 0; // Bind input in the tensor_map for (const auto i : c10::irange(inputs.size())) { @@ -466,6 +464,7 @@ std::vector FusionKernelRuntime::runWithInput( // more convenient and safer than replication if (inputs[i].isTensor()) { auto aten_tensor = inputs[i].toTensor(); + device = aten_tensor.device(); for (auto dim_size : aten_tensor.sizes()) { runtime_workspace_.tensor_map.emplace( 
runtime_workspace_.group_extent_binding_order[extent_index_++], @@ -504,14 +503,30 @@ std::vector FusionKernelRuntime::runWithInput( if (iter != runtime_workspace_.tensor_map.end()) { fusion_outputs.push_back(iter->second); } else { + bool empty_type_check = output->getDataType().has_value() && + output->getDataType().value() == DataType::Float; + + // Only support two cases of empty tensor here, since + // this is hot path. + auto out_tv = output->as(); + + // TODO: should be only one of the two once the "empty" + // definition has been unified throughout the ops. + bool empty_tensor_check = + out_tv->isZeroDim() || out_tv->isEmptyTensor(); + // This is the check for an empty tensor; TORCH_INTERNAL_ASSERT( - output->as()->nDims() == 0 && - output->getDataType().has_value() && - output->getDataType().value() == DataType::Float, + empty_tensor_check && empty_type_check, "Non empty tensor cannot be found at tensor_map in ", __FUNCTION__); - fusion_outputs.emplace_back(at::Tensor()); + + // TODO: would need to clean up this part when + // we have a unified and consistent way to generate + // size-0 tensors. + const auto tensor_options = + at::TensorOptions().dtype(at::kFloat).device(device); + fusion_outputs.emplace_back(at::empty({0}, tensor_options)); } } diff --git a/torch/csrc/jit/codegen/cuda/kernel_cache.h b/torch/csrc/jit/codegen/cuda/kernel_cache.h index ae84c25e4f2..cba42f99dc4 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_cache.h +++ b/torch/csrc/jit/codegen/cuda/kernel_cache.h @@ -7,8 +7,8 @@ #include #include +#include #include -#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp index 7421d2e235a..3605f7a4155 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.cpp @@ -1,7 +1,6 @@ #include #include -#include #include @@ -16,11 +15,11 @@ void ExpressionEvaluator::bind( Int::ScalarType concrete_value) { TORCH_CHECK(value->isScalar()); TORCH_CHECK(value->dtype() == DataType::Int); - TORCH_CHECK(!value->isConst(), "Tried to bind to a constant value"); + TORCH_CHECK(!value->isConstScalar(), "Tried to bind to a constant value"); TORCH_CHECK( value->definition() == nullptr, "Tried to bind to a value that is computed in the kernel IR: ", - toString(value), + value->toString(), " with ", concrete_value); known_values_[value] = concrete_value; @@ -41,14 +40,18 @@ void ExpressionEvaluator::bind( c10::optional ExpressionEvaluator::evaluate(const Val* value) { if (precomputed_integers_ && precomputed_integers_->ready()) { - return precomputed_integers_->getMaybeValueFor(value); - } else if (value->isScalar() && value->isConst()) { + if (precomputed_integers_->getMaybeValueFor(value).has_value()) { + return precomputed_integers_->getMaybeValueFor(value); + } + } + + if (value->isScalar() && value->isConst()) { return value->as()->value(); } else { FUSER_PERF_SCOPE("kir::ExpressionEvaluator::evaluate"); - TORCH_CHECK(value->isScalar()); - TORCH_CHECK(value->dtype() == DataType::Int); + TORCH_CHECK(value->isScalar(), value->toString()); + TORCH_CHECK(value->dtype() == DataType::Int, value->toString()); // Is the value known (either explicit binding or memoized)? 
const auto pre_eval_it = known_values_.find(value); @@ -56,7 +59,7 @@ c10::optional ExpressionEvaluator::evaluate(const Val* value) { return pre_eval_it->second; } - value->accept(this); + OptOutConstDispatch::handle(value); const auto post_eval_it = known_values_.find(value); return post_eval_it != known_values_.end() @@ -74,24 +77,23 @@ void ExpressionEvaluator::print() const { std::cout << "\nEvaluation context\n"; std::cout << "--------------------\n"; for (const auto& kv : known_values_) { - std::cout << toString(kv.first) << " = " << kv.second << "\n"; + std::cout << kv.first->toString() << " = " << kv.second << "\n"; + } + std::cout << "\nPre-computed Values\n"; + if (precomputed_integers_ != nullptr) { + precomputed_integers_->print(); } std::cout << "--------------------\n\n"; } -void ExpressionEvaluator::unhandled(const void*) { - TORCH_INTERNAL_ASSERT( - false, "Kernel IR expression evaluation reached an unsupported node"); -} - -void ExpressionEvaluator::visit(const Int* value) { +void ExpressionEvaluator::handle(const Int* value) { TORCH_INTERNAL_ASSERT(!value->isConst()); if (auto def = value->definition()) { - def->accept(this); + OptOutConstDispatch::handle(def); } } -void ExpressionEvaluator::visit(const NamedScalar* named_scalar) { +void ExpressionEvaluator::handle(const NamedScalar* named_scalar) { const auto& name = named_scalar->name(); for (auto pt : kParallelTypeThreads) { auto pt_val_it = known_parallel_dimensions_.find(pt); @@ -105,10 +107,10 @@ void ExpressionEvaluator::visit(const NamedScalar* named_scalar) { } } -void ExpressionEvaluator::visit(const UnaryOp* unary_op) { +void ExpressionEvaluator::handle(const UnaryOp* unary_op) { const auto in = evaluate(unary_op->in()); if (in.has_value()) { - switch (unary_op->operation()) { + switch (unary_op->getUnaryOpType()) { case UnaryOpType::Neg: known_values_[unary_op->out()] = -*in; break; @@ -121,11 +123,11 @@ void ExpressionEvaluator::visit(const UnaryOp* unary_op) { } } -void ExpressionEvaluator::visit(const BinaryOp* binary_op) { +void ExpressionEvaluator::handle(const BinaryOp* binary_op) { const auto lhs = evaluate(binary_op->lhs()); const auto rhs = evaluate(binary_op->rhs()); if (lhs.has_value() && rhs.has_value()) { - switch (binary_op->operation()) { + switch (binary_op->getBinaryOpType()) { case BinaryOpType::Add: known_values_[binary_op->out()] = *lhs + *rhs; break; diff --git a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h index 64791387543..63586857ad8 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h +++ b/torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h @@ -1,7 +1,9 @@ #pragma once -#include +#include + +#include #include #include @@ -34,7 +36,7 @@ namespace kir { //! } //! ``` //! -class TORCH_CUDA_CU_API ExpressionEvaluator : private IrVisitor { +class TORCH_CUDA_CU_API ExpressionEvaluator : private OptInConstDispatch { public: //! 
Set a concrete value for a symbolic value void bind(const Val* value, Int::ScalarType concrete_value); @@ -56,11 +58,10 @@ class TORCH_CUDA_CU_API ExpressionEvaluator : private IrVisitor { } private: - void unhandled(const void*) final; - void visit(const Int* value) final; - void visit(const NamedScalar* named_scalar) final; - void visit(const UnaryOp* unary_op) final; - void visit(const BinaryOp* binary_op) final; + void handle(const Int* value) final; + void handle(const NamedScalar* named_scalar) final; + void handle(const UnaryOp* unary_op) final; + void handle(const BinaryOp* binary_op) final; private: std::unordered_map known_values_; diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp index eebfd41729c..5d2eb44f8a8 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.cpp +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.cpp @@ -1,8 +1,7 @@ +#include #include #include #include -#include -#include #include #include #include @@ -15,369 +14,52 @@ namespace fuser { namespace cuda { namespace kir { -void Node::print() const { - std::cout << "\n"; - IrPrinter(std::cout).printNode(this); - std::cout << "\n"; +Predicate::Predicate( + IrBuilderPasskey passkey, + PredicateType ptype, + const Expr* expr, + Bool* thread_pred) + : Val(passkey, ValType::Predicate, DataType::Bool), + ptype_(ptype), + expr_(expr), + thread_pred_(thread_pred) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); + TORCH_INTERNAL_ASSERT( + ptype != PredicateType::Unswitch && ptype != PredicateType::Manual); } -Val::Val(Passkey passkey, DataType dtype) : Node(passkey), dtype_(dtype) { - // NOLINTNEXTLINE: https://bugs.llvm.org/show_bug.cgi?id=48534 - id_ = passkey.kernel->newValueId(passkey); +Predicate::Predicate(IrBuilderPasskey passkey, ForLoop* unrolled_loop) + : Val(passkey, ValType::Predicate, DataType::Bool), + ptype_(PredicateType::Unswitch), + unrolled_loop_(unrolled_loop) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); + TORCH_INTERNAL_ASSERT(unrolled_loop != nullptr); } -namespace { - -// Traverse definition of all values involved in constructing the provided val. -// Check if all values involved are constant values, meaning the provided -// val is also a constant value. 
-class ConstCheck : IrVisitor { - private: - bool is_const_ = true; - - using IrVisitor::visit; - - void visit(const Bool* b) override { - is_const_ = is_const_ && b->isConst(); - } - - void visit(const Double* d) override { - is_const_ = is_const_ && d->isConst(); - } - - void visit(const Int* i) override { - is_const_ = is_const_ && i->isConst(); - } - - void visit(const NamedScalar* ns) override { - is_const_ = is_const_ && false; - } - - void visit(const Expr* expr) { - for (auto inp : expr->inputs()) { - visit(inp); - } - } - - void visit(const Val* val) { - if (val->definition() != nullptr) { - visit(val->definition()); - } else { - val->accept(this); - } - } - - public: - static bool isConst(const Val* val) { - ConstCheck cc; - cc.visit(val); - return cc.is_const_; - } -}; - -} // namespace - -bool Val::isConstScalar() const { - if (!isScalar()) - return false; - return ConstCheck::isConst(this); -} - -Expr* Expr::parentScope() const { - if (scope()) { - return scope()->owner(); - } else { - return nullptr; - } -} - -NamedScalar* NamedScalar::getParallelDim(ParallelType p_type) { - std::string parallel_dim = stringifyThreadSize(p_type); - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - return ir_builder.create(parallel_dim, DataType::Int); -} - -NamedScalar* NamedScalar::getParallelIndex(ParallelType p_type) { - std::string parallel_ind = stringifyThread(p_type); - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - return ir_builder.create(parallel_ind, DataType::Int); -} - -c10::optional NamedScalar::getParallelDim() const { - if (stringifyThreadSize(ParallelType::TIDx).compare(name()) == 0) { - return c10::optional(ParallelType::TIDx); - } else if (stringifyThreadSize(ParallelType::TIDy).compare(name()) == 0) { - return c10::optional(ParallelType::TIDy); - } else if (stringifyThreadSize(ParallelType::TIDz).compare(name()) == 0) { - return c10::optional(ParallelType::TIDz); - } else if (stringifyThreadSize(ParallelType::BIDx).compare(name()) == 0) { - return c10::optional(ParallelType::BIDx); - } else if (stringifyThreadSize(ParallelType::BIDy).compare(name()) == 0) { - return c10::optional(ParallelType::BIDy); - } else if (stringifyThreadSize(ParallelType::BIDz).compare(name()) == 0) { - return c10::optional(ParallelType::BIDz); - } - return c10::nullopt; -} - -c10::optional NamedScalar::getParallelIndex() const { - if (stringifyThread(ParallelType::TIDx).compare(name()) == 0) { - return c10::optional(ParallelType::TIDx); - } else if (stringifyThread(ParallelType::TIDy).compare(name()) == 0) { - return c10::optional(ParallelType::TIDy); - } else if (stringifyThread(ParallelType::TIDz).compare(name()) == 0) { - return c10::optional(ParallelType::TIDz); - } else if (stringifyThread(ParallelType::BIDx).compare(name()) == 0) { - return c10::optional(ParallelType::BIDx); - } else if (stringifyThread(ParallelType::BIDy).compare(name()) == 0) { - return c10::optional(ParallelType::BIDy); - } else if (stringifyThread(ParallelType::BIDz).compare(name()) == 0) { - return c10::optional(ParallelType::BIDz); - } - return c10::nullopt; -} - -IterDomain::IterDomain(Passkey passkey, Val* start, Val* extent) - : Val(passkey, DataType::Int), - start_(start), - stop_(extent), - extent_(extent) {} - -IterDomain::IterDomain( - Passkey passkey, - const fuser::cuda::IterDomain* iter_domain) - : Val(passkey, iter_domain->getDataType().value()), - start_(GpuLower::current()->lowerValue(iter_domain->start())), - stop_(GpuLower::current()->lowerValue(iter_domain->stop())), - 
extent_(GpuLower::current()->lowerValue(iter_domain->extent())), - parallel_type_(iter_domain->getParallelType()), - iter_type_(iter_domain->getIterType()), - is_rfactor_domain_(iter_domain->isRFactorProduct()), - is_simple_(iter_domain->definition() == nullptr), - is_padded_dimension_(iter_domain->hasPaddingToMultipleOfWarp()) { - // preserve the fusion node's name - setName(iter_domain->name()); -} - -//! Note that the parallel dimension, if available, may be different -//! from the actual extent of this IterDomain as the parallel -//! dimension is determined by the largest extent of IterDomains -//! sharing the same loop. -Val* IterDomain::extent() const { - TORCH_INTERNAL_ASSERT(extent_ != nullptr); - return extent_; -} - -TensorDomain::TensorDomain(Passkey passkey, std::vector domain) - : Val(passkey, DataType::Null), root_domain_(std::move(domain)) { - domain_ = root_domain_; - resetDomains(); -} - -TensorDomain::TensorDomain( - Passkey passkey, - const fuser::cuda::TensorDomain* tensor_domain) - : Val(passkey, DataType::Null), contiguity_(tensor_domain->contiguity()) { - // preserve the fusion node's name - setName(tensor_domain->name()); - - const auto lowerIterDomains = - [](const std::vector& domains) { - std::vector lowered_domains; - lowered_domains.reserve(domains.size()); - for (const auto iter_domain : domains) { - lowered_domains.push_back( - GpuLower::current()->lowerValue(iter_domain)->as()); - } - return lowered_domains; - }; - - root_domain_ = lowerIterDomains(tensor_domain->getRootDomain()); - domain_ = lowerIterDomains(tensor_domain->domain()); - no_bcast_domain_ = lowerIterDomains(tensor_domain->noBroadcasts()); - no_reduction_domain_ = lowerIterDomains(tensor_domain->noReductions()); - rfactor_domain_ = lowerIterDomains(tensor_domain->getRFactorDomain()); -} - -bool TensorDomain::hasReduction() const { - return no_reduction_domain_.size() != domain_.size(); -} - -bool TensorDomain::hasBlockReduction() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isReduction() && id->isThreadDim(); - }); -} - -bool TensorDomain::hasGridReduction() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isReduction() && id->isBlockDim(); - }); -} - -bool TensorDomain::hasBlockBroadcast() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isBroadcast() && id->isThreadDim(); - }); -} - -bool TensorDomain::hasGridBroadcast() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->isBroadcast() && id->isBlockDim(); - }); -} - -bool TensorDomain::hasBroadcast() const { - return no_bcast_domain_.size() != domain_.size(); -} - -bool TensorDomain::hasRFactor() const { - return !rfactor_domain_.empty(); -} - -bool TensorDomain::hasVectorize() const { - return std::any_of(domain_.begin(), domain_.end(), [](IterDomain* id) { - return id->parallelType() == ParallelType::Vectorize || - id->parallelType() == ParallelType::MisalignedVectorize; - }); -} - -IterDomain* TensorDomain::axis(int i) const { - TORCH_INTERNAL_ASSERT(i >= 0 && i < int(domain_.size())); - return domain_[i]; -} - -std::vector TensorDomain::noReductions( - const std::vector& td) { - std::vector no_reduction_domains; - for (auto id : td) { - if (!id->isReduction()) { - no_reduction_domains.push_back(id); - } - } - return no_reduction_domains; -} - -std::vector TensorDomain::noBroadcasts( - const std::vector& td) { - std::vector no_broadcast_domains; - for (auto 
id : td) { - if (!id->isBroadcast()) { - no_broadcast_domains.push_back(id); - } - } - return no_broadcast_domains; -} - -TensorView::TensorView(Passkey passkey, const fuser::cuda::TensorView* tv) - : Val(passkey, tv->getDataType().value()), fuser_tv_(tv) { - setName(tv->name()); - domain_ = GpuLower::current()->lowerValue(tv->domain())->as(); - memory_type_ = tv->getMemoryType(); -} - -TensorView::TensorView( - Passkey passkey, - DataType dtype, - TensorDomain* domain, - MemoryType memory_type) - : Val(passkey, dtype), domain_(domain), memory_type_(memory_type) {} - -UnaryOp::UnaryOp(Passkey passkey, UnaryOpType operation, Val* out, Val* in) - : Expr(passkey), operation_(operation), out_(out), in_(in) { - addOutput(out); - addInput(in); -} - -BinaryOp::BinaryOp( - Passkey passkey, - BinaryOpType operation, - Val* out, - Val* lhs, - Val* rhs) - : Expr(passkey), operation_(operation), out_(out), lhs_(lhs), rhs_(rhs) { - addOutput(out); - addInput(lhs); - addInput(rhs); -} - -TernaryOp::TernaryOp( - Passkey passkey, - TernaryOpType operation, - Val* out, - Val* in1, - Val* in2, - Val* in3) - : Expr(passkey), - operation_(operation), - out_(out), - in1_(in1), - in2_(in2), - in3_(in3) { - addOutput(out); - addInput(in1); - addInput(in2); - addInput(in3); -} - -ReductionOp::ReductionOp( - Passkey passkey, - BinaryOpType operation, - Val* init, - Val* out, - Val* in) - : Expr(passkey), operation_(operation), init_(init), out_(out), in_(in) { - addOutput(out); - addInput(in); -} - -WelfordOp::WelfordOp( - Passkey passkey, - Val* out_var, - Val* out_avg, - Val* out_N, - Val* init_var, - Val* init_avg, - Val* init_N, - Val* in_var, - Val* in_avg, - Val* in_N) - : Expr(passkey), - out_var_(out_var), - out_avg_(out_avg), - out_N_(out_N), - init_var_(init_var), - init_avg_(init_avg), - init_N_(init_N), - in_var_(in_var), - in_avg_(in_avg), - in_N_(in_N) { - addOutput(out_avg); - addOutput(out_var); - addOutput(out_N); - - if (!in_N->isOneInt()) { - addInput(in_var); - } - addInput(in_avg); - addInput(in_N); -} - -BroadcastOp::BroadcastOp(Passkey passkey, Val* out, Val* in) - : Expr(passkey), out_(out), in_(in) { - TORCH_CHECK(in->isA() || in->isA()); - TORCH_CHECK(out->isA() || out->isA()); - addOutput(out); - addInput(in); +Predicate::Predicate(IrBuilderPasskey passkey, Bool* value) + : Val(passkey, ValType::Predicate, DataType::Bool), + ptype_(PredicateType::Manual), + value_(value) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); + TORCH_INTERNAL_ASSERT(value != nullptr); } TensorIndex::TensorIndex( - Passkey passkey, - const fuser::cuda::TensorView* view, + IrBuilderPasskey passkey, + const TensorView* view, std::vector indices) - : Val(passkey, view->getDataType().value()), - view_(GpuLower::current()->lowerValue(view)->as()), + : Val(passkey, ValType::TensorIndex, view->getDataType().value()), + view_(view), indices_(indices) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); TORCH_INTERNAL_ASSERT( std::all_of( indices.begin(), @@ -392,20 +74,33 @@ TensorIndex::TensorIndex( indices_.end()); // If indices becomes empty, just put one ZeroInt if (indices_.empty()) { - indices_.push_back(kir::IrBuilder(GpuLower::current()->kernel()).zeroVal()); + indices_.push_back(FusionGuard::getCurFusion()->zeroVal()); } } -Sync::Sync(Passkey passkey, bool war_sync) - : Expr(passkey), war_sync_(war_sync) {} +Sync::Sync(IrBuilderPasskey passkey, bool war_sync) + : Expr(passkey, ExprType::Sync), 
war_sync_(war_sync) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} -InitMagicZero::InitMagicZero(Passkey passkey) : Expr(passkey) {} +InitMagicZero::InitMagicZero(IrBuilderPasskey passkey) + : Expr(passkey, ExprType::InitMagicZero) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} -UpdateMagicZero::UpdateMagicZero(Passkey passkey) : Expr(passkey) {} +UpdateMagicZero::UpdateMagicZero(IrBuilderPasskey passkey) + : Expr(passkey, ExprType::UpdateMagicZero) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} void Scope::insert(std::vector::const_iterator pos, Expr* expr) { exprs_.insert(pos, expr); - expr->setScope(this); } void Scope::insert_before(Expr* ref, Expr* expr) { @@ -440,11 +135,6 @@ void Scope::insert(size_t pos, Expr* expr) { void Scope::erase(std::vector::const_iterator pos) { // Remove the scope of the expr if this is the scope auto expr = *pos; - TORCH_INTERNAL_ASSERT( - expr->scope() == this, - "Inconsistent scoping of expression detected: ", - kir::toString(expr)); - expr->setScope(nullptr); exprs_.erase(pos); } @@ -470,7 +160,7 @@ void Scope::clear() { } ForLoop::ForLoop( - Passkey passkey, + IrBuilderPasskey passkey, IterDomain* iter_domain, Val* index, Val* start, @@ -479,7 +169,7 @@ ForLoop::ForLoop( bool vectorize, Val* vectorize_shift, bool unroll_required) - : Expr(passkey), + : Expr(passkey, ExprType::ForLoop), iter_domain_{iter_domain}, index_(index), start_(start), @@ -489,43 +179,42 @@ ForLoop::ForLoop( vectorize_shift_(vectorize_shift), unroll_required_(unroll_required), body_(this) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); TORCH_INTERNAL_ASSERT(index->dtype() == DataType::Int); addInput(index); addInput(iter_domain); if (start_ == nullptr && iter_domain->isThread()) { - start_ = - IrBuilder(GpuLower::current()->kernel()) - .create( - stringifyThread(iter_domain->parallelType()), DataType::Int); + start_ = NamedScalar::getParallelIndex(iter_domain->getParallelType()); } if (step_ == nullptr) { if (iter_domain->isThread()) { - step_ = IrBuilder(GpuLower::current()->kernel()) - .create( - stringifyThreadSize(iter_domain->parallelType()), - DataType::Int); + step_ = NamedScalar::getParallelDim(iter_domain->getParallelType()); } else { - step_ = IrBuilder(GpuLower::current()->kernel()).oneVal(); + step_ = FusionGuard::getCurFusion()->oneVal(); } } } -ForLoop::ForLoop(Passkey passkey, IterDomain* iter_domain) +ForLoop::ForLoop(IrBuilderPasskey passkey, IterDomain* iter_domain) : ForLoop( passkey, iter_domain, - iter_domain->isBroadcast() - ? IrBuilder(GpuLower::current()->kernel()).zeroVal() - : IrBuilder(GpuLower::current()->kernel()) - .create(c10::nullopt), + iter_domain->isBroadcast() ? 
FusionGuard::getCurFusion()->zeroVal() + : IrBuilder::create(c10::nullopt), nullptr, nullptr, nullptr, - isParallelTypeVectorize(iter_domain->parallelType()), + isParallelTypeVectorize(iter_domain->getParallelType()), nullptr, - false) {} + false) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} -ForLoop::ForLoop(Passkey passkey, const ForLoop* other) +ForLoop::ForLoop(IrBuilderPasskey passkey, const ForLoop* other) : ForLoop( passkey, other->iter_domain(), @@ -535,7 +224,11 @@ ForLoop::ForLoop(Passkey passkey, const ForLoop* other) other->step(), other->vectorize(), other->vectorize_shift(), - other->isUnrollRequired()) {} + other->isUnrollRequired()) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} bool ForLoop::isUnrollable() const { // Start and stop must be constant, must not be a broadcast @@ -550,7 +243,7 @@ bool ForLoop::isUnrolled() const { if (isUnrollRequired() && !isUnrollable()) { TORCH_WARN( "Unroll required but not possible. Register allocation disabled. Loop index: ", - kir::toString(index_)); + index_->toString()); return false; } @@ -570,7 +263,7 @@ bool ForLoop::isUnrolled() const { } // Unrolling is technically possible but avoided - if (iter_domain()->parallelType() == ParallelType::Unswitch) { + if (iter_domain()->getParallelType() == ParallelType::Unswitch) { // Use ParallelType::Unroll if unrolling is desired. Note that // unswitched size-one loops are not unrolled as they are not // materialized as actual for-loops. @@ -605,8 +298,8 @@ Val* ForLoop::step() const { return step_; } -IfThenElse::IfThenElse(Passkey passkey, Predicate* cond) - : Expr(passkey), then_body_(this), else_body_(this) { +IfThenElse::IfThenElse(IrBuilderPasskey passkey, Predicate* cond) + : Expr(passkey, ExprType::IfThenElse), then_body_(this), else_body_(this) { setPredicate(cond); addInput(cond); } @@ -621,17 +314,19 @@ Val* TensorIndex::index(int i) const { } Allocate::Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, std::vector shape, bool zero_init) - : Expr(passkey), + : Expr(passkey, ExprType::Allocate), buffer_(buffer), memory_type_(memory_type), shape_(std::move(shape)), zero_init_(zero_init) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); if (!shape_.empty()) { TORCH_INTERNAL_ASSERT( (shape_.size() == 1 && shape_[0]->isOneInt()) || @@ -639,7 +334,7 @@ Allocate::Allocate( } else { TORCH_INTERNAL_ASSERT(buffer_->isA()); TORCH_INTERNAL_ASSERT( - buffer_->as()->memoryType() == memory_type_); + buffer_->as()->getMemoryType() == memory_type_); const auto domain = buffer_->as()->domain(); for (auto axis : domain->noReductions()) { shape_.push_back(axis->extent()); @@ -650,19 +345,19 @@ Allocate::Allocate( if (size_ == nullptr) { size_ = s; } else { - size_ = ir_builder.mulExpr(size_, s); + size_ = IrBuilder::mulExpr(size_, s); } } if (size_ == nullptr) { - size_ = ir_builder.oneVal(); + size_ = FusionGuard::getCurFusion()->oneVal(); } addInput(size_); } Allocate::Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, Val* size, @@ -672,31 +367,57 @@ Allocate::Allocate( buffer, memory_type, size == nullptr ? 
std::vector{} : std::vector{size}, - zero_init) {} + zero_init) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} GridReduction::GridReduction( - Passkey passkey, + IrBuilderPasskey passkey, ReductionOp* reduction_op, Allocate* reduction_buffer, Allocate* sync_buffer) - : Expr(passkey), + : Expr(passkey, ExprType::GridReduction), reduction_op_(reduction_op), reduction_buffer_(reduction_buffer), - sync_buffer_(sync_buffer) {} + sync_buffer_(sync_buffer) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} + +GridBroadcast::GridBroadcast( + IrBuilderPasskey passkey, + BroadcastOp* broadcast_op, + Allocate* broadcast_buffer, + Allocate* sync_buffer) + : Expr(passkey, ExprType::GridBroadcast), + broadcast_op_(broadcast_op), + broadcast_buffer_(broadcast_buffer), + sync_buffer_(sync_buffer) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} GridWelford::GridWelford( - Passkey passkey, + IrBuilderPasskey passkey, WelfordOp* welford_op, Allocate* var_buffer, Allocate* avg_buffer, Allocate* n_buffer, Allocate* sync_buffer) - : Expr(passkey), + : Expr(passkey, ExprType::GridWelford), welford_op_(welford_op), var_buffer_(var_buffer), avg_buffer_(avg_buffer), n_buffer_(n_buffer), - sync_buffer_(sync_buffer) {} + sync_buffer_(sync_buffer) { + TORCH_INTERNAL_ASSERT( + passkey.ir_container_->isA(), + "IR type only valid for Kernel container."); +} } // namespace kir } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir.h b/torch/csrc/jit/codegen/cuda/kernel_ir.h index c1ac6052783..ad6be90bf98 100644 --- a/torch/csrc/jit/codegen/cuda/kernel_ir.h +++ b/torch/csrc/jit/codegen/cuda/kernel_ir.h @@ -1,16 +1,13 @@ #pragma once +#include +#include +#include #include #include -// TODO(kir): remove these once the Kernel IR is separated from Fusion IR -#include -#include -#include -#include - +#include #include -#include #include #include @@ -21,26 +18,22 @@ namespace torch { namespace jit { namespace fuser { namespace cuda { -namespace kir { -class IrBuilder; -class Kernel; +class IrBuilderPasskey; // Abstract nodes -class Node; class Val; class Expr; // Values -class NamedScalar; -class Predicate; class Bool; class Double; class Int; +class NamedScalar; + class IterDomain; class TensorDomain; class TensorView; -class TensorIndex; // Expressions class UnaryOp; @@ -50,7 +43,14 @@ class ReductionOp; class WelfordOp; class BroadcastOp; -// Statements +namespace kir { +class Kernel; + +// Values +class Predicate; +class TensorIndex; + +// Expressions class Allocate; class Sync; class InitMagicZero; @@ -64,443 +64,17 @@ class GridWelford; // Expr container class Scope; -using ValueId = int32_t; - -//! Token used to restrict the access to Kernel IR creation -//! -//! A token is associated with a kernel, which is passed with the key -//! (Passkey::kernel) -//! -//! It is a "granular friendship" token, used to implement the "passkey" idiom: -//! https://www.spiria.com/en/blog/desktop-software/passkey-idiom-and-better-friendship-c -//! https://arne-mertz.de/2016/10/passkey-idiom -//! -class Passkey { - friend class IrBuilder; - - public: - Kernel* const kernel = nullptr; - - private: - explicit Passkey(Kernel* kernel) : kernel(kernel) {} -}; - -//! 
Kernel IR visitor interface -class TORCH_CUDA_CU_API IrVisitor : public PolymorphicBase { - public: - // TODO(kir): use Node* instead of void* - virtual void unhandled(const void* node) {} - - // Values - virtual void visit(const NamedScalar* named_scalar) { - unhandled(named_scalar); - } - virtual void visit(const Predicate* value) { - unhandled(value); - } - virtual void visit(const Bool* value) { - unhandled(value); - } - virtual void visit(const Double* value) { - unhandled(value); - } - virtual void visit(const Int* value) { - unhandled(value); - } - virtual void visit(const IterDomain* iter_domain) { - unhandled(iter_domain); - } - virtual void visit(const TensorDomain* tensor_domain) { - unhandled(tensor_domain); - } - virtual void visit(const TensorView* tensor_view) { - unhandled(tensor_view); - } - virtual void visit(const TensorIndex* tensor_index) { - unhandled(tensor_index); - } - - // Expressions - virtual void visit(const UnaryOp* node) { - unhandled(node); - } - virtual void visit(const BinaryOp* node) { - unhandled(node); - } - virtual void visit(const TernaryOp* node) { - unhandled(node); - } - virtual void visit(const ReductionOp* node) { - unhandled(node); - } - virtual void visit(const WelfordOp* node) { - unhandled(node); - } - virtual void visit(const BroadcastOp* node) { - unhandled(node); - } - - // Statements - virtual void visit(const Allocate* node) { - unhandled(node); - } - virtual void visit(const Sync* node) { - unhandled(node); - } - virtual void visit(const InitMagicZero* node) { - unhandled(node); - } - virtual void visit(const UpdateMagicZero* node) { - unhandled(node); - } - virtual void visit(const ForLoop* node) { - unhandled(node); - } - virtual void visit(const IfThenElse* node) { - unhandled(node); - } - virtual void visit(const GridReduction* node) { - unhandled(node); - } - virtual void visit(const GridBroadcast* node) { - unhandled(node); - } - virtual void visit(const GridWelford* node) { - unhandled(node); - } -}; - -//! 
Kernel IR visitor interface -class TORCH_CUDA_CU_API MutableIrVisitor : public PolymorphicBase { - public: - // TODO(kir): use Node* instead of void* - virtual void unhandled(const void*) {} - - // Values - virtual void visit(NamedScalar* named_scalar) { - unhandled(named_scalar); - } - virtual void visit(Predicate* value) { - unhandled(value); - } - virtual void visit(Bool* value) { - unhandled(value); - } - virtual void visit(Double* value) { - unhandled(value); - } - virtual void visit(Int* value) { - unhandled(value); - } - virtual void visit(IterDomain* iter_domain) { - unhandled(iter_domain); - } - virtual void visit(TensorDomain* tensor_domain) { - unhandled(tensor_domain); - } - virtual void visit(TensorView* tensor_view) { - unhandled(tensor_view); - } - virtual void visit(TensorIndex* tensor_index) { - unhandled(tensor_index); - } - - // Expressions - virtual void visit(UnaryOp* node) { - unhandled(node); - } - virtual void visit(BinaryOp* node) { - unhandled(node); - } - virtual void visit(TernaryOp* node) { - unhandled(node); - } - virtual void visit(ReductionOp* node) { - unhandled(node); - } - virtual void visit(BroadcastOp* node) { - unhandled(node); - } - - virtual void visit(WelfordOp* node) { - unhandled(node); - } - - // Statements - virtual void visit(Allocate* node) { - unhandled(node); - } - virtual void visit(Sync* node) { - unhandled(node); - } - virtual void visit(InitMagicZero* node) { - unhandled(node); - } - virtual void visit(UpdateMagicZero* node) { - unhandled(node); - } - virtual void visit(ForLoop* node) { - unhandled(node); - } - virtual void visit(IfThenElse* node) { - unhandled(node); - } - virtual void visit(GridReduction* node) { - unhandled(node); - } - virtual void visit(GridBroadcast* node) { - unhandled(node); - } - virtual void visit(GridWelford* node) { - unhandled(node); - } -}; - -//! Base class for Kernel IR nodes -class TORCH_CUDA_CU_API Node : public NonCopyable, public PolymorphicBase { - public: - explicit Node(Passkey) {} - - //! IR Visitor double-dispatch interface - //! (https://en.wikipedia.org/wiki/Visitor_pattern) - virtual void accept(IrVisitor* visitor) const = 0; - - //! Non constant IR Visitor - virtual void accept(MutableIrVisitor* visitor) = 0; - - //! Debug helper, prints the textual representation of an IR node - void print() const; -}; - -//! Generic value (scalar or tensor) -class TORCH_CUDA_CU_API Val : public Node { - public: - Val(Passkey passkey, DataType dtype); - - // TODO(kir): consider renaming - StmtNameType name() const { - return name_; - } - - void setName(StmtNameType name) { - name_ = name; - } - - ValueId id() const { - return id_; - } - - DataType dtype() const { - return dtype_; - } - - Expr* definition() const { - return definition_; - } - - void setDefinition(Expr* expr) { - // TODO(kir): extra checks on changing existing definitions? 
- definition_ = expr; - } - - virtual bool isScalar() const { - return false; - } - - bool isConstScalar() const; - - virtual bool isConst() const { - return false; - } - - // TODO(kir): revisit and find a better interface - virtual bool isZeroInt() const { - return false; - } - - virtual bool isOneInt() const { - return false; - } - - void setEvaluatorIndex(int to) { - TORCH_INTERNAL_ASSERT(evaluator_index_ == -1); - evaluator_index_ = to; - } - - int evaluatorIndex() const { - return evaluator_index_; - } - - private: - const DataType dtype_; - - // The expression which defines this value, or nullptr - Expr* definition_ = nullptr; - - // This is a value name preserved from the Fusion IR (optional) - StmtNameType name_ = kInvalidStmName; - - // All Kernel IR values have IDs (unique within the same Kernel) - ValueId id_ = -1; - - // Expr evaluator idx; - int evaluator_index_ = -1; -}; - -//! Base class for expressions and statements -//! -//! Expressions consume inputs and produce outputs (depending on the context -//! this may imply assignments). Currently some of the expressions -//! don't actually produce any outputs (ForLoop, IfThenElse) and they -//! model statements to be executed. -//! -//! TODO(kir): split the expressions, assignments and statements? -//! -class TORCH_CUDA_CU_API Expr : public Node { - public: - explicit Expr(Passkey passkey) : Node(passkey) {} - - const auto& inputs() const { - return inputs_; - } - - const auto& outputs() const { - return outputs_; - } - - Scope* scope() const { - return scope_; - } - - //! Set the current scope - void setScope(Scope* scope) { - scope_ = scope; - } - - Expr* parentScope() const; - - Predicate* predicate() const { - return predicate_; - } - - void setPredicate(Predicate* predicate) { - predicate_ = predicate; - } - - Predicate* writePredicate() const { - return write_predicate_; - } - - void setWritePredicate(Predicate* write_predicate) { - write_predicate_ = write_predicate; - } - - protected: - // TODO(kir): try to avoid this protected interface - void addInput(Val* input) { - inputs_.push_back(input); - } - - void addOutput(Val* output) { - output->setDefinition(this); - outputs_.push_back(output); - } - - private: - // TODO(kir): can we avoid this? - std::vector inputs_; - std::vector outputs_; - - // TODO(kir): revisit scope/nesting data structures - Scope* scope_ = nullptr; - - Predicate* predicate_ = nullptr; - // Only used for reduction-related expressions - Predicate* write_predicate_ = nullptr; -}; - -class TORCH_CUDA_CU_API NamedScalar final : public Val { - public: - // NOLINTNEXTLINE(modernize-pass-by-value) - NamedScalar(Passkey passkey, std::string name, DataType dtype) - : Val(passkey, dtype), name_(name) {} - - explicit NamedScalar(Passkey passkey, const fuser::cuda::NamedScalar* node) - : Val(passkey, node->getDataType().value()) { - name_ = node->name(); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - // TODO(kir): this is hiding and redefining Val::name() - const std::string& name() const { - return name_; - } - - // Return the named scalar extent of a parallel dimension (e.g. blockDim.x) - static NamedScalar* getParallelDim(ParallelType p_type); - - // Return the named scalar index of a parallel dimension (e.g. 
threadIdx.x) - static NamedScalar* getParallelIndex(ParallelType p_type); - - // Return the parallel type of this NamedScalar if it is an extent of a - // parallel dimension - c10::optional getParallelDim() const; - - // Return the parallel type of this NamedScalar if it is an index of a - // parallel dimension - c10::optional getParallelIndex() const; - - private: - std::string name_; -}; - class TORCH_CUDA_CU_API Predicate final : public Val { public: explicit Predicate( - Passkey passkey, + IrBuilderPasskey passkey, PredicateType ptype, const Expr* expr = nullptr, - Bool* thread_pred = nullptr) - : Val(passkey, DataType::Bool), - ptype_(ptype), - expr_(expr), - thread_pred_(thread_pred) { - TORCH_INTERNAL_ASSERT( - ptype != PredicateType::Unswitch && ptype != PredicateType::Manual); - } + Bool* thread_pred = nullptr); - explicit Predicate(Passkey passkey, ForLoop* unrolled_loop) - : Val(passkey, DataType::Bool), - ptype_(PredicateType::Unswitch), - unrolled_loop_(unrolled_loop) { - TORCH_INTERNAL_ASSERT(unrolled_loop != nullptr); - } + explicit Predicate(IrBuilderPasskey passkey, ForLoop* unrolled_loop); - explicit Predicate(Passkey passkey, Bool* value) - : Val(passkey, DataType::Bool), - ptype_(PredicateType::Manual), - value_(value) { - TORCH_INTERNAL_ASSERT(value != nullptr); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit Predicate(IrBuilderPasskey passkey, Bool* value); PredicateType predicate_type() const { return ptype_; @@ -543,6 +117,10 @@ class TORCH_CUDA_CU_API Predicate final : public Val { value_ = value; } + bool isConst() const final { + return hasValue() && value_->isConst(); + } + private: PredicateType ptype_ = PredicateType::Manual; @@ -561,603 +139,13 @@ class TORCH_CUDA_CU_API Predicate final : public Val { Bool* value_ = nullptr; }; -class TORCH_CUDA_CU_API Bool final : public Val { - public: - explicit Bool(Passkey passkey, const c10::optional& value) - : Val(passkey, DataType::Bool), maybe_value_(value) {} - - explicit Bool(Passkey passkey, const fuser::cuda::Bool* node) - : Val(passkey, DataType::Bool), maybe_value_(node->value()) { - setName(node->name()); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - bool isConst() const override { - return maybe_value_.has_value(); - } - - c10::optional value() const { - return maybe_value_; - } - - private: - const c10::optional maybe_value_; -}; - -class TORCH_CUDA_CU_API Double final : public Val { - public: - using ScalarType = double; - - explicit Double(Passkey passkey, const c10::optional& value) - : Val(passkey, DataType::Double), maybe_value_(value) {} - - explicit Double(Passkey passkey, const fuser::cuda::Double* node) - : Val(passkey, DataType::Double), maybe_value_(node->value()) { - setName(node->name()); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - bool isConst() const override { - return maybe_value_.has_value(); - } - - c10::optional value() const { - return maybe_value_; - } - - private: - const c10::optional maybe_value_; -}; - -class TORCH_CUDA_CU_API Int final : public Val { - public: - using ScalarType = int64_t; - - 
explicit Int(Passkey passkey, const c10::optional& value) - : Val(passkey, DataType::Int), maybe_value_(value) {} - - // SFINAE constructor to avoid 0 constant pointer ambiguity - template < - typename T, - typename = typename std::enable_if< - std::is_pointer::value && - std::is_convertible::value>::type> - explicit Int(Passkey passkey, T node) - : Val(passkey, DataType::Int), maybe_value_(node->value()) { - setName(node->name()); - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isScalar() const override { - return true; - } - - bool isConst() const override { - return maybe_value_.has_value(); - } - - bool isZeroInt() const override { - return maybe_value_.has_value() && *maybe_value_ == 0; - } - - bool isOneInt() const override { - return maybe_value_.has_value() && *maybe_value_ == 1; - } - - c10::optional value() const { - return maybe_value_; - } - - private: - const c10::optional maybe_value_; -}; - -class TORCH_CUDA_CU_API IterDomain final : public Val { - public: - IterDomain(Passkey passkey, Val* start, Val* extent); - - explicit IterDomain(Passkey, const fuser::cuda::IterDomain* iter_domain); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - bool isReduction() const { - return iterType() == IterType::Reduction; - } - - bool isRFactorProduct() const { - return is_rfactor_domain_; - } - - bool isBroadcast() const { - return iterType() == IterType::BroadcastWithStride || - iterType() == IterType::BroadcastWithoutStride; - } - - bool isGather() const { - return iterType() == IterType::Gather; - } - - bool isStride() const { - return iterType() == IterType::Stride; - } - - bool isParallelized() const { - return parallelType() != ParallelType::Serial; - } - - // Return if this iter domain is mapped to a grid dimension - bool isBlockDim() const { - return parallelType() == ParallelType::BIDz || - parallelType() == ParallelType::BIDy || - parallelType() == ParallelType::BIDx; - } - - // Return if this iter domain is mapped to a block dimension - bool isThreadDim() const { - return parallelType() == ParallelType::TIDz || - parallelType() == ParallelType::TIDy || - parallelType() == ParallelType::TIDx; - } - - // Return if this iter domain is either mapped to a block or grid dimension - bool isThread() const { - return isBlockDim() || isThreadDim(); - } - - ParallelType parallelType() const { - return parallel_type_; - } - - IterType iterType() const { - return iter_type_; - } - - Val* start() const { - return start_; - } - - Val* stop() const { - return stop_; - } - - Val* extent() const; - - bool isSimple() const { - return is_simple_; - } - - bool hasPaddingToMultipleOfWarp() const { - return is_padded_dimension_; - } - - private: - Val* const start_ = nullptr; - Val* const stop_ = nullptr; - Val* const extent_ = nullptr; - ParallelType parallel_type_ = ParallelType::Serial; - IterType iter_type_ = IterType::Iteration; - bool is_rfactor_domain_ = false; - - // An IterDomain is "simple" if the original Fusion IterDomain - // doesn't have a definition ("definition" expression) - // - // TODO(kir): this feels like a hack, revisit - // - bool is_simple_ = true; - - //! Indicates if this iterdomain is a padded parallel dimension - bool is_padded_dimension_ = false; -}; - -// TODO(kir): is this really a value? 
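The SFINAE constructor in the removed kir::Int above exists so that a literal 0 cannot silently bind to the pointer overload instead of the value overload. A rough, self-contained illustration of the same idiom, using invented Scalar/Node stand-ins rather than the real kir classes:

#include <cstdint>
#include <optional>
#include <type_traits>

struct Node {
  int64_t value() const { return 0; }
};

struct Scalar {
  // Value constructor: Scalar{int64_t{3}} and Scalar{0} land here.
  explicit Scalar(std::optional<int64_t> value) : maybe_value_(value) {}

  // Enabled only for pointer types convertible to const Node*, so a literal 0
  // can never be treated as a null Node* by overload resolution.
  template <
      typename T,
      typename = std::enable_if_t<
          std::is_pointer<T>::value &&
          std::is_convertible<T, const Node*>::value>>
  explicit Scalar(T node) : maybe_value_(node->value()) {}

  std::optional<int64_t> maybe_value_;
};

// Scalar a{int64_t{3}};  // value constructor
// Scalar b{new Node{}};  // pointer constructor (ownership ignored in this toy example)

With the enable_if constraint, the templated constructor only participates in overload resolution for genuine pointer arguments, so a zero literal reliably selects the value constructor.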
-class TORCH_CUDA_CU_API TensorDomain final : public Val { - public: - explicit TensorDomain(Passkey, std::vector domain); - - explicit TensorDomain( - Passkey passkey, - const fuser::cuda::TensorDomain* tensor_domain); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - std::vector::size_type nDims() const { - return domain_.size(); - } - - // TODO(kir): rename this - const std::vector& domain() const { - return domain_; - } - - const std::vector& contiguity() const { - return contiguity_; - } - - std::string getContiguityString() const { - std::stringstream ss; - for (auto b : contiguity()) { - ss << (b ? "t" : "f"); - } - return ss.str(); - } - - bool hasReduction() const; - bool hasBlockReduction() const; - bool hasGridReduction() const; - bool hasBlockBroadcast() const; - bool hasGridBroadcast() const; - bool hasBroadcast() const; - bool hasRFactor() const; - bool hasVectorize() const; - - const std::vector& noReductions() const { - return no_reduction_domain_; - } - - const std::vector& noBroadcasts() const { - return no_bcast_domain_; - } - - const std::vector& rootDomain() const { - return root_domain_; - }; - - const std::vector& rfactorDomain() const { - return rfactor_domain_; - }; - - void resetDomains() { - no_reduction_domain_ = noReductions(domain_); - no_bcast_domain_ = noBroadcasts(domain_); - } - - IterDomain* axis(int i) const; - - // TODO(kir): overloading non-static and static methods is not a good idea - static std::vector noReductions(const std::vector&); - static std::vector noBroadcasts(const std::vector&); - - private: - std::vector root_domain_; - std::vector domain_; - std::vector no_bcast_domain_; - std::vector no_reduction_domain_; - std::vector rfactor_domain_; - const std::vector contiguity_; -}; - -class TORCH_CUDA_CU_API TensorView final : public Val { - public: - explicit TensorView(Passkey, const fuser::cuda::TensorView* tv); - - TensorView( - Passkey, - DataType dtype, - TensorDomain* domain, - MemoryType memory_type); - - TensorDomain* domain() const { - return domain_; - } - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - MemoryType memoryType() const { - return memory_type_; - } - - fuser::cuda::TensorView* fuserTv() const { - TORCH_INTERNAL_ASSERT(fuser_tv_ != nullptr); - // TODO(kir): remove the need for const_cast - return const_cast(fuser_tv_); // NOLINT - } - - private: - TensorDomain* domain_ = nullptr; - MemoryType memory_type_ = MemoryType::Local; - - // TODO(kir): remove temporary hack - const fuser::cuda::TensorView* fuser_tv_ = nullptr; -}; - -class TORCH_CUDA_CU_API UnaryOp final : public Expr { - public: - UnaryOp(Passkey passkey, UnaryOpType operation, Val* out, Val* in); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in() const { - return in_; - } - - UnaryOpType operation() const { - return operation_; - } - - private: - const UnaryOpType operation_; - Val* const out_ = nullptr; - Val* const in_ = nullptr; -}; - -class TORCH_CUDA_CU_API BinaryOp final : public Expr { - public: - BinaryOp( - Passkey passkey, - BinaryOpType operation, - Val* out, - Val* lhs, - Val* rhs); - - void accept(IrVisitor* visitor) const override { - 
visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* lhs() const { - return lhs_; - } - - Val* rhs() const { - return rhs_; - } - - BinaryOpType operation() const { - return operation_; - } - - private: - const BinaryOpType operation_; - Val* const out_ = nullptr; - Val* const lhs_ = nullptr; - Val* const rhs_ = nullptr; -}; - -class TORCH_CUDA_CU_API TernaryOp final : public Expr { - public: - TernaryOp( - Passkey passkey, - TernaryOpType operation, - Val* out, - Val* in1, - Val* in2, - Val* in3); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in1() const { - return in1_; - } - - Val* in2() const { - return in2_; - } - - Val* in3() const { - return in3_; - } - - TernaryOpType operation() const { - return operation_; - } - - private: - const TernaryOpType operation_; - Val* const out_ = nullptr; - Val* const in1_ = nullptr; - Val* const in2_ = nullptr; - Val* const in3_ = nullptr; -}; - -class TORCH_CUDA_CU_API ReductionOp final : public Expr { - public: - ReductionOp( - Passkey passkey, - BinaryOpType operation, - Val* init, - Val* out, - Val* in); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in() const { - return in_; - } - - Val* init() const { - return init_; - } - - BinaryOpType operation() const { - return operation_; - } - - private: - const BinaryOpType operation_; - Val* const init_ = nullptr; - Val* const out_ = nullptr; - Val* const in_ = nullptr; -}; - -class TORCH_CUDA_CU_API WelfordOp final : public Expr { - public: - WelfordOp( - Passkey passkey, - Val* out_var, - Val* out_avg, - Val* out_N, - Val* init_var, - Val* init_avg, - Val* init_N, - Val* in_var, - Val* in_avg, - Val* in_N); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_avg_; - } - - Val* in() const { - return in_avg_; - } - - // Welford Specific accessors - // Almost wanted to add a new struct for {var, avg, N} - Val* outVar() const { - return out_var_; - } - - Val* outAvg() const { - return out_avg_; - } - - Val* outN() const { - return out_N_; - } - - Val* initVar() const { - return init_var_; - } - - Val* initAvg() const { - return init_avg_; - } - - Val* initN() const { - return init_N_; - } - - Val* inVar() const { - return in_var_; - } - - Val* inAvg() const { - return in_avg_; - } - - Val* inN() const { - return in_N_; - } - - private: - Val* const out_var_; - Val* const out_avg_; - Val* const out_N_; - Val* const init_var_; - Val* const init_avg_; - Val* const init_N_; - Val* const in_var_; - Val* const in_avg_; - Val* const in_N_; -}; - class TORCH_CUDA_CU_API TensorIndex final : public Val { public: TensorIndex( - Passkey, + IrBuilderPasskey, const fuser::cuda::TensorView* view, std::vector indices); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - std::vector::size_type nDims() const { return indices_.size(); } @@ -1170,8 +158,7 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val { TensorView* view() 
const { TORCH_INTERNAL_ASSERT(view_ != nullptr); - // TODO(kir): remove the need for const_cast - return const_cast(view_); // NOLINT + return const_cast(view_); // NOLINT } private: @@ -1179,46 +166,17 @@ class TORCH_CUDA_CU_API TensorIndex final : public Val { std::vector indices_; }; -class TORCH_CUDA_CU_API BroadcastOp final : public Expr { - public: - BroadcastOp(Passkey passkey, Val* out, Val* in); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - - Val* out() const { - return out_; - } - - Val* in() const { - return in_; - } - - private: - Val* const out_ = nullptr; - Val* const in_ = nullptr; -}; - //! Allocate is a lower level Node that describes a buffer of memory that //! is required as an intermediate within a kernel. The extent is the expression //! of the size of the buffer that is generated from the TensorView that //! describes the output of an operation. -//! -//! TODO(kir): The components of Allocate like Type and Name could be separated -//! from the the assocated TensorView. Perhaps that is more appropriate? -//! class TORCH_CUDA_CU_API Allocate final : public Expr { public: //! Allocation of a multi-dimensional buffer //! //! param shape Size of each dimension explicit Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, std::vector shape = {}, @@ -1228,20 +186,12 @@ class TORCH_CUDA_CU_API Allocate final : public Expr { //! //! param size Size of allocation explicit Allocate( - Passkey passkey, + IrBuilderPasskey passkey, Val* buffer, MemoryType memory_type, Val* size, bool zero_init = false); - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - Val* buffer() const { return buffer_; } @@ -1292,15 +242,7 @@ class TORCH_CUDA_CU_API Allocate final : public Expr { // class TORCH_CUDA_CU_API Sync final : public Expr { public: - explicit Sync(Passkey passkey, bool war_sync = false); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit Sync(IrBuilderPasskey passkey, bool war_sync = false); bool isWarHazardSync() const { return war_sync_; @@ -1315,30 +257,14 @@ class TORCH_CUDA_CU_API Sync final : public Expr { // in helpers.cu class TORCH_CUDA_CU_API InitMagicZero final : public Expr { public: - explicit InitMagicZero(Passkey passkey); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit InitMagicZero(IrBuilderPasskey passkey); }; // Simply prints "UPDATE_MAGIC_ZERO" in the code in accordance with magic_zero // in helpers.cu class TORCH_CUDA_CU_API UpdateMagicZero final : public Expr { public: - explicit UpdateMagicZero(Passkey passkey); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit UpdateMagicZero(IrBuilderPasskey passkey); }; // TODO(kir): promote to IR node @@ -1377,7 +303,6 @@ class TORCH_CUDA_CU_API Scope { void push_back(Expr* e) { exprs_.push_back(e); - e->setScope(this); } // Erase expr at pos @@ -1425,7 +350,7 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { //! //! TODO: cleaner way to set options? 
ForLoop( - Passkey passkey, + IrBuilderPasskey passkey, IterDomain* iter_domain, Val* index, Val* start, @@ -1435,17 +360,9 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { Val* vectorize_shift, bool unroll_required); - ForLoop(Passkey passkey, IterDomain* iter_domain); + ForLoop(IrBuilderPasskey passkey, IterDomain* iter_domain); - ForLoop(Passkey passkey, const ForLoop* other); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + ForLoop(IrBuilderPasskey passkey, const ForLoop* other); Val* index() const { return index_; @@ -1465,6 +382,7 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { return iter_domain_; } + // TODO: Return pointer instead of reference to be more consistent Scope& body() { return body_; } @@ -1524,15 +442,7 @@ class TORCH_CUDA_CU_API ForLoop final : public Expr { //! class TORCH_CUDA_CU_API IfThenElse final : public Expr { public: - explicit IfThenElse(Passkey passkey, Predicate* cond); - - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } + explicit IfThenElse(IrBuilderPasskey passkey, Predicate* cond); Scope& thenBody() { return then_body_; @@ -1567,16 +477,8 @@ class TORCH_CUDA_CU_API IfThenElse final : public Expr { //! reduction and sync buffers. class TORCH_CUDA_CU_API GridReduction final : public Expr { public: - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - GridReduction( - Passkey passkey, + IrBuilderPasskey passkey, ReductionOp* reduction_op, Allocate* reduction_buffer, Allocate* sync_buffer); @@ -1620,23 +522,11 @@ class TORCH_CUDA_CU_API GridReduction final : public Expr { //! broadcast and sync buffers. class TORCH_CUDA_CU_API GridBroadcast final : public Expr { public: - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - GridBroadcast( - Passkey passkey, + IrBuilderPasskey passkey, BroadcastOp* broadcast_op, Allocate* broadcast_buffer, - Allocate* sync_buffer) - : Expr(passkey), - broadcast_op_(broadcast_op), - broadcast_buffer_(broadcast_buffer), - sync_buffer_(sync_buffer){}; + Allocate* sync_buffer); BroadcastOp* broadcast_op() const { return broadcast_op_; @@ -1665,16 +555,8 @@ class TORCH_CUDA_CU_API GridBroadcast final : public Expr { //! reduction and sync buffers. class TORCH_CUDA_CU_API GridWelford final : public Expr { public: - void accept(IrVisitor* visitor) const override { - visitor->visit(this); - } - - void accept(MutableIrVisitor* visitor) override { - visitor->visit(this); - } - GridWelford( - Passkey passkey, + IrBuilderPasskey passkey, WelfordOp* welford_op, Allocate* var_buffer, Allocate* avg_buffer, diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.h b/torch/csrc/jit/codegen/cuda/kernel_ir_builder.h deleted file mode 100644 index 17a095baf12..00000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_builder.h +++ /dev/null @@ -1,131 +0,0 @@ -#pragma once - -#include -#include -#include - -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -//! Kernel IR builder interface -//! -//! The only way to create new Kernel IR nodes is through the -//! kir::IrBuilder interface. An IrBuilder instance is attached to a -//! 
particular Kernel instance and it provides methods for creating -//! single nodes (kir::IrBuilder::create()) or basic composite expressions -//! (ex. kir::IrBuilder::addExpr()). -//! -//! If the Kernel object is readily available, an IrBuilder can be "wrapped" -//! around it directly: -//! -//! kir::IrBuilder ir_builder(kernel); -//! -//! During lowering, another option is to create an IrBuilder for the -//! kernel that is being created: -//! -//! kir::IrBuilder ir_builder(GpuLower::current()->kernel()); -//! -//! Once we have an IR builder instance, creating nodes looks like: -//! -//! auto new_node = ir_builder.create(1)); -//! auto result = ir_builder.mulExpr(lhs, rhs); -//! -class TORCH_CUDA_CU_API IrBuilder { - public: - explicit IrBuilder(Kernel* kernel) : kernel_(kernel) {} - - //! Allocate a new Kernel IR node, forwarding the arguments - //! to the appropriate constructor - template - T* create(Args&&... args) { - const kir::Passkey passkey(kernel_); - const auto node = new T(passkey, std::forward(args)...); - kernel_->registerIrNode(passkey, std::unique_ptr(node)); - return node; - } - - // Unary operations - Val* negExpr(Val* val); - Val* notExpr(Val* val); - Val* setExpr(Val* val); - Val* setExprNamedScalar(const std::string& name, Val* val); - Val* addressExprNamedScalar(const std::string& name, Val* val); - - // Binary operations - Val* andExpr(Val* lhs, Val* rhs); - Val* eqExpr(Val* lhs, Val* rhs); - Val* gtExpr(Val* lhs, Val* rhs); - Val* ltExpr(Val* lhs, Val* rhs); - Val* leExpr(Val* lhs, Val* rhs); - Val* geExpr(Val* lhs, Val* rhs); - Val* addExpr(Val* lhs, Val* rhs); - Val* subExpr(Val* lhs, Val* rhs); - Val* mulExpr(Val* lhs, Val* rhs); - Val* divExpr(Val* lhs, Val* rhs); - Val* ceilDivExpr(Val* lhs, Val* rhs); - Val* modExpr(Val* lhs, Val* rhs); - Val* maxExpr(Val* lhs, Val* rhs); - Val* minExpr(Val* lhs, Val* rhs); - - // Ternary operations - Val* whereExpr(Val* pred, Val* lhs, Val* rhs); - - // Shortcuts for frequently used vals - Int* zeroVal(); - Int* oneVal(); - Bool* falseVal(); - Bool* trueVal(); - - NamedScalar* magicZeroVal(); - - private: - Val* newResult(DataType dtype); - Val* newArithmeticExpr(BinaryOpType op_type, Val* lhs, Val* rhs); - Val* newLogicExpr(BinaryOpType op_type, Val* lhs, Val* rhs); - - private: - // Non-owning pointer to the kernel to be modified - Kernel* kernel_ = nullptr; - // Frequently used constant vals - Int* zero_ = nullptr; - Int* one_ = nullptr; - Bool* false_ = nullptr; - Bool* true_ = nullptr; - - // Magic zero corresponds to runtime/helpers.cu magic_zero - NamedScalar* magic_zero_ = nullptr; -}; - -//! A wrapper builder with static expression simplification -//! -//! Example: -//! - addExpr(new Int(1), new Int(2)) -> Int(3) -//! - addExpr(new Int(0), new NamedScalar("foo")) -> NamedScalar("foo") -//! -//! Designed to be used to simplify predicate and index expressions in -//! generated code. Also, the shift validation may fail without -//! this simplification. 
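The folding rules described in the comment above (constant folding plus dropping the additive identity) can be illustrated with a deliberately simplified, standalone sketch. The Add node, the raw new calls, and the long value type are invented for this sketch and do not reflect the real builder, which (as the create() method above shows) registers every node with the Kernel through the passkey mechanism:

#include <optional>

struct Val { virtual ~Val() = default; };

struct Int final : Val {
  explicit Int(std::optional<long> v) : value(v) {}
  std::optional<long> value;  // nullopt means the scalar is symbolic, not a constant
};

struct Add final : Val {
  Add(Val* l, Val* r) : lhs(l), rhs(r) {}
  Val* lhs;
  Val* rhs;
};

// Simplifying add: fold two constants, drop the additive identity, otherwise
// build an ordinary Add node (ownership is omitted for brevity).
inline Val* addExpr(Val* lhs, Val* rhs) {
  auto asConst = [](Val* v) -> std::optional<long> {
    if (auto i = dynamic_cast<Int*>(v)) {
      return i->value;
    }
    return std::nullopt;
  };
  const auto lc = asConst(lhs);
  const auto rc = asConst(rhs);
  if (lc && rc) return new Int(*lc + *rc);  // addExpr(Int(1), Int(2)) -> Int(3)
  if (lc && *lc == 0) return rhs;           // addExpr(Int(0), foo)    -> foo
  if (rc && *rc == 0) return lhs;           // addExpr(foo, Int(0))    -> foo
  return new Add(lhs, rhs);
}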
-class TORCH_CUDA_CU_API SimplifyingIrBuilder : public IrBuilder { - public: - explicit SimplifyingIrBuilder(Kernel* kernel) : IrBuilder(kernel) {} - - Val* negExpr(Val* val); - Val* notExpr(Val* val); - - Val* addExpr(Int* lhs, Int::ScalarType rhs); - Val* addExpr(Int* lhs, Int* rhs); - Val* addExpr(Val* lhs, Val* rhs); - Val* subExpr(Val* lhs, Val* rhs); - Val* andExpr(Val* lhs, Val* rhs); -}; - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp new file mode 100644 index 00000000000..bfc4794e299 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.cpp @@ -0,0 +1,180 @@ +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { +namespace kir { +std::vector IrVisitor::handle(const std::vector& exprs) { + exprs_ = std::vector(exprs); + for (auto expr : exprs) { + handle(expr); + } + return exprs_; +} + +void IrVisitor::handle(ForLoop* fl) { + for_loops_.push_back(fl); + scope_.push_back(&fl->body()); + auto body_exprs = std::vector(fl->body().exprs()); + for (auto expr : body_exprs) { + handle(expr); + } + scope_.pop_back(); + for_loops_.pop_back(); +} + +void IrVisitor::handle(IfThenElse* ite) { + scope_.push_back(&ite->thenBody()); + auto then_exprs = std::vector(ite->thenBody().exprs()); + for (auto expr : then_exprs) { + handle(expr); + } + scope_.pop_back(); + + scope_.push_back(&ite->elseBody()); + auto else_exprs = std::vector(ite->elseBody().exprs()); + for (auto expr : else_exprs) { + handle(expr); + } + scope_.pop_back(); +} + +std::vector ExprMutator::mutate(bool reverse_order) { + if (insertions_.empty() && replacements_.empty()) { + return exprs_; + } + + auto run_insertion = [&](MutationInformation info) { + if (info.scope == nullptr) { + // If reference is nullptr and there are no expressions, simply insert the + // expr + if (exprs_.empty() && info.reference == nullptr) { + exprs_.push_back(info.new_expr); + return; + } + auto pos_it = std::find(exprs_.begin(), exprs_.end(), info.reference); + TORCH_INTERNAL_ASSERT( + pos_it != exprs_.end(), + "Issue finding reference expression for insertion."); + if (info.mode == MutationMode::BEFORE) { + exprs_.insert(pos_it, info.new_expr); + } else { + exprs_.insert(pos_it + 1, info.new_expr); + } + } else { + // If reference is nullptr and there are no expressions, simply insert the + // expr + if (info.scope->exprs().empty() && info.reference == nullptr) { + info.scope->push_back(info.new_expr); + return; + } + if (info.mode == MutationMode::BEFORE) { + info.scope->insert_before(info.reference, info.new_expr); + } else { + info.scope->insert_after(info.reference, info.new_expr); + } + } + }; + + if (reverse_order) { + for (auto it = insertions_.rbegin(); it != insertions_.rend(); ++it) { + run_insertion(*it); + } + } else { + for (auto insertion_info : insertions_) { + run_insertion(insertion_info); + } + } + + for (auto replacement_info : replacements_) { + if (replacement_info.scope == nullptr) { + auto pos_it = + std::find(exprs_.begin(), exprs_.end(), replacement_info.reference); + TORCH_INTERNAL_ASSERT( + pos_it != exprs_.end(), + "Issue finding reference expression for replacement."); + exprs_.insert(pos_it, replacement_info.new_expr); + // iterator can be invalidated from insertion + pos_it = + std::find(exprs_.begin(), exprs_.end(), replacement_info.reference); + exprs_.erase(pos_it); + } else { + 
replacement_info.scope->insert_before( + replacement_info.reference, replacement_info.new_expr); + replacement_info.scope->erase(replacement_info.reference); + } + } + + insertions_.clear(); + replacements_.clear(); + + return exprs_; +} + +std::vector ExprMutator::traverseAndInsert( + const std::vector& exprs, + bool reverse_order) { + IrVisitor::handle(exprs); + return mutate(reverse_order); +} + +void ExprMutator::registerMutation( + Expr* reference, + Expr* new_expr, + Scope* scope, + MutationMode mode) { + MutationInformation mutation; + mutation.reference = reference; + mutation.new_expr = new_expr; + mutation.scope = scope; + mutation.mode = mode; + if (mode == MutationMode::BEFORE || mode == MutationMode::AFTER) { + insertions_.push_back(mutation); + } else { + replacements_.push_back(mutation); + } +} + +void ExprMutator::registerInsertBefore( + Expr* reference, + Expr* new_expr, + Scope* scope) { + registerMutation(reference, new_expr, scope, MutationMode::BEFORE); +} + +void ExprMutator::registerInsertAfter( + Expr* reference, + Expr* new_expr, + Scope* scope) { + registerMutation(reference, new_expr, scope, MutationMode::AFTER); +} + +void ExprMutator::registerReplace( + Expr* reference, + Expr* new_expr, + Scope* scope) { + registerMutation(reference, new_expr, scope, MutationMode::REPLACE); +} + +void ExprMutator::registerInsertBefore(Expr* reference, Expr* new_expr) { + Scope* scope = scope_.empty() ? nullptr : scope_.back(); + registerInsertBefore(reference, new_expr, scope); +} + +void ExprMutator::registerInsertAfter(Expr* reference, Expr* new_expr) { + Scope* scope = scope_.empty() ? nullptr : scope_.back(); + registerInsertAfter(reference, new_expr, scope); +} + +void ExprMutator::registerReplace(Expr* reference, Expr* new_expr) { + Scope* scope = scope_.empty() ? nullptr : scope_.back(); + registerReplace(reference, new_expr, scope); +} + +} // namespace kir +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h new file mode 100644 index 00000000000..2140498af14 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/kernel_ir_dispatch.h @@ -0,0 +1,118 @@ +#pragma once + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +class Expr; + +namespace kir { +class Predicate; +class TensorIndex; +class ForLoop; +class IfThenElse; +class Scope; + +// Base visitor class that visits all nodes in provided vector. +// +// Includes visiting through scopes like IfThenElse and ForLoop, and tracks +// them in scopes_ and for_loops_. +// +// Makes a copy of exprs at exprs_ which could be used to modify and return. +// +// When traversing through ITE/FLs it will use a copy +// of the provided expressions to make it safe to insert/delete nodes. +// +// Provides a simple base class to inherit from for typical lowering passes on +// Expr list +class TORCH_CUDA_CU_API IrVisitor : public OptOutDispatch { + public: + std::vector handle(const std::vector& expr); + + protected: + using OptOutDispatch::handle; + + virtual void handle(ForLoop*) override; + virtual void handle(IfThenElse*) override; + + protected: + std::vector for_loops_; + std::vector scope_; + std::vector exprs_; +}; + +// Base Expr Mutator class that visits all nodes with IrVisitor, and then +// inserts new expressions or replaces expressions based on insertion/replace +// maps provided. 
These replacement maps are expected to accumulate during an +// initial traversal, then runs an insertion based on them after the overloaded +// traversal. +// +// Order of mutations may be important, mutations are ordered according to the +// following rules: +// Before/After insertions are ordered as registered when reverse_order == +// false, +// +// Before/After insertions are in reverse order as registered when +// reverse_order == true, +// +// Before/After insertions are done before Expr replacements, so reference for +// insertions must be on pre-replaced Exprs +// +// To place in a scope that is empty, simply provide a nullptr reference +// Since insertions are done in order, it's possible to insert an expression in +// an empty scope, and then use that inserted scope as a reference for +// subsequent mutations. +class ExprMutator : public IrVisitor { + protected: + std::vector traverseAndInsert( + const std::vector& expr, + bool reverse_order = false); + + std::vector mutate(bool reverse_order = false); + + using IrVisitor::handle; + // Registration function which *don't* need to be called "in place" during + // visiting. + void registerInsertBefore(Expr* reference, Expr* new_expr, Scope* scope); + void registerInsertAfter(Expr* reference, Expr* new_expr, Scope* scope); + void registerReplace(Expr* reference, Expr* new_expr, Scope* scope); + + // Registration function which need to be called "in place" during visiting. + // I.E. + // if you want to insert before/after or replace an Expr, you must register + // when in handle(Expr*) of that expr. + void registerInsertBefore(Expr* reference, Expr* new_expr); + void registerInsertAfter(Expr* reference, Expr* new_expr); + void registerReplace(Expr* reference, Expr* new_expr); + + private: + enum class MutationMode { BEFORE, AFTER, REPLACE }; + + void registerMutation( + Expr* ref, + Expr* new_expr, + Scope* scope, + MutationMode mode); + + struct MutationInformation { + Expr* reference = nullptr; + Expr* new_expr = nullptr; + Scope* scope = nullptr; + MutationMode mode = MutationMode::BEFORE; + }; + + // Track insertions as they're registered + std::vector insertions_; + + // Track replacements as they're registered + std::vector replacements_; +}; + +} // namespace kir +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp b/torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp deleted file mode 100644 index e00da31423c..00000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.cpp +++ /dev/null @@ -1,451 +0,0 @@ -#include -#include - -#include -#include - -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -namespace { - -const char* boolLiteral(bool value) { - return value ? 
"true" : "false"; -} - -std::string varName(const kir::Val* val, const char* prefix) { - std::stringstream value_name; - if (val == nullptr) { - value_name << "$nullptr"; - } else if (val->name() != kInvalidStmName) { - value_name << prefix << val->name(); - } else { - value_name << "k" << prefix << val->id(); - } - return value_name.str(); -} - -} // namespace - -void IrPrinter::printNode(const kir::Node* node) { - os_ << gen(node, true); -} - -void IrPrinter::printKernel(const Kernel* kernel) { - TORCH_CHECK(kernel != nullptr); - - // kernel declaration - os_ << "\nKERNEL ("; - for (auto in : kernel->inputs()) { - os_ << gen(in); - if (in != kernel->inputs().back()) { - os_ << ", "; - } - } - os_ << ") -> ("; - for (auto out : kernel->outputs()) { - os_ << gen(out); - if (out != kernel->outputs().back()) { - os_ << ", "; - } - } - os_ << ") :\n"; - - // kernel body - startBlock(); - for (auto expr : kernel->topLevelExprs()) { - os_ << gen(expr, true); - } - endBlock(); - os_ << "END.\n\n"; -} - -std::ostream& IrPrinter::indent() { - for (const auto i : c10::irange(indent_level_)) { - (void)i; // Suppress unused variable warning - ir_str_ << kTab; - } - ir_str_ << margin_; - return ir_str_; -} - -std::string IrPrinter::gen(const kir::Node* node, bool top_level) { - if (node == nullptr) { - return "$nullptr"; - } - - // If we're generatign a top level statement we expect to start - // with an empty set of uses - TORCH_INTERNAL_ASSERT(!implicit_definition_ || uses_.empty() || !top_level); - - // Mark the node as generated - visited_.insert(node); - - // Generate the node itself - std::stringstream node_str; - std::swap(node_str, ir_str_); - node->accept(this); - std::swap(node_str, ir_str_); - - if (!implicit_definition_) { - return node_str.str(); - } - - if (top_level) { - // Implicitly mark top level nodes as used, so we - // get their definitions printed (useful for debugging) - if (auto val = dynamic_cast(node)) { - uses_.insert(val); - } - - // Make a copy of the node uses (and reset global state) - const auto node_uses = uses_; - uses_.clear(); - - std::stringstream top_level_str; - - // Hoist implicit definitions - for (auto use : node_uses) { - const auto def = use->definition(); - if (def && visited_.find(def) == visited_.end()) { - margin_ = "~ "; - top_level_str << gen(def, true); - margin_ = ""; - } - } - - top_level_str << node_str.str(); - return top_level_str.str(); - } else { - return node_str.str(); - } -} - -std::string IrPrinter::use(const kir::Val* val) { - if (val != nullptr) { - uses_.insert(val); - } - return gen(val); -} - -void IrPrinter::startBlock() { - ++indent_level_; -} - -void IrPrinter::endBlock() { - TORCH_CHECK(indent_level_ > 0); - --indent_level_; -} - -void IrPrinter::handleBlock(const kir::Scope& scope) { - // Save the uses of the parent scope - decltype(uses_) outer_uses; - std::swap(uses_, outer_uses); - - startBlock(); - for (auto expr : scope.exprs()) { - ir_str_ << gen(expr, true); - } - endBlock(); - - // Restore parent's uses - std::swap(uses_, outer_uses); -} - -void IrPrinter::visit(const kir::Bool* node) { - if (node->isConst()) { - ir_str_ << boolLiteral(*node->value()); - } else { - ir_str_ << varName(node, "b"); - } -} - -void IrPrinter::visit(const kir::Double* node) { - if (node->isConst()) { - const int digits = std::numeric_limits::max_digits10; - ir_str_ << "double(" << std::setprecision(digits) << *node->value() << ")"; - } else { - ir_str_ << varName(node, "d"); - } -} - -void IrPrinter::visit(const kir::Int* node) { - if 
(node->isConst()) { - ir_str_ << *node->value(); - } else { - ir_str_ << varName(node, "i"); - } -} - -void IrPrinter::visit(const kir::NamedScalar* node) { - ir_str_ << node->name(); -} - -void IrPrinter::visit(const kir::Predicate* node) { - switch (node->predicate_type()) { - case PredicateType::Inline: { - ir_str_ << "Inline"; - break; - } - case PredicateType::Manual: { - ir_str_ << node->value(); - break; - } - case PredicateType::Misaligned: { - ir_str_ << "Misaligned"; - break; - } - case PredicateType::Padding: { - ir_str_ << "Padding"; - break; - } - case PredicateType::Shift: { - ir_str_ << "Shift"; - break; - } - case PredicateType::Unswitch: { - ir_str_ << "Unswitch"; - break; - } - case PredicateType::Vectorize: { - ir_str_ << "Vectorize"; - break; - } - default: - break; - } -} - -void IrPrinter::visit(const kir::TensorIndex* node) { - ir_str_ << gen(node->view()) << "["; - for (auto index : node->indices()) { - ir_str_ << use(index); - if (index != node->indices().back()) { - ir_str_ << ", "; - } - } - ir_str_ << "]"; -} - -void IrPrinter::visit(const kir::IterDomain* node) { - ir_str_ << varName(node, "id") << "["; - if (node->isRFactorProduct()) { - ir_str_ << "rfactor."; - } - ir_str_ << node->parallelType() << "." << node->iterType() << "(" - << use(node->start()) << " .. " << use(node->extent()) << ")]"; -} - -void IrPrinter::visit(const kir::TensorDomain*) { - // TODO(kir): print Tensor shapes? - ir_str_ << "kir::TensorDomain"; -} - -void IrPrinter::visit(const kir::TensorView* node) { - // TODO(kir): print memory type too? - ir_str_ << varName(node, "T"); -} - -void IrPrinter::visit(const kir::UnaryOp* node) { - indent() << gen(node->out()) << " = "; - - auto op_type = node->operation(); - - if (auto op = inline_op_str(op_type)) { - if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { - ir_str_ << stringifyBooleanOp(op_type) << gen(node->in()); - } else { - ir_str_ << *op << gen(node->in()); - } - } else { - if (op_type == UnaryOpType::Cast) { - const auto cast_str = - cast_func_str({node->in()->dtype(), node->out()->dtype()}); - ir_str_ << cast_str.value(); - } else { - ir_str_ << op_type; - if (needFloatSuffix(op_type) && node->out()->dtype() == DataType::Float) { - ir_str_ << "f"; - } - } - - if (op_type == UnaryOpType::RandLike) { - ir_str_ << "(RND"; - } else { - ir_str_ << "("; - ir_str_ << use(node->in()); - } - ir_str_ << ")"; - } - - ir_str_ << "\n"; -} - -void IrPrinter::visit(const kir::BinaryOp* node) { - indent() << gen(node->out()) << " = "; - - const auto op_type = node->operation(); - const auto lhs = use(node->lhs()); - const auto rhs = use(node->rhs()); - - if (auto op = inline_op_str(op_type)) { - ir_str_ << lhs << " "; - if (alsoBooleanOperator(op_type) && - node->out()->dtype() == DataType::Bool) { - ir_str_ << stringifyBooleanOp(op_type); - } else { - ir_str_ << *op; - } - ir_str_ << " " << rhs; - } else { - ir_str_ << op_type; - if (needFloatSuffix(op_type) && node->out()->dtype() == DataType::Float) { - ir_str_ << "f"; - } - ir_str_ << "(" << lhs << ", " << rhs << ")"; - } - - ir_str_ << "\n"; -} - -void IrPrinter::visit(const kir::TernaryOp* node) { - indent() << gen(node->out()) << " = " << node->operation() << "(" - << use(node->in1()) << ", " << use(node->in2()) << ", " - << use(node->in3()) << ")\n"; -} - -void IrPrinter::visit(const kir::ReductionOp* node) { - indent() << gen(node->out()) << " = " - << "REDUCTION(op='" << node->operation() << "'" - << ", in=" << use(node->in()) << ", init=" << use(node->init()) 
- << ", pred=" << use(node->predicate()) << ")\n"; -} - -void IrPrinter::visit(const kir::WelfordOp* node) { - indent() << gen(node->outVar()) << "," << gen(node->outAvg()) << "," - << gen(node->outN()) << " = " - << "Welford( inAvg=" << use(node->inAvg()); - if (!node->inN()->isOneInt()) { - indent() << " inVar=" << use(node->inVar()); - } - indent() << " inN=" << use(node->inN()); - if (!node->initN()->isZeroInt()) { - indent() << ", initVar=" << use(node->initVar()) - << " initAvg=" << use(node->initAvg()) - << " initN=" << use(node->initN()); - } - indent() << ", pred=" << use(node->predicate()) << ")\n"; -} - -void IrPrinter::visit(const kir::GridReduction* node) { - const auto* reduction_op = node->reduction_op(); - indent() << gen(reduction_op->out()) << " = " - << "GRID_REDUCTION(op='" << reduction_op->operation() << "'" - << ", in=" << use(reduction_op->in()) - << ", init=" << use(reduction_op->init()) - << ", pred=" << use(reduction_op->predicate()) << ")\n"; - indent() << kTab << kTab - << ".reduction_buffer=" << use(node->reduction_buffer()->buffer()) - << "\n"; - indent() << kTab << kTab - << ".sync_buffer=" << use(node->sync_buffer()->buffer()) << "\n"; - indent() << kTab << kTab << ".grid_pred=" << use(node->predicate()) << "\n"; -} - -void IrPrinter::visit(const kir::GridWelford* node) { - const auto* welford_op = node->welford_op(); - indent() << gen(welford_op->outVar()) << "," << gen(welford_op->outAvg()) - << "," << gen(welford_op->outN()) << " = " - << "GRID_WELFORD(" - << "inAvg=" << use(welford_op->inAvg()); - if (!welford_op->inN()->isOneInt()) { - indent() << ", inVar=" << use(welford_op->inVar()); - } - indent() << ", inN=" << use(welford_op->inN()); - if (!welford_op->initN()->isZeroInt()) { - indent() << ", initVar=" << use(welford_op->initVar()) - << " initAvg=" << use(welford_op->initAvg()) - << " initN=" << use(welford_op->initN()); - } - indent() << ", pred=" << use(welford_op->predicate()) << ")\n"; - indent() << kTab << kTab - << ".var_buffer=" << use(node->var_buffer()->buffer()) - << ".avg_buffer=" << use(node->avg_buffer()->buffer()) - << ".n_buffer=" << use(node->N_buffer()->buffer()) << "\n"; - indent() << kTab << kTab - << ".sync_buffer=" << use(node->sync_buffer()->buffer()) << "\n"; - indent() << kTab << kTab << ".grid_pred=" << use(node->predicate()) << "\n"; -} - -void IrPrinter::visit(const kir::BroadcastOp* node) { - indent() << gen(node->out()) << " = BROADCAST(" << use(node->in()) << ")\n"; -} - -void IrPrinter::visit(const kir::ForLoop* node) { - indent() << "FOR " << gen(node->index()) << " in " << gen(node->iter_domain()) - << ":\n"; - handleBlock(node->body()); -} - -void IrPrinter::visit(const kir::IfThenElse* node) { - indent() << "IF " << use(node->predicate()) << ":\n"; - handleBlock(node->thenBody()); - if (node->hasElse()) { - indent() << "ELSE:\n"; - handleBlock(node->elseBody()); - } -} - -void IrPrinter::visit(const kir::Allocate* node) { - indent() << gen(node->buffer()) << " = ALLOCATE(" - << "mem_type=" << node->memoryType() << ", " - << "size=" << use(node->size()) << ", " - << "zero_init=" << boolLiteral(node->zeroInit()) << ")\n"; - if (node->alias() != nullptr) { - indent() << kTab << kTab << ".alias=" << gen(node->alias()->buffer()) - << "\n"; - } -} - -void IrPrinter::visit(const kir::Sync* node) { - indent() << "SYNC(war_hazard=" << boolLiteral(node->isWarHazardSync()) - << ")\n"; -} - -void IrPrinter::visit(const kir::InitMagicZero* node) { - indent() << "NVFUSER_DEFINE_MAGIC_ZERO\n"; -} - -void IrPrinter::visit(const 
kir::UpdateMagicZero* node) { - indent() << "NVFUSER_UPDATE_MAGIC_ZERO\n"; -} - -std::string toString(const kir::Node* stmt, bool implicit_definitions) { - std::stringstream ss; - IrPrinter ir_printer(ss, implicit_definitions); - ir_printer.printNode(stmt); - return ss.str(); -} - -std::string toString( - const std::vector& exprs, - bool implicit_definitions) { - std::stringstream ss; - IrPrinter ir_printer(ss, implicit_definitions); - for (auto expr : exprs) { - ir_printer.printNode(expr); - } - return ss.str(); -} - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.h b/torch/csrc/jit/codegen/cuda/kernel_ir_printer.h deleted file mode 100644 index 115901a031a..00000000000 --- a/torch/csrc/jit/codegen/cuda/kernel_ir_printer.h +++ /dev/null @@ -1,129 +0,0 @@ -#pragma once - -#include - -#include -#include - -#include -#include -#include -#include - -namespace torch { -namespace jit { -namespace fuser { -namespace cuda { -namespace kir { - -//! Define pretty printing functions for Kernel IR nodes -//! -//! This class is intended for debug printing, so it attempts -//! to handle invalid IR states as much as possible. -//! -//! implicit_definition_ = true will recurisvely print the definition of all -//! inputs to an expression if they haven't been printed. -class TORCH_CUDA_CU_API IrPrinter : private kir::IrVisitor { - static constexpr char const* kTab = " "; - - public: - //! Constructs a new IrPrinter which outputs to the specified stream - explicit IrPrinter(std::ostream& os, bool implicit_definition = true) - : os_(os), implicit_definition_(implicit_definition) {} - - //! Print a single Kernel IR node - void printNode(const kir::Node* node); - - //! 
Print a complete Kernel definition - void printKernel(const Kernel* kernel); - - private: - // Generates a string representation of an IR node - // - // If `top_level` is true, all the value uses are tracked and - // their definitions are implicitly printed before the node itself - // - std::string gen(const kir::Node* node, bool top_level = false); - - // Generate a string representation of an used value - // (this helps automatically tracking the value uses) - std::string use(const kir::Val* val); - - std::ostream& indent(); - - void startBlock(); - void endBlock(); - void handleBlock(const kir::Scope& scope); - - void visit(const kir::Bool*) final; - void visit(const kir::Double*) final; - void visit(const kir::Int*) final; - void visit(const kir::NamedScalar*) final; - void visit(const kir::Predicate*) final; - - void visit(const kir::TensorIndex*) final; - void visit(const kir::IterDomain*) final; - void visit(const kir::TensorDomain*) final; - void visit(const kir::TensorView*) final; - - void visit(const kir::UnaryOp*) final; - void visit(const kir::BinaryOp*) final; - void visit(const kir::TernaryOp*) final; - void visit(const kir::ReductionOp*) final; - void visit(const kir::WelfordOp*) final; - void visit(const kir::BroadcastOp*) final; - - void visit(const kir::GridReduction*) final; - void visit(const kir::GridWelford*) final; - void visit(const kir::ForLoop*) final; - void visit(const kir::IfThenElse*) final; - void visit(const kir::Allocate*) final; - void visit(const kir::Sync*) final; - void visit(const kir::InitMagicZero*) final; - void visit(const kir::UpdateMagicZero*) final; - - private: - std::ostream& os_; - - // Current indentation level - int indent_level_ = 0; - - // Internal IR generation stream - std::stringstream ir_str_; - - // Tracks the set of nodes which have been printed - std::unordered_set visited_; - - // Optional left margin printed after the indentation - const char* margin_ = ""; - - // The set of values used by the current top-level IR node - std::unordered_set uses_; - - // If the definition of all inputs to an expression haven't been printed - // already implicit_definition_ = true will print them before printing the - // requested node. - bool implicit_definition_ = true; -}; - -//! Returns the string representation of a Kernel IR node. If the definition of -//! all inputs to an expression haven't been printed already -//! implicit_definition_ = true will print them before printing the requested -//! node. -TORCH_CUDA_CU_API std::string toString( - const kir::Node* stmt, - bool implicit_definitions = true); - -//! Returns the string representation of a vector of kir::Expr, convenient -//! debugm echanism during lowering. If the definition of all inputs to an -//! expression haven't been printed already implicit_definition_ = true will -//! print them before printing the requested node. 
-TORCH_CUDA_CU_API std::string toString( - const std::vector& exprs, - bool implicit_definitions = true); - -} // namespace kir -} // namespace cuda -} // namespace fuser -} // namespace jit -} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower2device.cpp b/torch/csrc/jit/codegen/cuda/lower2device.cpp index 036eee58206..21eb6e02fb8 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.cpp +++ b/torch/csrc/jit/codegen/cuda/lower2device.cpp @@ -6,18 +6,19 @@ #include #include #include -#include #include #include +#include #include +#include #include #include #include #include #include #include +#include #include -#include #include #include #include @@ -33,152 +34,15 @@ namespace jit { namespace fuser { namespace cuda { -// TODO(kir): revisit this thread_local GpuLower* active_gpu_lower = nullptr; // NOLINT namespace { -// Going to generate a map of tensor view root domain extents to reduce the -// number used during lowering. For example if we have: -// -// T2[i0, i1] = T1[i0, i1] + T2[i2, i3] -// -// We know it would be safe to use: -// -// T2[i0, i1] = T1[i0, i1] + T2[i0, i1] -// -// And that way we don't generate T2.size[0] and T2.size[1], instead we will -// reuse T1.size[0] and T1.size[1] -// This is important when doing CSE as T2 and T1 would otherwise look like -// they're using different values, even though we know they're the same -// -// There's some duplicate logic here that's in computeAt map, but it's not so -// concice there to pull out. May want to consider making this mapping its own -// class especially as it may be useful during scheduling. -std::unordered_map getSimplificationMap(Fusion* fusion) { - std::list> disjoint_root_sets; - std::unordered_map*> - id_to_disjoint_root_set; - - auto map_root_ids = [&disjoint_root_sets, &id_to_disjoint_root_set]( - IterDomain* id0, IterDomain* id1) { - if (id0->isBroadcast() || id1->isBroadcast()) { - return; - } - - auto disjoint_set_0_it = id_to_disjoint_root_set.find(id0); - auto disjoint_set_1_it = id_to_disjoint_root_set.find(id1); - bool set_0_found = disjoint_set_0_it != id_to_disjoint_root_set.end(); - bool set_1_found = disjoint_set_1_it != id_to_disjoint_root_set.end(); - - if (set_0_found && set_1_found) { - if (disjoint_set_0_it->second == disjoint_set_1_it->second) { - return; - } - // merge second disjoint set into first - auto* set_0 = disjoint_set_0_it->second; - auto* set_1 = disjoint_set_1_it->second; - for (auto id : *set_1) { - set_0->emplace(id); - id_to_disjoint_root_set[id] = set_0; - } - // remove second set from disjoint_root_sets - disjoint_root_sets.erase(std::find( - disjoint_root_sets.begin(), disjoint_root_sets.end(), *set_1)); - } else if (set_0_found || set_1_found) { - auto existing_set = - set_0_found ? disjoint_set_0_it->second : disjoint_set_1_it->second; - auto to_add_id = set_0_found ? 
id1 : id0; - existing_set->emplace(to_add_id); - id_to_disjoint_root_set[to_add_id] = existing_set; - // add entry into existing set - } else { - // create new set entry - disjoint_root_sets.emplace_back(std::unordered_set()); - auto* new_set = &disjoint_root_sets.back(); - new_set->emplace(id0); - new_set->emplace(id1); - id_to_disjoint_root_set[id0] = new_set; - id_to_disjoint_root_set[id1] = new_set; - } - }; - - auto fusion_vals = fusion->usedMathVals(); - for (auto producer_tv : ir_utils::filterByType(fusion_vals)) { - auto consumer_tvs = ir_utils::consumerTvsOf(producer_tv); - for (auto consumer_tv : consumer_tvs) { - auto pairwise_map = PairwiseRootDomainMap(producer_tv, consumer_tv); - auto c2p_root_map = pairwise_map.mapConsumerToProducer( - consumer_tv->domain(), producer_tv->domain()); - for (auto entry : c2p_root_map) { - auto c_id = entry.first; - auto p_id = entry.second; - map_root_ids(p_id, c_id); - } - } - } - - // Map each set to an input ID (if it exists) that has the smallest ->name() - // entry value - std::unordered_map*, IterDomain*> - set_to_input_id; - - // Loop over the root domains, of the inputs to the fusion. Pick an input ID - // to use as the representative ID of the collected sets. Only consider inputs - // as those are the ones that map to values like "T0.size[1]". They are he - // ID's that propagated their extents into the problem. We could also check - // the outputs as we do have C++ examples of using output dimensions for the - // problem size instead of inputs. However, we don't do anything where we can - // translate to those kinds of kernels integrated into PyTorch. - for (auto input_tv : ir_utils::filterByType(fusion->inputs())) { - for (auto id : - TensorDomain::noReductions(input_tv->getMaybeRFactorDomain())) { - auto id_set_it = id_to_disjoint_root_set.find(id); - if (id_set_it == id_to_disjoint_root_set.end()) { - continue; - } - auto* id_set = id_set_it->second; - if (set_to_input_id.find(id_set) == set_to_input_id.end()) { - set_to_input_id[id_set] = id; - } else { - auto input_id_of_set = set_to_input_id.at(id_set); - // Swap id's if new name is less than previously set - bool swap_ids = id->name() < input_id_of_set->name(); - // If new id is a const scalar but previously was'nt use the const - // scalar - swap_ids = swap_ids || - (id->extent()->isConstScalar() && - !input_id_of_set->extent()->isConstScalar()); - // If previous scalar was const and new isn't, don't swap - swap_ids = swap_ids && - !(input_id_of_set->extent()->isConstScalar() && - !id->extent()->isConstScalar()); - - if (swap_ids) { - set_to_input_id[id_set] = id; - } - } - } - } - - // Finally make map from ID extents to the representitive ID extent. - std::unordered_map extent_to_min_input_id_extent; - for (auto entry : set_to_input_id) { - auto* set = entry.first; - auto input_id = entry.second; - for (auto id : *set) { - extent_to_min_input_id_extent[id->extent()] = input_id->extent(); - } - } - return extent_to_min_input_id_extent; -} - -class KIRCleaner : public kir::MutableIrVisitor { +class KIRCleaner : public OptOutDispatch { public: //! 
Remove nop IR nodes - static std::vector cleanUp( - const std::vector& loop_nests) { + static std::vector cleanUp(const std::vector& loop_nests) { KIRCleaner cleaner; - std::vector out_loop_nests; + std::vector out_loop_nests; for (auto loop_nest : loop_nests) { cleaner.handle(loop_nest); // No need to keep the loop nest if it's determined to be nop @@ -190,16 +54,17 @@ class KIRCleaner : public kir::MutableIrVisitor { } private: - void handle(kir::Expr* expr) { + using OptOutDispatch::handle; + void handle(Expr* expr) final { if (expr->isA() || expr->isA()) { - expr->accept(this); + OptOutDispatch::handle(expr); } else { // Any non-scoping expr is not considered nop is_nop_ = false; } } - void visit(kir::ForLoop* fl) final { + void handle(kir::ForLoop* fl) final { auto exprs = fl->body().exprs(); fl->body().clear(); for (auto expr : exprs) { @@ -213,7 +78,7 @@ class KIRCleaner : public kir::MutableIrVisitor { is_nop_ = fl->body().empty(); } - void visit(kir::IfThenElse* ite) final { + void handle(kir::IfThenElse* ite) final { const auto conditional = ite->predicate()->value(); // Visit the then block @@ -248,9 +113,8 @@ class KIRCleaner : public kir::MutableIrVisitor { // conditional and move the exprs in the else block to the then // block. if (then_nop && !else_nop) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - kir::Bool* pred = ite->predicate()->value(); - kir::Bool* not_pred = ir_builder.notExpr(pred)->as(); + Bool* pred = ite->predicate()->value(); + Bool* not_pred = SimplifyingIrBuilder::notExpr(pred)->as(); ite->predicate()->setValue(not_pred); for (auto expr : ite->elseBody().exprs()) { ite->thenBody().push_back(expr); @@ -269,84 +133,6 @@ class KIRCleaner : public kir::MutableIrVisitor { } // namespace -void GpuLower::replaceSymbolicSizes() { - FUSER_PERF_SCOPE("GpuLower::Lower::replaceSymbolicSizes"); - - kir::IrBuilder ir_builder(kernel()); - - // Grab inputs and outputs - std::vector inputs_and_outputs; - for (auto val : fusion_->inputs()) { - if (ir_utils::isTV(val)) { - inputs_and_outputs.push_back(val->as()); - } - } - // Symbolic size is necessary for outputs if there are no inputs. - // Otherwise infer output sizes from the inputs via expression evaluation. - if (fusion_->inputs().empty()) { - for (auto val : fusion_->outputs()) { - if (ir_utils::isTV(val)) { - inputs_and_outputs.push_back(val->as()); - } - } - } - - // Generate map for all tensorview root domain values to map them to symbolic - // values. i.e. T0->getRootDomain()[0] would map to a named scalar - // "T0.size[0]". This map will be used when lowering fusion ir to kernel ir. - for (TensorView* tv : inputs_and_outputs) { - // Replace the domain with one based on Ti.size[j] - const std::vector& root_td = tv->getRootDomain(); - - size_t dim = 0; - for (auto id : root_td) { - const Val* orig_size = id->extent(); - - // Output sizes could have reduction axes, which isn't what gets output. 
- // NOLINTNEXTLINE(bugprone-branch-clone) - if (id->isReduction() || - (id->getIterType() == IterType::BroadcastWithoutStride)) { - continue; - } else if ( - id->isRFactorProduct() || - // NOLINTNEXTLINE(bugprone-branch-clone) - (id->getIterType() == IterType::BroadcastWithStride) || - orig_size->isConstScalar()) { - dim++; - continue; - } - - // TODO(kir): consider a different implementation which doesn't - // hijack the kir_val_map_ - // Currently turn off this part for inputs of segmented fusion, - // since FusionKernelRuntime will provide these as integer inputs - if (kir_val_map_.find(orig_size) == kir_val_map_.end() && - !orig_size->isFusionInput() && !orig_size->isConstScalar()) { - std::stringstream ss; - ss << "T" << tv->name() << ".size[" << dim++ << "]"; - kir_val_map_[orig_size] = ir_builder.create( - ss.str(), orig_size->getDataType().value()); - } else { - dim++; - } - } - } - - // Use a minimal number of sizes from provided tensors. - auto extent_simplification_map = getSimplificationMap(fusion_); - for (auto extent_entry : extent_simplification_map) { - auto orig_extent = extent_entry.first; - auto simplified_extent = extent_entry.second; - if (kir_val_map_.count(orig_extent)) { - if (kir_val_map_.count(simplified_extent)) { - kir_val_map_[orig_extent] = kir_val_map_[simplified_extent]; - } else { - kir_val_map_[orig_extent] = lowerValue(simplified_extent); - } - } - } -} - void GpuLower::collectPaddedParallelDims() { ExpressionEvaluator ee(fusion_); bool can_be_single_warp = true; @@ -398,14 +184,12 @@ void GpuLower::collectPaddedParallelDims() { } } -void GpuLower::lower() { +void GpuLower::lower(Fusion* fusion) { FUSER_PERF_SCOPE("GpuLower::lower"); - - TORCH_INTERNAL_ASSERT(fusion_ != nullptr); + TORCH_INTERNAL_ASSERT(fusion != nullptr); TORCH_INTERNAL_ASSERT( active_gpu_lower == nullptr, "Nested lowering passes are not supported"); - // TODO(kir): revisit this struct LowerGuard { LowerGuard(GpuLower* gpu_lower) { active_gpu_lower = gpu_lower; @@ -414,17 +198,21 @@ void GpuLower::lower() { active_gpu_lower = nullptr; } } lower_guard(this); + // Copy fusion into a new kernel for processing + kernel_ = std::make_unique(fusion); + // Alias the fusion kernel caries around as a view of itself. + fusion_ = kernel_.get(); FusionGuard fg(fusion_); - - // Start with a fresh kernel - kernel_ = std::make_unique(); - // prepare for lowering validateIr(fusion_); - replaceSymbolicSizes(); + collectPaddedParallelDims(); - trivial_reduction_info_.build(fusion_, this); + + replaceSymbolicSizes(fusion_); + + trivial_reduction_info_.build(fusion_); + trivialReductionReplacement(fusion_, trivialReductionInfo()); // In the future we may directly use this map, but for now it will propagate // and validate (to some extent) the parallelization strategy. @@ -447,9 +235,12 @@ void GpuLower::lower() { parallelDimensionMap().build(fusion_); if (isDebugDumpEnabled(DebugDumpOption::ParallelDimensions)) { - std::cout << parallelDimensionMap().toString(); + std::cout << "Parallel dimension map:" << std::endl; + std::cout << parallel_dimension_map_.toString() << std::endl; } + concretized_broadcast_domains_.build(fusion_); + // Compute thread predicates. 
Depends on parallel_dimension_map_ thread_pred_map_.build(fusion_); @@ -469,61 +260,67 @@ void GpuLower::lower() { nonDivisibleSplitInfo().build(fusion_); - // Set the kernel inputs & outputs - for (auto input : fusion_->inputs()) { - kernel_->addInput(GpuLower::lowerValue(input)); - } - - for (auto output : fusion_->outputs()) { - kernel_->addOutput(GpuLower::lowerValue(output)); - } + doubleBufferInfo().build(fusion_); // Run our passes keeping the lowered expressions and forwarding // them // Reorder expressions for loop-nest generation respecting computeAt // relationships - auto sorted_exprs = reorderExprsForComputeAt(); + const auto exprs_sorted = reorderExprsForComputeAt(); // Generate loop-nests and place each expression at its // corresponding loop - const auto lowered_exprs = LoopNestGenerator::loweredExprs(sorted_exprs); + const auto exprs_lowered = LoopNestGenerator::loweredExprs(exprs_sorted); + + // Replace trivial reductions, Transpose, Shift, Gather, and View ops with + // unary ops since they're not separately processed in lowering. + const auto exprs_unary_replaced = unarySetOpInserter(exprs_lowered); // Insert allocations - const auto alloced_exprs = insertAllocations(lowered_exprs); + const auto exprs_alloced = insertAllocations(exprs_unary_replaced); // Insert read after write smem syncs - const auto raw_sync_exprs = insertRawThreadSynchronization(alloced_exprs); + const auto exprs_raw_sync = insertRawThreadSynchronization(exprs_alloced); // Reuse memory locations - const auto reuse_mem_exprs = reuseMemoryAllocations(raw_sync_exprs); - - // Inserts predicates after this, need to be careful in later passes when - // inserting in loop nest structure as insertions could be on if then else - // instead of directly on a for loop - const auto unrolled_loops = UnrollPass::runPass(fusion_, reuse_mem_exprs); - - const auto unrolled_mv_loops = - processMisalignedVectorization(fusion_, unrolled_loops); + const auto exprs_reuse_mem = reuseMemoryAllocations(exprs_raw_sync); // Insert SyncThreads at end of for-loop to avoid WAR race condition - const auto war_sync_exprs = insertWarThreadSynchronization(unrolled_mv_loops); + const auto exprs_war_sync = insertWarThreadSynchronization(exprs_reuse_mem); - const auto indexed_loops = IndexLowering::getIndexedExprs(war_sync_exprs); + const auto exprs_double_buffered = DoubleBufferPass::run(exprs_war_sync); - const auto exprs_with_fused_broadcast = fuseWarpReduce(indexed_loops); + // This pass inserts predicates as well as branches in the code. Up until now + // the code is explicitly single shot for loop based. Need to be careful in + // later passes when doing any kind of insertions in loop nest structure as + // insertions could be on if then or else instead of directly on a for loop. + const auto exprs_unrolled_loops = + UnrollPass::runPass(fusion_, exprs_double_buffered); - const auto conditional_loops = - generateConditionalFromPredicate(fusion_, exprs_with_fused_broadcast); + const auto exprs_unrolled_mv_loops = + processMisalignedVectorization(exprs_unrolled_loops); + + const auto exprs_indexed_loops = + IndexLowering::getIndexedExprs(exprs_unrolled_mv_loops); + + // TODO: It seems this type of optimization would be far easier to implement + // on fusion ir than kernel ir. We should likely refactor this to at least run + // before allocation insertion. 
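The lowering sequence above is a straight chain of passes, each taking the flat expression list produced by the previous one. A hedged sketch of that composition pattern, with toy string "expressions" and made-up pass bodies standing in for the real ones:

// Toy sketch of chaining lowering passes; not the real GpuLower code.
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using Exprs = std::vector<std::string>;
using Pass = std::function<Exprs(const Exprs&)>;

int main() {
  // Each entry stands in for one pass in the sequence above
  // (allocation insertion, sync insertion, unrolling, indexing, ...).
  std::vector<std::pair<std::string, Pass>> pipeline = {
      {"insertAllocations",
       [](const Exprs& e) { Exprs o = e; o.insert(o.begin(), "alloc T1"); return o; }},
      {"insertRawSync",
       [](const Exprs& e) { Exprs o = e; o.push_back("__syncthreads()"); return o; }},
  };

  Exprs exprs = {"T1 = T0 + 1"};
  for (const auto& [name, pass] : pipeline) {
    exprs = pass(exprs);
    std::cout << "after " << name << ": " << exprs.size() << " exprs\n";
  }
}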
+ const auto exprs_with_fused_broadcast = fuseWarpReduce(exprs_indexed_loops); + + const auto exprs_conditional_loops = + generateConditionalFromPredicate(exprs_with_fused_broadcast); // Insert fake zero updates to make sure nvrtc doesn't blow out register use // on index and predicate reuse - const auto register_adjusted = insertMagicZero(conditional_loops); + const auto exprs_register_adjusted = insertMagicZero(exprs_conditional_loops); - const auto cleaned_up_loops = KIRCleaner::cleanUp(register_adjusted); + const auto exprs_cleaned_up_loops = + KIRCleaner::cleanUp(exprs_register_adjusted); // We now have the lowered expressions, finalize the kernel IR - kernel_->finalize(cleaned_up_loops); + kernel_->finalize(exprs_cleaned_up_loops); } kir::Kernel* GpuLower::kernel() const { @@ -531,213 +328,9 @@ kir::Kernel* GpuLower::kernel() const { return kernel_.get(); } -// Maps Fusion IR nodes to the Kernel IR counterparts -class GpuLower::KernelIrMapper : private OptInConstDispatch { - public: - explicit KernelIrMapper(GpuLower* gpu_lower) - : gpu_lower_(gpu_lower), ir_builder_(gpu_lower->kernel()) {} - - kir::Val* lowerValue(const Val* value) { - const auto it = gpu_lower_->kir_val_map_.find(value); - if (it != gpu_lower_->kir_val_map_.end()) { - return it->second; - } else { - handle(value); - const auto kir_value = gpu_lower_->kir_val_map_[value]; - TORCH_CHECK(kir_value != nullptr); - - // Lower the value definition, if any - if (value->isScalar()) { - if (auto def = value->definition()) { - const auto kir_def = lowerExpr(def); - TORCH_INTERNAL_ASSERT(kir_value->definition() == kir_def); - } - } - - return kir_value; - } - } - - kir::Expr* lowerExpr(const Expr* expr) { - const auto it = gpu_lower_->kir_expr_map_.find(expr); - if (it != gpu_lower_->kir_expr_map_.end()) { - return it->second; - } else { - handle(expr); - const auto lowered_node = gpu_lower_->kir_expr_map_[expr]; - TORCH_CHECK(lowered_node != nullptr); - return lowered_node; - } - // NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDeleteLeaks) - } - - private: - void handle(const Statement* node) final { - OptInConstDispatch::handle(node); - } - - void handle(const Val* node) final { - OptInConstDispatch::handle(node); - } - - void handle(const Expr* node) final { - OptInConstDispatch::handle(node); - } - - void handle(const TensorDomain* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const IterDomain* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const TensorView* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const Bool* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const Double* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const Int* node) final { - const auto lowered_node = ir_builder_.create(node); - TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const NamedScalar* node) final { - const auto lowered_node = ir_builder_.create( - node->name(), node->getDataType().value()); - 
TORCH_CHECK(gpu_lower_->kir_val_map_.insert({node, lowered_node}).second); - } - - void handle(const UnaryOp* node) final { - const auto lowered_node = ir_builder_.create( - node->getUnaryOpType(), - lowerValue(node->out()), - lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const BinaryOp* node) final { - const auto lowered_node = ir_builder_.create( - node->getBinaryOpType(), - lowerValue(node->out()), - lowerValue(node->lhs()), - lowerValue(node->rhs())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const TernaryOp* node) final { - const auto lowered_node = ir_builder_.create( - node->getTernaryOpType(), - lowerValue(node->out()), - lowerValue(node->in1()), - lowerValue(node->in2()), - lowerValue(node->in3())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const ReductionOp* node) final { - auto out_tv = node->out()->as(); - // If trivial reduction operation lower to set operation. - if (std::all_of( - out_tv->domain()->domain().begin(), - out_tv->domain()->domain().end(), - [&](IterDomain* id) { - // If id is a reduction axis, is it a trivial reduction? - if (id->isReduction()) { - return gpu_lower_->trivialReductionInfo().isDerived(id); - } else { - return true; - } - })) { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK( - gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - return; - } - - const auto lowered_node = ir_builder_.create( - node->getReductionOpType(), - lowerValue(node->init()), - lowerValue(node->out()), - lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const WelfordOp* node) final { - auto lowerOptional = [&](Val* v) { return v ? 
lowerValue(v) : nullptr; }; - const auto lowered_node = ir_builder_.create( - lowerValue(node->outVar()), - lowerValue(node->outAvg()), - lowerValue(node->outN()), - lowerValue(node->initVar()), - lowerValue(node->initAvg()), - lowerValue(node->initN()), - lowerOptional(node->inVar()), - lowerValue(node->inAvg()), - lowerValue(node->inN())); - - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const BroadcastOp* node) final { - const auto lowered_node = ir_builder_.create( - lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const TransposeOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const ShiftOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const GatherOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - void handle(const ViewOp* node) final { - const auto lowered_node = ir_builder_.create( - UnaryOpType::Set, lowerValue(node->out()), lowerValue(node->in())); - TORCH_CHECK(gpu_lower_->kir_expr_map_.insert({node, lowered_node}).second); - } - - private: - GpuLower* gpu_lower_ = nullptr; - kir::IrBuilder ir_builder_; -}; - -kir::Val* GpuLower::lowerValue(const Val* val) { - KernelIrMapper kir_mapper(this); - return kir_mapper.lowerValue(val); -} - -kir::Expr* GpuLower::lowerExpr(const Expr* expr) { - KernelIrMapper kir_mapper(this); - return kir_mapper.lowerExpr(expr); -} - GpuLower* GpuLower::current() { + TORCH_INTERNAL_ASSERT( + active_gpu_lower != nullptr, "No active GpuLower available"); return active_gpu_lower; } diff --git a/torch/csrc/jit/codegen/cuda/lower2device.h b/torch/csrc/jit/codegen/cuda/lower2device.h index b807bb4d480..b97c6ac1837 100644 --- a/torch/csrc/jit/codegen/cuda/lower2device.h +++ b/torch/csrc/jit/codegen/cuda/lower2device.h @@ -1,14 +1,17 @@ #pragma once -#include +#include #include #include #include #include #include +#include #include #include +#include +#include #include #include #include @@ -29,29 +32,27 @@ namespace cuda { // container for this information that we can reuse. Would be nice to generate // such a structure and propagate it through lowering. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -class TORCH_CUDA_CU_API GpuLower { +class TORCH_CUDA_CU_API GpuLower : public NonCopyable { class KernelIrMapper; public: - GpuLower() = default; + GpuLower() = delete; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - explicit GpuLower(Fusion* fusion) : fusion_(fusion) { - lower(); + explicit GpuLower(Fusion* fusion) { + lower(fusion); } kir::Kernel* kernel() const; - //! Converts a Fusion IR value into the Kernel IR equivalent - kir::Val* lowerValue(const Val* val); - - //! Converts a Fusion IR expression into the Kernel IR equivalent - kir::Expr* lowerExpr(const Expr* expr); - //! Returns the currently active lowering object //! 
(or nullptr if no lowering is in progress) static GpuLower* current(); + ConcretizedBroadcastDomains& concretizedBroadcastDomains() { + return concretized_broadcast_domains_; + } + const ThreadPredicateMap& threadPredMap() const { return thread_pred_map_; } @@ -68,7 +69,7 @@ class TORCH_CUDA_CU_API GpuLower { return ca_parallel_map_; } - const auto& trivialReductionInfo() const { + const TrivialReductionInfo& trivialReductionInfo() const { return trivial_reduction_info_; } @@ -120,16 +121,12 @@ class TORCH_CUDA_CU_API GpuLower { return non_divisible_split_info_; } - private: - void lower(); + DoubleBufferInfo& doubleBufferInfo() { + return double_buffer_info_; + } - // TensorViews are all based on symbolic sizes. When we first initialize them - // we don't know if they're inputs or outputs which would mean that they have - // runtime shapes. Intermediate tensors (those not going to global memory) do - // not have this information. Since we need to have the correct information in - // the kernel being fetched for shapes, we want to replace input and output - // tensors to reference the runtime structure containing sizes. - void replaceSymbolicSizes(); + private: + void lower(Fusion* fusion); // Goes through the parallelized iterdomains of the used TVs and find // the parallel dimensions that need to be padded to a multiples of @@ -140,11 +137,8 @@ class TORCH_CUDA_CU_API GpuLower { // Lowered Kernel IR std::unique_ptr kernel_; - // Fusion IR node to Kernel IR node mapping - std::unordered_map kir_val_map_; - std::unordered_map kir_expr_map_; - // Some stateful information during lowering + ConcretizedBroadcastDomains concretized_broadcast_domains_; ThreadPredicateMap thread_pred_map_; PredicateElimination pred_elimination_; ComputeAtMap ca_loop_map_; @@ -157,6 +151,7 @@ class TORCH_CUDA_CU_API GpuLower { ParallelDimensionMap parallel_dimension_map_; PartialSplitMap partial_split_map_; NonDivisibleSplitInfo non_divisible_split_info_; + DoubleBufferInfo double_buffer_info_; Fusion* fusion_ = nullptr; }; diff --git a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp b/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp index 80e2e58c9cf..17a2db069d8 100644 --- a/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_alias_memory.cpp @@ -1,10 +1,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -22,40 +22,42 @@ namespace { //! Get string representation of Allocate size for symbolic comparison //! //! 
TODO: Some expr simplifications could also be helpful -class SymbolicSizePrinter : private kir::IrVisitor { +class SymbolicSizePrinter : private OptOutConstDispatch { public: static std::string printSize(const kir::Allocate* allocate) { SymbolicSizePrinter printer; - allocate->size()->accept(&printer); + printer.handle(allocate->size()); return printer.os_.str(); } private: - void visit(const kir::Int* node) final { + using OptOutConstDispatch::handle; + + void handle(const Int* node) final { if (auto def = node->definition()) { - def->accept(this); + OptOutConstDispatch::handle(def); } else if (node->isConst()) { os_ << *node->value(); } else { - os_ << "ki" << node->id(); + os_ << "ki" << node->name(); } } - void visit(const kir::NamedScalar* named_scalar) final { + void handle(const NamedScalar* named_scalar) final { os_ << "@" << named_scalar->name(); } - void visit(const kir::UnaryOp* unary_op) final { - os_ << unary_op->operation() << "("; - unary_op->in()->accept(this); + void handle(const UnaryOp* unary_op) final { + os_ << unary_op->getUnaryOpType() << "("; + OptOutConstDispatch::handle(unary_op); os_ << ")"; } - void visit(const kir::BinaryOp* binary_op) final { - os_ << binary_op->operation() << "("; - binary_op->lhs()->accept(this); + void handle(const BinaryOp* binary_op) final { + os_ << binary_op->getBinaryOpType() << "("; + OptOutConstDispatch::handle(binary_op->lhs()); os_ << ","; - binary_op->rhs()->accept(this); + OptOutConstDispatch::handle(binary_op->rhs()); os_ << ")"; } @@ -74,11 +76,11 @@ class BufferReuseDebugPrinter { DebugLineType line_type = DebugLineType::EXPR; }; - using DebugEntry = std::pair; + using DebugEntry = std::pair; using DebugEntryPtr = std::unique_ptr; public: - BufferReuseDebugPrinter() : ir_printer_(os_, false){}; + BufferReuseDebugPrinter() : ir_printer_(os_){}; std::string dumpDebugInfo() { os_.clear(); @@ -105,7 +107,7 @@ class BufferReuseDebugPrinter { private: friend class BufferUseDefInfo; - void pushBack(int lineno, kir::Expr* expr) { + void pushBack(int lineno, Expr* expr) { makeExprEntry(lineno, expr); } @@ -117,7 +119,7 @@ class BufferReuseDebugPrinter { makeScopeEntry(DebugLineType::END_BLOCK); } - void makeExprEntry(int lineno, kir::Expr* expr) { + void makeExprEntry(int lineno, Expr* expr) { auto debug_entry_ptr = std::make_unique(); debug_entry_ptr->first.lineno = lineno; debug_entry_ptr->second = expr; @@ -134,14 +136,14 @@ class BufferReuseDebugPrinter { debug_info_.emplace_back(std::move(debug_entry_ptr)); } - void handle(const kir::Expr* node) { + void handle(const Expr* node) { if (auto for_loop = dynamic_cast(node)) { handle(for_loop); } else if (auto ite = dynamic_cast(node)) { handle(ite); } else { indent(); - ir_printer_.printNode(node); + ir_printer_.handle(node); } if (auto alloc = dynamic_cast(node)) { printAllocInfo(alloc); @@ -151,9 +153,9 @@ class BufferReuseDebugPrinter { void handle(const kir::ForLoop* node) { indent(); os_ << "FOR "; - ir_printer_.printNode(node->index()); + ir_printer_.handle(node->index()); os_ << " in "; - ir_printer_.printNode(node->iter_domain()); + ir_printer_.handle(node->iter_domain()); os_ << ":\n"; } @@ -186,7 +188,7 @@ class BufferReuseDebugPrinter { private: std::stringstream os_; - kir::IrPrinter ir_printer_; + IrPrinter ir_printer_; int indent_level_ = 0; std::vector debug_info_; @@ -340,7 +342,7 @@ class BufferUseDefInfo { static constexpr long kRegisterSizeThreshold = 1; BufferUseDefInfo( - const std::vector& exprs, + const std::vector& exprs, BufferReuseDebugPrinter* debug_printer 
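A hedged illustration of the symbolic-size comparison used here: an allocation's size expression is rendered to a canonical string (named scalars, constants, operator names), and two allocations are considered same-sized when the strings match. The types below are toys, not the printer in this file:

// Toy sketch of canonical size-string printing for symbolic comparison.
#include <iostream>
#include <memory>
#include <string>

struct ToySize {
  std::string op;    // "" for leaves, e.g. "mul" for interior nodes
  std::string leaf;  // constant or named scalar, e.g. "T0.size[0]"
  std::shared_ptr<ToySize> lhs, rhs;
};

std::string printSize(const ToySize& s) {
  if (s.op.empty()) {
    return s.leaf;
  }
  return s.op + "(" + printSize(*s.lhs) + "," + printSize(*s.rhs) + ")";
}

int main() {
  auto extent = std::make_shared<ToySize>(ToySize{"", "T0.size[0]", nullptr, nullptr});
  auto four = std::make_shared<ToySize>(ToySize{"", "4", nullptr, nullptr});
  ToySize size{"mul", "", extent, four};
  // Two allocations whose sizes print identically can be treated as same-sized.
  std::cout << printSize(size) << std::endl;  // mul(T0.size[0],4)
}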
= nullptr) : debug_printer_(debug_printer) { if (debug_printer) { @@ -410,7 +412,7 @@ class BufferUseDefInfo { } private: - void handle(kir::Expr* expr) { + void handle(Expr* expr) { current_pos_++; if (debug_printer_) { debug_printer_->pushBack(current_pos_, expr); @@ -426,7 +428,7 @@ class BufferUseDefInfo { } } - void handleScope(const std::vector& exprs) { + void handleScope(const std::vector& exprs) { if (debug_printer_) { debug_printer_->pushScope(); } @@ -460,15 +462,15 @@ class BufferUseDefInfo { return; } - auto kir_tv = dynamic_cast(alloc->buffer()); - if (!kir_tv) { + auto tv = dynamic_cast(alloc->buffer()); + if (!tv) { return; } // Collect the allocate info data // Collect memory type, skip global buffers - auto mem_type = kir_tv->memoryType(); + auto mem_type = tv->getMemoryType(); if (mem_type != MemoryType::Local && mem_type != MemoryType::Shared) { return; } @@ -487,12 +489,12 @@ class BufferUseDefInfo { } } - auto data_type = kir_tv->dtype(); + auto data_type = tv->dtype(); auto size_print = SymbolicSizePrinter::printSize(alloc); // Make sure we don't have conflicting information on record TORCH_INTERNAL_ASSERT(!map_allocate_to_info_.count(alloc)); - TORCH_INTERNAL_ASSERT(!map_tv_to_allocations_.count(kir_tv->name())); + TORCH_INTERNAL_ASSERT(!map_tv_to_allocations_.count(tv->name())); // make AllocationUseDefInfo: auto alloc_info = makeUseDefInfo(); @@ -505,10 +507,10 @@ class BufferUseDefInfo { // record short cuts map_allocate_to_info_[alloc] = alloc_info; - map_tv_to_allocations_[kir_tv->name()] = alloc_info; + map_tv_to_allocations_[tv->name()] = alloc_info; } - void collectScopeUseDefInfo(const std::vector& exprs) { + void collectScopeUseDefInfo(const std::vector& exprs) { // Reset position pointer resetExprCounter(); TORCH_INTERNAL_ASSERT(global_scope_info_ != nullptr); @@ -516,14 +518,14 @@ class BufferUseDefInfo { handleScope(exprs); } - void collectScopeInfo(const std::vector& exprs) { + void collectScopeInfo(const std::vector& exprs) { // Reset position pointer resetExprCounter(); collectScopeInfoWithinLoop(exprs, nullptr); } void collectScopeInfoWithinLoop( - const std::vector& exprs, + const std::vector& exprs, kir::ForLoop* current_loop) { auto loop_info = makeScopeInfo(current_loop); for (auto expr : exprs) { @@ -584,22 +586,20 @@ class BufferUseDefInfo { // Iterate over the inputs and outputs of exprs and update // the liveness info of local buffers if applicaable. - void collectLivenessInfo(const kir::Expr* expr) { - if (!ir_utils::isTVOp(expr)) { + void collectLivenessInfo(const Expr* expr) { + if (!ir_utils::isTvOp(expr)) { return; } - auto out_tv = expr->outputs()[0]->as(); - auto fuser_out_tv = out_tv->fuserTv(); + auto out_tv = expr->outputs()[0]->as(); // Collect all tv's that resolves broadcast in this // expr. The current analysis isn't enough to capture // their liveness range. 
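The liveness bookkeeping in this analysis reduces to recording, per allocation, the first write and the last read position in the linearized expression order; two buffers may share storage only when those intervals are disjoint. A small stand-alone sketch of that test (toy interval type, not BufferUseDefInfo):

// Toy sketch of the live-interval test behind buffer reuse; illustrative only.
#include <algorithm>
#include <cassert>
#include <climits>

struct LiveInterval {
  int first_write = INT_MAX;
  int last_read = INT_MIN;
  void markWrite(int pos) { first_write = std::min(first_write, pos); }
  void markRead(int pos) { last_read = std::max(last_read, pos); }
};

// Buffers can share storage if one's lifetime ends before the other's begins.
bool canAlias(const LiveInterval& a, const LiveInterval& b) {
  return a.last_read < b.first_write || b.last_read < a.first_write;
}

int main() {
  LiveInterval t2, t5;
  t2.markWrite(3);  t2.markRead(7);    // T2 live over [3, 7]
  t5.markWrite(10); t5.markRead(12);   // T5 live over [10, 12]
  assert(canAlias(t2, t5));
}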
- for (auto input_tv : - ir_utils::filterByType(expr->inputs())) { + for (auto input_tv : ir_utils::filterByType(expr->inputs())) { auto maybe_alloc_info = getMaybeAllocInfoFromTV(input_tv); if (maybe_alloc_info.has_value()) { - if (isSerialBroadcastResolution(input_tv->fuserTv(), fuser_out_tv)) { + if (isSerialBroadcastResolution(input_tv, out_tv)) { maybe_alloc_info.value()->inner_live_interval->markRead(current_pos_); } else { // Disable inner alias info for this buffer, since line number based @@ -621,8 +621,7 @@ class BufferUseDefInfo { } } } - for (auto output_tv : - ir_utils::filterByType(expr->outputs())) { + for (auto output_tv : ir_utils::filterByType(expr->outputs())) { auto maybe_alloc_info = getMaybeAllocInfoFromTV(output_tv); if (maybe_alloc_info.has_value()) { maybe_alloc_info.value()->inner_live_interval->markWrite(current_pos_); @@ -675,8 +674,7 @@ class BufferUseDefInfo { return nullptr; } - c10::optional getMaybeAllocInfoFromTV( - kir::TensorView* tv) { + c10::optional getMaybeAllocInfoFromTV(TensorView* tv) { auto alloc_it = map_tv_to_allocations_.find(tv->name()); if (alloc_it == map_tv_to_allocations_.end()) { return c10::nullopt; @@ -810,11 +808,11 @@ void BufferReuseDebugPrinter::printAllocInfo(const kir::Allocate* alloc) { //! Reuse Allocation nodes via pointer aliasing class AllocateReuseModifier { public: - static void modify(const std::vector& exprs) { + static void modify(const std::vector& exprs) { AllocateReuseModifier modifier(exprs); } - static void debugPrint(const std::vector& exprs) { + static void debugPrint(const std::vector& exprs) { BufferReuseDebugPrinter debug_printer; AllocateReuseModifier modifier(exprs, &debug_printer); std::cout << debug_printer.dumpDebugInfo(); @@ -822,7 +820,7 @@ class AllocateReuseModifier { private: AllocateReuseModifier( - const std::vector& exprs, + const std::vector& exprs, BufferReuseDebugPrinter* debug_printer_ = nullptr) : buffer_info_(exprs, debug_printer_) { // Perform in-place sharing first and then outer liveness @@ -941,7 +939,7 @@ class AllocateReuseModifier { return false; } - void handle(kir::Expr* expr) { + void handle(Expr* expr) { if (auto ite = dynamic_cast(expr)) { handle(ite); } else if (auto for_loop = dynamic_cast(expr)) { @@ -961,7 +959,7 @@ class AllocateReuseModifier { "lower_alias_memory: IfThenElse before unrolling is not yet supported"); } - void handleScope(const std::vector& exprs) { + void handleScope(const std::vector& exprs) { current_visible_buffer_stack_.emplace_back( std::make_unique()); for (auto expr : exprs) { @@ -990,10 +988,8 @@ class AllocateReuseModifier { } // Assume inputs are TV allocations, which should have been checked // before reaching this point. - auto this_tv = - alloc_info->alloc_expr->buffer()->as()->fuserTv(); - auto reuse_tv = - to_reuse->alloc_expr->buffer()->as()->fuserTv(); + auto this_tv = alloc_info->alloc_expr->buffer()->as(); + auto reuse_tv = to_reuse->alloc_expr->buffer()->as(); // Check the values in between the two buffers. auto vals_between_this_and_reuse = @@ -1068,8 +1064,8 @@ class AllocateReuseModifier { } bool allocationDomainsIndexMapped( - std::vector& alloc_domains, - std::vector& reuse_domains) { + std::vector& alloc_domains, + std::vector& reuse_domains) { // Require that the allocated domains are exactly mapped. if (alloc_domains.size() != reuse_domains.size()) { return false; @@ -1099,7 +1095,7 @@ class AllocateReuseModifier { // Do we have a true pointwise op? // (ie. 
a TV op, excluding direct assignments and reductions) bool isPointwiseTvOp(const Expr* expr) { - if (ir_utils::isTVOp(expr)) { + if (ir_utils::isTvOp(expr)) { return expr->isA() || expr->isA() || expr->isA(); } @@ -1108,7 +1104,7 @@ class AllocateReuseModifier { // Utility to capture reduction ops bool isReductionTvOp(const Expr* expr) { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { return false; } return expr->isA() || expr->isA(); @@ -1116,7 +1112,7 @@ class AllocateReuseModifier { // Utility to capture reduction ops bool isBroadcastTvOp(const Expr* expr) { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { return false; } return expr->isA(); @@ -1138,8 +1134,7 @@ class AllocateReuseModifier { } // namespace -std::vector reuseMemoryAllocations( - const std::vector& exprs) { +std::vector reuseMemoryAllocations(const std::vector& exprs) { FUSER_PERF_SCOPE("reuseMemoryAllocations"); bool debug_print = isDebugDumpEnabled(DebugDumpOption::BufferReuseInfo); if (debug_print) { diff --git a/torch/csrc/jit/codegen/cuda/lower_alias_memory.h b/torch/csrc/jit/codegen/cuda/lower_alias_memory.h index 26b33b6d5dc..0d144b9f2f4 100644 --- a/torch/csrc/jit/codegen/cuda/lower_alias_memory.h +++ b/torch/csrc/jit/codegen/cuda/lower_alias_memory.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -28,8 +28,7 @@ namespace cuda { //! is not used after this op: //! then alias output Allocate to input Allocate. //! -std::vector reuseMemoryAllocations( - const std::vector& exprs); +std::vector reuseMemoryAllocations(const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp b/torch/csrc/jit/codegen/cuda/lower_allocation.cpp index 2f70c275832..c03848ccff8 100644 --- a/torch/csrc/jit/codegen/cuda/lower_allocation.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_allocation.cpp @@ -1,10 +1,8 @@ -#include #include #include #include #include -#include -#include +#include #include #include @@ -17,8 +15,12 @@ namespace cuda { namespace { -class AllocationInserter : public kir::MutableIrVisitor { +class AllocationInserter : public kir::ExprMutator { private: + using kir::ExprMutator::handle; + + // Expanded version of BasicAllocInfo in lower_utils.h helps to track + // additional information struct AllocationInformation { // The for loop that the initialization of this allocation must be // placed in, nullptr if not within a loop @@ -26,7 +28,7 @@ class AllocationInserter : public kir::MutableIrVisitor { // The expression that the initialization of this allocation must // be placed before - kir::Expr* init_place_before = nullptr; + Expr* init_place_before = nullptr; // Keep track of the actual allocation loop. This can be different // from init_for_loop only with unswitched shared memory allocations, @@ -37,143 +39,96 @@ class AllocationInserter : public kir::MutableIrVisitor { // The expression that this allocation must be placed // before. Similar to alloc_for_loop, this is different from // init_place_before only with unswitched shared memory allocations. 
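For intuition about the allocation sizing that follows: axes left of the allocation position are covered by the surrounding loops, and reduction, broadcast and (for local buffers) thread-parallelized axes contribute no storage. A rough toy sketch of that computation, with made-up axis descriptors:

// Toy sketch of per-tensor allocation sizing; not the AllocationInserter code.
#include <cstddef>
#include <iostream>
#include <vector>

struct ToyAxis {
  long extent = 1;
  bool reduction = false;
  bool broadcast = false;
  bool thread_parallel = false;  // bound to threadIdx/blockIdx
};

// Product of extents of axes at or beyond alloc_pos that actually need storage.
long localAllocationSize(const std::vector<ToyAxis>& axes, std::size_t alloc_pos) {
  long size = 1;
  for (std::size_t i = alloc_pos; i < axes.size(); ++i) {
    const auto& a = axes[i];
    if (a.reduction || a.broadcast || a.thread_parallel) {
      continue;
    }
    size *= a.extent;
  }
  return size;
}

int main() {
  // [outer 128 | threadIdx.x 32 | inner 4]; allocate at position 1.
  std::vector<ToyAxis> axes = {{128}, {32, false, false, true}, {4}};
  std::cout << localAllocationSize(axes, 1) << std::endl;  // prints 4
}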
- kir::Expr* alloc_place_before = nullptr; + Expr* alloc_place_before = nullptr; // The allocation position relative to buffer size_t alloc_pos = 0; // The buffer this allocation is for - kir::TensorView* buffer = nullptr; - - // The allocation expression - kir::Allocate* alloc_expr = nullptr; - - // Initialization - kir::Expr* init_expr = nullptr; + TensorView* buffer = nullptr; // Info to transfer to GPU lower bool has_halo = false; // Local Iterdomains that this allocation covers - std::unique_ptr> allocation_domains; + std::unique_ptr> allocation_domains; }; // Find allocation point - void findAllocationPosition(AllocationInformation& info, kir::Expr* expr) { + // Fills info.buffer, info.alloc_pos, info.init_for_loop, + // info.init_place_before, info.alloc_for_loop, info.alloc_place_before + void fillAllocationInformation(AllocationInformation& info, Expr* expr) { size_t alloc_pos = 0; kir::ForLoop* init_for_loop = nullptr; - auto fuser_tv = info.buffer->fuserTv(); size_t fl_idx_next = 0; + auto loop_alloc_info = + loop_utils::getAllocInformation(info.buffer, for_loops_); - bool outer_alloc_found = false; - kir::ForLoop* alloc_for_loop = nullptr; - size_t alloc_fl_idx_next = 0; + info.init_for_loop = loop_alloc_info.init_for_loop; + info.alloc_for_loop = loop_alloc_info.alloc_for_loop; + info.alloc_pos = loop_alloc_info.alloc_pos; - for (auto fl : for_loops) { - if (alloc_pos == fuser_tv->getComputeAtPosition()) { - break; + auto next_fl = [](kir::ForLoop* fl, const std::vector fls) { + for (auto i : c10::irange(fls.size())) { + if (fl == fls[i]) { + if (i + 1 < fls.size()) { + return fls[i + 1]; + } + } } - - if (fuser_tv->axis(alloc_pos)->isReduction()) { - const auto outputs = - FusionGuard::getCurFusion()->getTerminatingOutputs(); - TORCH_INTERNAL_ASSERT( - std::find(outputs.begin(), outputs.end(), fuser_tv) != - outputs.end(), - "Invalid computeAt of T", - fuser_tv->name(), - ". A reducation axis is detected within computeAt axes even though it is not an output tensor."); - break; - } - - auto fl_id = fl->iter_domain(); - - if (fl_id->parallelType() == ParallelType::Unroll) { - break; - } - - // Shared memory must be allocated outside of unswitched - // domains. See issue #1133. - if (fl_id->parallelType() == ParallelType::Unswitch && - fuser_tv->getMemoryType() == MemoryType::Shared) { - outer_alloc_found = true; - } - - auto local_id = gpu_lower->lowerValue(fuser_tv->axis(alloc_pos)) - ->as(); - - if (gpu_lower->caLoopMap().areMapped(local_id, fl_id)) { - alloc_pos++; - } - - init_for_loop = fl; - ++fl_idx_next; - - if (!outer_alloc_found) { - alloc_for_loop = fl; - ++alloc_fl_idx_next; - } - } - - info.alloc_pos = alloc_pos; - info.init_for_loop = init_for_loop; + TORCH_INTERNAL_ASSERT(false, "Could not find desired loop."); + }; if (info.init_for_loop == nullptr) { - info.init_place_before = for_loops.size() > 0 ? for_loops[0] : expr; + info.init_place_before = for_loops_.size() > 0 ? 
for_loops_[0] : expr; } else { - if (info.init_for_loop == for_loops.back()) { + if (info.init_for_loop == for_loops_.back()) { // Inline allocation, place before expr info.init_place_before = expr; } else { // Place allocation after the last computeAt axis // TODO: may be more efficient to place before the first non-computeAt // axis - info.init_place_before = for_loops.at(fl_idx_next); + info.init_place_before = next_fl(info.init_for_loop, for_loops_); } } // Set the allocation loop and the place_before expression in the // same way as the initialization loop and place_before expression - if (!outer_alloc_found) { + if (info.alloc_for_loop == info.init_for_loop) { info.alloc_for_loop = info.init_for_loop; info.alloc_place_before = info.init_place_before; } else { - info.alloc_for_loop = alloc_for_loop; if (info.alloc_for_loop == nullptr) { - info.alloc_place_before = for_loops.size() > 0 ? for_loops[0] : expr; + info.alloc_place_before = for_loops_.size() > 0 ? for_loops_[0] : expr; } else { // Since there must be an inner unswitched domain, // alloc_for_loop should never be the inner-most loop. - TORCH_INTERNAL_ASSERT(info.alloc_for_loop != for_loops.back()); - info.alloc_place_before = for_loops.at(alloc_fl_idx_next); + TORCH_INTERNAL_ASSERT(info.alloc_for_loop != for_loops_.back()); + info.alloc_place_before = next_fl(info.alloc_for_loop, for_loops_); } } } // Create initialization expression if init_val is non-null. - void createInitExpr(AllocationInformation& info, kir::Val* init_val) { + Expr* createInitExpr(AllocationInformation& info, Val* init_val) { if (init_val == nullptr) { - info.init_expr = nullptr; - return; + return nullptr; } - auto fuser_tv = info.buffer->fuserTv(); - - std::vector init_dims; - for (const auto axis_i : c10::irange(info.alloc_pos, fuser_tv->nDims())) { - if (info.buffer->fuserTv()->axis(axis_i)->isReduction() || - info.buffer->fuserTv()->axis(axis_i)->isBroadcast()) { + std::vector init_dims; + for (const auto axis_i : + c10::irange(info.alloc_pos, info.buffer->nDims())) { + if (info.buffer->axis(axis_i)->isReduction() || + info.buffer->axis(axis_i)->isBroadcast()) { continue; } - auto concrete_id = - gpu_lower - ->lowerValue(gpu_lower->caParallelMap().getConcreteMappedID( - fuser_tv->axis(axis_i))) - ->as(); + auto concrete_id = gpu_lower->caParallelMap().getConcreteMappedID( + info.buffer->axis(axis_i)); init_dims.push_back(concrete_id); } - kir::Expr* init_expr = ir_builder.create( - UnaryOpType::Set, info.buffer, init_val); + Expr* init_expr = + IrBuilder::create(UnaryOpType::Set, info.buffer, init_val); for (auto init_loop_it = init_dims.rbegin(); init_loop_it != init_dims.rend(); ++init_loop_it) { @@ -181,9 +136,9 @@ class AllocationInserter : public kir::MutableIrVisitor { kir::ForLoop* new_loop = nullptr; auto extent_with_halo = gpu_lower->haloInfo().getExtent(id); if (extent_with_halo) { - new_loop = ir_builder.create( + new_loop = IrBuilder::create( id, - ir_builder.create(c10::nullopt), + IrBuilder::create(c10::nullopt), nullptr, extent_with_halo, nullptr, @@ -191,31 +146,33 @@ class AllocationInserter : public kir::MutableIrVisitor { nullptr, false); } else { - new_loop = ir_builder.create(id); + new_loop = IrBuilder::create(id); } new_loop->body().push_back(init_expr); init_expr = new_loop; } - info.init_expr = init_expr; + return init_expr; } - std::vector getGlobalAllocationSizes(AllocationInformation& info) { + std::vector getGlobalAllocationSizes(AllocationInformation& info) { const auto& domain = info.buffer->domain(); - const auto& 
maybe_rfactor_domain = - domain->hasRFactor() ? domain->rfactorDomain() : domain->rootDomain(); + const auto& maybe_rfactor_domain = domain->hasRFactor() + ? domain->getRFactorDomain() + : domain->getRootDomain(); - std::vector alloc_dims; + std::vector alloc_dims; for (const auto id : maybe_rfactor_domain) { if (id->isReduction() || id->isStride() || - id->iterType() == IterType::BroadcastWithoutStride) { + id->getIterType() == IterType::BroadcastWithoutStride) { continue; } auto extent = id->extent(); // Use halo-extended extent if found auto halo_extent = gpu_lower->haloInfo().getRootAxisInfo(id); if (halo_extent.hasHalo()) { - extent = ir_builder.addExpr(extent, halo_extent.width()); + extent = IrBuilder::addExpr( + extent, IrBuilder::create(halo_extent.width())); } alloc_dims.push_back(extent); } @@ -244,7 +201,7 @@ class AllocationInserter : public kir::MutableIrVisitor { // fall back to the leaf-based allocation. // // See the FusionShiftDoubleSplit test for an example case. - std::vector getNonGlobalAllocExprWithHalo( + std::vector getNonGlobalAllocExprWithHalo( TensorView* tv, const std::vector& alloc_domains) { std::vector start_vals; @@ -255,18 +212,18 @@ class AllocationInserter : public kir::MutableIrVisitor { [](IterDomain* dom) { return dom->as(); }); // Get all exprs involved in generating the allocation IDs - auto exprs = ExprSort::getExprs(tv->fusion(), start_vals); + auto exprs = StmtSort::getExprs(tv->fusion(), start_vals); // Get the halo extent if found auto getExtent = [this](IterDomain* id) { auto extent = gpu_lower->haloInfo().getExtent(id); if (extent == nullptr) { - extent = gpu_lower->lowerValue(id->extent()); + extent = id->extent(); } return extent; }; - std::unordered_map known_extents; + std::unordered_map known_extents; // IterDomains that are allocated fully. 
For example, if an ID is // split and only one of them is used for allocation, that's not @@ -314,7 +271,7 @@ class AllocationInserter : public kir::MutableIrVisitor { } else { known_extents.insert( {split->in(), - ir_builder.mulExpr(outer_it->second, inner_it->second)}); + IrBuilder::mulExpr(outer_it->second, inner_it->second)}); } known_extents.erase(inner_it); known_extents.erase(outer_it); @@ -330,7 +287,7 @@ class AllocationInserter : public kir::MutableIrVisitor { } } - std::vector alloc_dims; + std::vector alloc_dims; for (auto root_axis : tv->getRootDomain()) { auto it = known_extents.find(root_axis); @@ -355,24 +312,22 @@ class AllocationInserter : public kir::MutableIrVisitor { return alloc_dims; } - std::vector getNonGlobalAllocExpr(AllocationInformation& info) { - auto fuser_tv = info.buffer->fuserTv(); - const auto memory_type = info.buffer->memoryType(); + std::vector getNonGlobalAllocExpr(AllocationInformation& info) { + const auto memory_type = info.buffer->getMemoryType(); TORCH_INTERNAL_ASSERT( memory_type != MemoryType::Global, "Invalid memory type: ", memory_type); - std::vector alloc_dims; + std::vector alloc_dims; bool has_halo = false; std::vector alloc_domains; - info.allocation_domains = std::make_unique>(); + info.allocation_domains = std::make_unique>(); - for (const auto axis_i : c10::irange(fuser_tv->nDims())) { - const auto local_id = - gpu_lower->lowerValue(fuser_tv->axis(axis_i))->as(); + for (const auto axis_i : c10::irange(info.buffer->nDims())) { + const auto local_id = info.buffer->axis(axis_i); // Don't use reduction/stride/broadcast axis in the allocation // computation @@ -381,16 +336,14 @@ class AllocationInserter : public kir::MutableIrVisitor { continue; } - auto concrete_id = - gpu_lower - ->lowerValue(gpu_lower->caParallelMap().getConcreteMappedID( - fuser_tv->axis(axis_i))) - ->as(); + auto concrete_id = gpu_lower->caParallelMap().getConcreteMappedID( + info.buffer->axis(axis_i)); const bool is_block_dim = - isParallelTypeBlockDim(concrete_id->parallelType()); + isParallelTypeBlockDim(concrete_id->getParallelType()); const bool is_thread_dim = - isParallelTypeThreadDim(concrete_id->parallelType()); - const bool is_thread = isParallelTypeThread(concrete_id->parallelType()); + isParallelTypeThreadDim(concrete_id->getParallelType()); + const bool is_thread = + isParallelTypeThread(concrete_id->getParallelType()); if (axis_i < info.alloc_pos) { // Even when the axis is outside the allocation position, if the @@ -403,7 +356,7 @@ class AllocationInserter : public kir::MutableIrVisitor { (memory_type == MemoryType::Global && is_thread))) { continue; } - alloc_domains.push_back(fuser_tv->axis(axis_i)); + alloc_domains.push_back(info.buffer->axis(axis_i)); } else { if ( // If shared memory, don't use any IDs bound to a grid dimension @@ -413,12 +366,13 @@ class AllocationInserter : public kir::MutableIrVisitor { (memory_type == MemoryType::Local && is_thread)) { continue; } - alloc_domains.push_back(fuser_tv->axis(axis_i)); + alloc_domains.push_back(info.buffer->axis(axis_i)); } auto extent = concrete_id->extent(); - if (gpu_lower->haloInfo().getExtent(fuser_tv->axis(axis_i)) != nullptr) { + if (gpu_lower->haloInfo().getExtent(info.buffer->axis(axis_i)) != + nullptr) { has_halo = true; } @@ -430,20 +384,19 @@ class AllocationInserter : public kir::MutableIrVisitor { // the halo extents from leaf IDs to root IDs if (has_halo) { info.has_halo = true; - return getNonGlobalAllocExprWithHalo(fuser_tv, alloc_domains); + return 
getNonGlobalAllocExprWithHalo(info.buffer, alloc_domains); } return alloc_dims; } - void createAllocExpr(AllocationInformation& info, bool is_output) { + kir::Allocate* createAllocExpr(AllocationInformation& info, bool is_output) { if (is_output) { - info.alloc_expr = nullptr; - return; + return nullptr; } - std::vector alloc_dims; - const MemoryType memory_type = info.buffer->memoryType(); + std::vector alloc_dims; + const MemoryType memory_type = info.buffer->getMemoryType(); if (memory_type == MemoryType::Global) { alloc_dims = getGlobalAllocationSizes(info); @@ -453,60 +406,74 @@ class AllocationInserter : public kir::MutableIrVisitor { if (alloc_dims.size() == 0 && info.buffer->domain()->noReductions().size() != 0) { - alloc_dims.push_back(ir_builder.create(1)); + alloc_dims.push_back(info.buffer->container()->oneVal()); + } + + // Double the allocation size if double-buffered. Record the + // original size for indexing. + if (info.buffer->isDoubleBuffered()) { + Val* original_alloc_size = nullptr; + for (auto alloc_dim : alloc_dims) { + if (original_alloc_size == nullptr) { + original_alloc_size = alloc_dim; + } else { + original_alloc_size = + IrBuilder::mulExpr(original_alloc_size, alloc_dim); + } + } + GpuLower::current()->doubleBufferInfo().setOriginalAllocSize( + info.buffer, original_alloc_size); + alloc_dims.push_back(IrBuilder::create(2)); } // Create the allocation node - info.alloc_expr = ir_builder.create( - info.buffer, info.buffer->memoryType(), alloc_dims); + return IrBuilder::create( + info.buffer, info.buffer->getMemoryType(), alloc_dims); } - void handle(kir::Expr* expr) { - if (!ir_utils::isTVOp(expr) || expr->isA()) { - expr->accept(this); + void handle(Expr* expr) override { + if (!ir_utils::isTvOp(expr) || expr->isA()) { + ExprMutator::handle(expr); return; } // // Found where the allocation needs to be inserted for (auto out : expr->outputs()) { - if (!out->isA()) { + if (!out->isA()) { continue; } - auto out_tv = out->as(); - auto default_val = - gpu_lower->predicateElimination().getInitValue(out_tv->fuserTv()); + auto out_tv = out->as(); + auto default_val = gpu_lower->predicateElimination().getInitValue(out_tv); - kir::Val* init = nullptr; - if (expr->isA() && out_tv->fuserTv()->hasReduction()) { + Val* init = nullptr; + if (expr->isA() && out_tv->hasReduction()) { TORCH_INTERNAL_ASSERT( default_val == nullptr, "Reduction should not have a default initialization value for predicate elimination."); - init = expr->as()->init(); - } else if (expr->isA()) { + init = expr->as()->init(); + } else if (expr->isA()) { TORCH_INTERNAL_ASSERT( default_val == nullptr, "Welford should not have a default initialization value for predicate elimination."); - const auto welford = expr->as(); - if (out->id() == welford->outVar()->id()) { - init = welford->initVar() == nullptr - ? ir_builder.create(0) - : welford->initVar(); - } else if (out->id() == welford->outAvg()->id()) { - init = welford->initAvg() == nullptr - ? ir_builder.create(0) - : welford->initAvg(); + const auto welford = expr->as(); + if (out->name() == welford->outVar()->name()) { + init = welford->initVar() == nullptr ? IrBuilder::create(0) + : welford->initVar(); + } else if (out->name() == welford->outAvg()->name()) { + init = welford->initAvg() == nullptr ? 
IrBuilder::create(0) + : welford->initAvg(); } else { TORCH_INTERNAL_ASSERT( - out->id() == welford->outN()->id(), "Unreachable"); + out->name() == welford->outN()->name(), "Unreachable"); init = welford->initN(); } } else if (default_val != nullptr) { init = default_val; } - const bool is_output = gpu_lower->kernel()->isOutput(out); + const bool is_output = out->isFusionOutput(); // Don't need to alloc outputs, and if we don't need to initialize we're // done. @@ -516,150 +483,91 @@ class AllocationInserter : public kir::MutableIrVisitor { AllocationInformation allocation; allocation.buffer = out_tv; - findAllocationPosition(allocation, expr); - createAllocExpr(allocation, is_output); - createInitExpr(allocation, init); + fillAllocationInformation(allocation, expr); + + auto alloc_expr = createAllocExpr(allocation, is_output); + auto init_expr = createInitExpr(allocation, init); // Write information to GPULower - writeInfoToGPULower(allocation); + writeInfoToGPULower(allocation, alloc_expr); - allocs.push_back(std::move(allocation)); + // Register allocations before initializations to keep them in the right + // order + if (alloc_expr != nullptr) { + if (allocation.buffer->getMemoryType() == MemoryType::Shared) { + // Shared allocations go at the begining of scope + TORCH_INTERNAL_ASSERT(!exprs_.empty()); + registerInsertBefore(exprs_[0], alloc_expr, nullptr); + } else { + TORCH_INTERNAL_ASSERT(allocation.alloc_place_before != nullptr); + kir::Scope* scope = allocation.alloc_for_loop == nullptr + ? nullptr + : &allocation.alloc_for_loop->body(); + registerInsertBefore( + allocation.alloc_place_before, alloc_expr, scope); + } + } + + if (init_expr != nullptr) { + TORCH_INTERNAL_ASSERT(allocation.init_place_before != nullptr); + kir::Scope* scope = allocation.init_for_loop == nullptr + ? nullptr + : &allocation.init_for_loop->body(); + registerInsertBefore(allocation.init_place_before, init_expr, scope); + } } } - void writeInfoToGPULower(const AllocationInformation& allocation) { + // Sends alloc_expr, info.has_halo, info.allocation_domains to GpuLower + void writeInfoToGPULower( + const AllocationInformation& allocation, + kir::Allocate* alloc_expr) { auto& lower_alloc_info_map = GpuLower::current()->localAllocationInfoMap(); - if (allocation.alloc_expr == nullptr) { + if (alloc_expr == nullptr) { // Skip output allocation. 
return; } TORCH_INTERNAL_ASSERT( - !lower_alloc_info_map.count(allocation.alloc_expr), + !lower_alloc_info_map.count(alloc_expr), "duplicated allocation info entry"); // Create info entry for GPULower auto lower_alloc_info_ptr = std::make_unique(); - lower_alloc_info_ptr->alloc_expr = allocation.alloc_expr; + lower_alloc_info_ptr->alloc_expr = alloc_expr; lower_alloc_info_ptr->has_halo = allocation.has_halo; if (allocation.allocation_domains) { lower_alloc_info_ptr->alloc_domains = *(allocation.allocation_domains); } // Write entry to the stored map - lower_alloc_info_map[allocation.alloc_expr] = - std::move(lower_alloc_info_ptr); + lower_alloc_info_map[alloc_expr] = std::move(lower_alloc_info_ptr); } - void visit(kir::ForLoop* fl) final { - for_loops.push_back(fl); - // Modifying in place, make a copy of the vector - const std::vector exprs = fl->body().exprs(); - for (auto expr : exprs) { - handle(expr); - } - for_loops.pop_back(); - } - - void visit(kir::IfThenElse*) final { + void handle(kir::IfThenElse*) final { TORCH_INTERNAL_ASSERT( false, "Pass does not support conditional statements, ", "this pass should be run before any conditionals are placed in code."); } - AllocationInserter(std::vector _loop_nests) - : loop_nests_(std::move(_loop_nests)), - gpu_lower(GpuLower::current()), - ir_builder(gpu_lower->kernel()) { - // Compute all allocations - const std::vector exprs = loop_nests_; - for (auto expr : exprs) { - handle(expr); - } - - // First, place allocations of dynamic smem tensors at the very - // beginning of the expr list. Traverse backward as they should be - // placed in topological order. - for (auto it = allocs.rbegin(); it != allocs.rend(); ++it) { - const auto& alloc = *it; - if (alloc.alloc_expr == nullptr) { - continue; - } - // Dynamic smem exprs need to be at the begining of the kernel outside for - // loops - if (alloc.buffer->memoryType() == MemoryType::Shared && - !kir::ExpressionEvaluator::isConst(alloc.alloc_expr->size())) { - loop_nests_.insert(loop_nests_.begin(), alloc.alloc_expr); - } - } - - // Place the remaining allocations. - for (const auto& alloc : allocs) { - if (alloc.alloc_expr == nullptr) { - continue; - } - if (alloc.buffer->memoryType() == MemoryType::Shared && - !kir::ExpressionEvaluator::isConst(alloc.alloc_expr->size())) { - continue; - } - if (alloc.alloc_for_loop == nullptr) { - auto place_before_it = std::find( - loop_nests_.begin(), loop_nests_.end(), alloc.alloc_place_before); - TORCH_INTERNAL_ASSERT( - place_before_it != loop_nests_.end(), - "Could not figure out where to place allocation. 
", - "Use of the buffer, ", - toString(alloc.buffer), - ", could not be found.", - toString(alloc.alloc_place_before)); - loop_nests_.insert(place_before_it, alloc.alloc_expr); - } else { - alloc.alloc_for_loop->body().insert_before( - alloc.alloc_place_before, alloc.alloc_expr); - } - } - - // Now that allocations are in place, place the initializations - for (const auto& alloc : allocs) { - if (alloc.init_expr == nullptr) { - continue; - } - if (alloc.init_for_loop == nullptr) { - auto place_before_it = std::find( - loop_nests_.begin(), loop_nests_.end(), alloc.init_place_before); - // Don't need a check here as if the allocation placement succeeded - // this will too - loop_nests_.insert(place_before_it, alloc.init_expr); - } else { - alloc.init_for_loop->body().insert_before( - alloc.init_place_before, alloc.init_expr); - } - } + AllocationInserter(const std::vector& exprs) + : gpu_lower(GpuLower::current()) { + kir::ExprMutator::traverseAndInsert(exprs); } private: - std::deque allocs; - - std::vector for_loops; - - std::vector loop_nests_; - GpuLower* gpu_lower; - kir::IrBuilder ir_builder; - public: - static std::vector insert( - const std::vector& loop_nests) { - AllocationInserter inserter(loop_nests); - return inserter.loop_nests_; + static std::vector insert(const std::vector& exprs) { + AllocationInserter inserter(exprs); + return inserter.exprs_; } }; } // namespace -std::vector insertAllocations( - const std::vector& exprs) { +std::vector insertAllocations(const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertAllocations"); return AllocationInserter::insert(exprs); } diff --git a/torch/csrc/jit/codegen/cuda/lower_allocation.h b/torch/csrc/jit/codegen/cuda/lower_allocation.h index bc0344ca19f..45ebeac03f7 100644 --- a/torch/csrc/jit/codegen/cuda/lower_allocation.h +++ b/torch/csrc/jit/codegen/cuda/lower_allocation.h @@ -1,8 +1,7 @@ #pragma once -#include +#include -#include #include #include @@ -17,7 +16,7 @@ namespace cuda { //! logic duplication struct LocalAllocationInfo { kir::Allocate* alloc_expr = nullptr; - std::vector alloc_domains; + std::vector alloc_domains; bool has_halo = false; }; @@ -25,7 +24,7 @@ using LocalAllocationInfoMap = std::unordered_map>; //! Insert buffer allocations -std::vector insertAllocations(const std::vector& exprs); +std::vector insertAllocations(const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp b/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp new file mode 100644 index 00000000000..c8110413de7 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_double_buffer.cpp @@ -0,0 +1,508 @@ +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +unsigned int getDoubleBufferAxisPosition(const TensorView* tv) { + // Double-buffering prefetches the next subregion of the tensor by + // doubling the allocation. The subregion is defined by the axes + // at the CA position till the inner-most position. There must be + // at least one axis that is outside (left) of the CA position, + // which defines the loop where prefetching is applied. Therefore, + // the CA position must be larger than 0. 
+ + TORCH_INTERNAL_ASSERT(tv->getComputeAtPosition() > 0); + + // Unroll must not exist outside of double-buffer axis + auto first_unroll_it = std::find_if( + tv->domain()->domain().begin(), + tv->domain()->domain().end(), + [](const auto axis) { + return axis->getParallelType() == ParallelType::Unroll; + }); + + const int first_unroll_pos = + std::distance(tv->domain()->domain().begin(), first_unroll_it); + + const int unroll_or_ca_pos = + std::min((int)tv->getComputeAtPosition(), first_unroll_pos); + + TORCH_INTERNAL_ASSERT( + unroll_or_ca_pos > 0, + "Invalid tensor to double-buffer. Valid double buffer axis not found due to Unroll. ", + tv->toString()); + + int valid_pos = -1; + // Skip parallelized or broadcast axes + for (int i = unroll_or_ca_pos - 1; i >= 0; --i) { + auto pt = tv->axis(i)->getParallelType(); + if (!isParallelTypeThread(pt) && !tv->axis(i)->isBroadcast()) { + valid_pos = i; + break; + } + } + + TORCH_INTERNAL_ASSERT( + valid_pos >= 0, + "Invalid tensor to double-buffer. Valid double buffer axis not found. ", + tv->toString()); + + return valid_pos; +} + +IterDomain* getDoubleBufferAxis(const TensorView* tv) { + return tv->axis((int)getDoubleBufferAxisPosition(tv)); +} + +void validateDoubleBufferedTensor(const TensorView* tv) { + auto double_buffer_pos = getDoubleBufferAxisPosition(tv); + + // Like vectorization, only UnaryOp::Set with another TensorView is + // considered. + auto def = tv->definition(); + TORCH_INTERNAL_ASSERT( + def->isA() && + def->as()->getUnaryOpType() == UnaryOpType::Set, + "Invalid tensor to double-buffer. Only tensor defined by UnaryOp::Set is supported: ", + def->toString()); + + TORCH_INTERNAL_ASSERT( + def->as()->in()->isA(), + "Invalid tensor to double-buffer. Only tensor defined by UnaryOp::Set with TensorView is supported: ", + def->toString()); + + // Require the producer tensor to have been computed entirely for + // the double-buffering loop. Otherwise, the producer itself would + // also need to be double-bufferred. + auto producer = def->as()->in()->as(); + TORCH_INTERNAL_ASSERT( + producer->getComputeAtPosition() <= double_buffer_pos, + "Invalid tensor to double-buffer. The computeAt position of the producer tensor must be moved left: ", + producer->toString()); + + // Not strictly necessary, but only gmem -> smem or local and smem -> local + // are allowed. + const auto p_mem_type = producer->getMemoryType(); + const auto c_mem_type = tv->getMemoryType(); + TORCH_INTERNAL_ASSERT( + (p_mem_type == MemoryType::Global && + (c_mem_type == MemoryType::Shared || c_mem_type == MemoryType::Local)) || + (p_mem_type == MemoryType::Shared && c_mem_type == MemoryType::Local), + "Invalid tensor to double-buffer: ", + tv->toString(), + ". Producer memory type: ", + p_mem_type, + ". 
Consumer memory type: ", + c_mem_type); + + return; +} + +namespace { + +// Initial inspection of a fusion to find and validate double buffered tensors +class DoubleBufferFusionInspector : private IterVisitor { + public: + DoubleBufferFusionInspector(Fusion* fusion, DoubleBufferInfo& db_info) + : db_info_(db_info) { + traverse(fusion); + } + + private: + using IterVisitor::handle; + + void handle(TensorView* tv) final { + if (!tv->isDoubleBuffered()) { + return; + } + + validateDoubleBufferedTensor(tv); + + auto db_axis = getDoubleBufferAxis(tv); + + db_info_.setDoubleBufferAxis(tv, db_axis); + } + + private: + DoubleBufferInfo& db_info_; +}; + +// The type of replicated double-buffer loops +enum class LoopType { Prologue, Main, Epilogue }; + +// The epilogue loop is only created when the producer of a double +// buffer tensor is on smem, in which case it would otherwise require +// an additional predicate to guard buffer overruns. When it's on +// gmem, that isn't the case, so it does not need to create an +// epilogue loop. +bool requireEpilogue(const std::vector& exprs) { + return std::any_of(exprs.begin(), exprs.end(), [](const UnaryOp* uop) { + return uop->in()->as()->getMemoryType() == MemoryType::Shared; + }); +} + +// Replicates double buffer loops for Prologue, Main, and +// Epilogue. Prologue only copies the load expressions of double +// buffered tensors, whereas Epilogue does any expression other than +// the loads. Main copies everything. +class DoubleBufferLoopCloner : public kir::IrVisitor { + public: + static kir::ForLoop* clone( + kir::ForLoop* double_buffer_loop, + const std::vector& double_buffer_load_exprs, + LoopType loop_type) { + DoubleBufferLoopCloner cloner( + double_buffer_loop, double_buffer_load_exprs, loop_type); + cloner.clone(); + return cloner.cloned_top_level_loop_; + } + + private: + DoubleBufferLoopCloner( + kir::ForLoop* double_buffer_loop, + const std::vector& double_buffer_load_exprs, + LoopType loop_type) + : double_buffer_loop_(double_buffer_loop), + double_buffer_load_exprs_(double_buffer_load_exprs), + loop_type_(loop_type) {} + + using kir::IrVisitor::handle; + + void clone() { + const auto gpu_lower = GpuLower::current(); + + // Cloning the double buffer loop as follows: + // + // Prologue: 0 to 1 + // Main: 0 to (extent-1) + // Epilogue: (extent-1) to extent + + auto index = IrBuilder::create(c10::nullopt); + auto start = double_buffer_loop_->start(); + auto stop = double_buffer_loop_->stop(); + + if (loop_type_ == LoopType::Prologue) { + TORCH_INTERNAL_ASSERT(start->isZeroInt()); + stop = gpu_lower->kernel()->oneVal(); + } else if ( + loop_type_ == LoopType::Main && + requireEpilogue(double_buffer_load_exprs_)) { + stop = IrBuilder::subExpr( + double_buffer_loop_->stop(), gpu_lower->kernel()->oneVal()); + } else if (loop_type_ == LoopType::Epilogue) { + TORCH_INTERNAL_ASSERT(requireEpilogue(double_buffer_load_exprs_)); + start = IrBuilder::subExpr( + double_buffer_loop_->stop(), gpu_lower->kernel()->oneVal()); + } + + cloned_top_level_loop_ = IrBuilder::create( + double_buffer_loop_->iter_domain(), + index, + start, + stop, + gpu_lower->kernel()->oneVal(), + false, + nullptr, + double_buffer_loop_->isUnrollRequired()); + + handle(double_buffer_loop_); + } + + void handle(kir::ForLoop* fl) final { + const auto gpu_lower = GpuLower::current(); + + kir::ForLoop* cloned_loop = fl == double_buffer_loop_ + ? 
cloned_top_level_loop_ + : IrBuilder::create(fl); + + cloned_scopes_.push_back(&cloned_loop->body()); + + kir::IrVisitor::handle(fl); + + cloned_scopes_.pop_back(); + + // Add the cloned loop into the parent loop body only when the + // cloned loop contains expressions. + if (!cloned_loop->body().empty() && !cloned_scopes_.empty()) { + cloned_scopes_.back()->push_back(cloned_loop); + } + } + + void handle(kir::IfThenElse* ite) final { + TORCH_INTERNAL_ASSERT(false, "No IfThenElse should exist yet"); + } + + void handle(Expr* expr) final { + if (expr->isA() || expr->isA()) { + kir::IrVisitor::handle(expr); + return; + } + + TORCH_INTERNAL_ASSERT(!cloned_scopes_.empty()); + + if (loop_type_ == LoopType::Main) { + cloned_scopes_.back()->push_back(expr); + return; + } + + // In Prologue and Epilogue, either load expressions or anything + // else are copied. Note that there can be multiple exprs defining + // double buffered TVs (e.g., buffer initialization). + + auto out_tv = ir_utils::getTvOutput(expr); + const auto is_double_buffer_load_expr = std::any_of( + double_buffer_load_exprs_.begin(), + double_buffer_load_exprs_.end(), + [out_tv](const auto load_expr) { + auto double_buffer_tv = ir_utils::getTvOutput(load_expr); + TORCH_INTERNAL_ASSERT(double_buffer_tv != nullptr); + return out_tv == double_buffer_tv; + }); + if ((loop_type_ == LoopType::Prologue && is_double_buffer_load_expr) || + (loop_type_ == LoopType::Epilogue && !is_double_buffer_load_expr)) { + cloned_scopes_.back()->push_back(expr); + } + } + + private: + kir::ForLoop* double_buffer_loop_ = nullptr; + const std::vector& double_buffer_load_exprs_; + const LoopType loop_type_; + + kir::ForLoop* cloned_top_level_loop_ = nullptr; + std::deque cloned_scopes_; +}; + +using InsertionInfo = std::unordered_map>; + +// Traverse lowered loop-nests and find all double buffer loops and +// associated load expressions. 
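The replication rule implemented by the cloner above can be summarized in a few lines. The sketch below uses illustrative stand-ins rather than nvFuser types; the bounds noted in the comment mirror the ones set up in clone(). The inspector class that follows then determines which loops and load expressions this rule gets applied to.

enum class CloneType { Prologue, Main, Epilogue };

// Clone bounds, as configured in clone() above: Prologue iterates [0, 1),
// Main iterates [0, extent - 1) when an epilogue is required (otherwise the
// full [0, extent)), and Epilogue iterates [extent - 1, extent).
bool keepExpr(CloneType clone_type, bool is_double_buffer_load) {
  switch (clone_type) {
    case CloneType::Main:
      return true; // Main copies every expression
    case CloneType::Prologue:
      return is_double_buffer_load; // only the double-buffer loads
    case CloneType::Epilogue:
      return !is_double_buffer_load; // everything except the loads
  }
  return false; // unreachable; silences -Wreturn-type
}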
+class DoubleBufferLoopNestInspector : private kir::IrVisitor { + public: + static InsertionInfo run(const std::vector& exprs) { + DoubleBufferLoopNestInspector inspector(exprs); + return inspector.insertion_info_; + } + + private: + DoubleBufferLoopNestInspector(const std::vector& exprs) { + handle(exprs); + } + + using kir::IrVisitor::handle; + + void handle(UnaryOp* uop) final { + const auto gpu_lower = GpuLower::current(); + + auto out_tv = ir_utils::getTvOutput(uop); + + if (out_tv == nullptr) { + return; + } + + // Ignore init loop + if (!out_tv->isDoubleBuffered() || !uop->in()->isA()) { + return; + } + + auto double_buffer_loop = + gpu_lower->doubleBufferInfo().getDoubleBufferLoop(out_tv, for_loops_); + + TORCH_INTERNAL_ASSERT( + double_buffer_loop != nullptr, + "No double buffer loop found for a double buffered tensor: ", + out_tv->toString()); + + validateDoubleBufferLoop(double_buffer_loop); + + insertion_info_[double_buffer_loop].push_back(uop); + } + + static void validateDoubleBufferLoop(kir::ForLoop* loop) { + TORCH_INTERNAL_ASSERT( + loop->start()->isZeroInt(), "Unsupported loop: ", loop->toString()); + TORCH_INTERNAL_ASSERT( + loop->step()->isOneInt(), "Unsupported loop: ", loop->toString()); + TORCH_INTERNAL_ASSERT( + !loop->vectorize(), + "Vectorized loop should not be the allocation loop for double-buffered tensor: ", + loop->toString()); + TORCH_INTERNAL_ASSERT( + !loop->vectorize_shift(), + "Vectorize shift loop should not be the allocation loop for double-buffered tensor: ", + loop->toString()); + } + + InsertionInfo insertion_info_; +}; + +// Apply double buffering transformations +class DoubleBufferInserter : private kir::ExprMutator { + public: + // When there exist multiple double buffer loops, apply + // transformations to inner-most loops first. A single ExprMutator + // pass can only process one loop. + static std::vector run( + const std::vector& exprs, + InsertionInfo insertion_info) { + auto inserted_exprs = exprs; + while (!insertion_info.empty()) { + DoubleBufferInserter inserter(inserted_exprs, insertion_info); + inserted_exprs = inserter.exprs_; + } + return inserted_exprs; + } + + private: + DoubleBufferInserter( + const std::vector& exprs, + InsertionInfo& insertion_info) + : insertion_info_(insertion_info) { + auto num_double_buffer_loops = insertion_info.size(); + traverseAndInsert(exprs); + TORCH_INTERNAL_ASSERT(processed_loop_ != nullptr); + TORCH_INTERNAL_ASSERT(insertion_info.size() == num_double_buffer_loops - 1); + } + + using kir::ExprMutator::handle; + + void handle(kir::ForLoop* loop) final { + kir::ExprMutator::handle(loop); + + // If another loop is already taken care of, no more loop should + // be done in the same pass + if (processed_loop_ != nullptr) { + return; + } + + auto it = insertion_info_.find(loop); + if (it == insertion_info_.end()) { + return; + } + + insert(loop, it->second); + processed_loop_ = loop; + insertion_info_.erase(loop); + } + + void insert( + kir::ForLoop* double_buffer_loop, + const std::vector& loads) { + auto prologue_loop = DoubleBufferLoopCloner::clone( + double_buffer_loop, loads, LoopType::Prologue); + registerInsertBefore(double_buffer_loop, prologue_loop); + + auto write_to_smem = + std::any_of(loads.begin(), loads.end(), [](const UnaryOp* uop) { + return uop->out()->as()->getMemoryType() == + MemoryType::Shared; + }); + + // RAW sync is not inserted for double buffered tensors. The only + // exception is the prologue load. 
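Putting the pieces of insert() together, the structure registered around the original loop can be sketched as follows. The strings stand in for IR nodes and both flag names are assumptions for illustration: they correspond to whether the double-buffer loads write to shared memory and whether they read from it (the latter being what requireEpilogue() checks). The if statement just below then emits the RAW sync for that prologue case.

#include <string>
#include <vector>

// Hedged sketch, not the actual pass: order of nodes surrounding the original
// double-buffer loop after insert() has run.
std::vector<std::string> replacementSequence(bool loads_write_smem,
                                             bool loads_read_smem) {
  std::vector<std::string> seq{"prologue_loop"};
  if (loads_write_smem)
    seq.push_back("__syncthreads()"); // RAW sync for the prologue smem store
  seq.push_back("main_loop");         // replaces the original loop
  if (loads_read_smem)                // i.e. requireEpilogue() is true
    seq.push_back("epilogue_loop");
  return seq;
}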
+ if (write_to_smem) { + auto sync = IrBuilder::create(); + registerInsertBefore(double_buffer_loop, sync); + } + + auto main_loop = DoubleBufferLoopCloner::clone( + double_buffer_loop, loads, LoopType::Main); + registerReplace(double_buffer_loop, main_loop); + + if (requireEpilogue(loads)) { + auto epilogue_loop = DoubleBufferLoopCloner::clone( + double_buffer_loop, loads, LoopType::Epilogue); + registerInsertAfter(double_buffer_loop, epilogue_loop); + } + } + + private: + InsertionInfo& insertion_info_; + kir::ForLoop* processed_loop_ = nullptr; +}; + +} // namespace + +void DoubleBufferInfo::build(Fusion* fusion) { + DoubleBufferFusionInspector inspector(fusion, *this); +} + +DoubleBufferInfo::TvInfo& DoubleBufferInfo::getTvInfo(const TensorView* tv) { + TORCH_INTERNAL_ASSERT( + tv->isDoubleBuffered(), "Not a double-buffered tensor: ", tv->toString()); + return map_[tv]; +} + +void DoubleBufferInfo::setDoubleBufferAxis( + const TensorView* tv, + IterDomain* axis) { + getTvInfo(tv).double_buffer_axis = axis; +} + +IterDomain* DoubleBufferInfo::getDoubleBufferAxis(const TensorView* tv) { + if (!tv->isDoubleBuffered()) { + return nullptr; + } + + return getTvInfo(tv).double_buffer_axis; +} + +kir::ForLoop* DoubleBufferInfo::getDoubleBufferLoop( + IterDomain* axis, + const std::vector& loops, + bool ignore_prologue) { + auto loop_it = std::find_if(loops.begin(), loops.end(), [&](const auto loop) { + return GpuLower::current()->caIndexMap().areMapped( + loop->iter_domain(), axis) && + (!ignore_prologue || !loop->stop()->isOneInt()); + }); + + if (loop_it != loops.end()) { + return *loop_it; + } else { + return nullptr; + } +} + +kir::ForLoop* DoubleBufferInfo::getDoubleBufferLoop( + const TensorView* tv, + const std::vector& loops, + bool ignore_prologue) { + auto axis = getDoubleBufferAxis(tv); + + if (axis == nullptr) { + return nullptr; + } + + return getDoubleBufferLoop(axis, loops, ignore_prologue); +} + +void DoubleBufferInfo::setOriginalAllocSize( + const TensorView* tv, + Val* original_alloc_size) { + getTvInfo(tv).original_alloc_size = original_alloc_size; +} + +Val* DoubleBufferInfo::getOriginalAllocSize(const TensorView* tv) { + if (!tv->isDoubleBuffered()) { + return nullptr; + } + + return getTvInfo(tv).original_alloc_size; +} + +std::vector DoubleBufferPass::run(const std::vector& exprs) { + auto insertion_info = DoubleBufferLoopNestInspector::run(exprs); + return DoubleBufferInserter::run(exprs, insertion_info); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_double_buffer.h b/torch/csrc/jit/codegen/cuda/lower_double_buffer.h new file mode 100644 index 00000000000..96bc247f4ff --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_double_buffer.h @@ -0,0 +1,142 @@ +#pragma once + +#include + +#include +#include +#include + +// Double buffering a tensor doubles its allocation size and uses two +// buffers to facilitate computation and memory access +// overlapping. The basic form of code looks like as follows: +// +// Before: +// for i +// x[S]; // allocation +// for j: +// x[j] = y[i, j] +// for j: +// ... = x[j] +// +// After: +// X[S * 2]; // allocation +// for i in 0 to 1: // Prologue +// for j: +// x[j] = y[i, j] +// +// for i in 0 to N-1: // Main +// for j: +// x[j + (1 - i % 2) * S] = y[i + 1, j] +// for j: +// ... = x[j + (i % 2) * S] +// +// for i in N-1 to N: // Epilogue +// for j: +// ... = x[j + (i % 2) * S] +// +// Here, S is the original size of tensor x. 
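For a concrete check of the indexing above, here is a small host-side simulation (plain C++, not generated code) with assumed values S = 4 and N = 8: the prologue fills the first half of the doubled buffer, the main loop writes iteration i + 1 into the half that is not being read, and the consumer reads back exactly what the single-buffer loop would have read.

#include <array>
#include <cassert>
#include <vector>

int main() {
  constexpr int S = 4; // original allocation size of x
  constexpr int N = 8; // extent of the double-buffer loop i
  std::vector<std::array<int, S>> y(N);
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < S; ++j)
      y[i][j] = i * 100 + j;

  std::array<int, 2 * S> x{}; // doubled allocation
  std::vector<int> consumed;

  for (int j = 0; j < S; ++j) // Prologue: load iteration 0
    x[j] = y[0][j];

  for (int i = 0; i < N - 1; ++i) { // Main loop
    for (int j = 0; j < S; ++j)
      x[j + (1 - i % 2) * S] = y[i + 1][j]; // fill the half not being read
    for (int j = 0; j < S; ++j)
      consumed.push_back(x[j + (i % 2) * S]); // read the current half
  }

  for (int i = N - 1; i < N; ++i) // Epilogue: last read, no further loads
    for (int j = 0; j < S; ++j)
      consumed.push_back(x[j + (i % 2) * S]);

  std::vector<int> reference; // what the original single-buffer loop consumes
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < S; ++j)
      reference.push_back(y[i][j]);
  assert(consumed == reference);
  return 0;
}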
+// +// The i loop is the double buffer loop of tensor x, where double +// buffering is applied to the tensor. The first step of lowering is +// to find the double buffering axis for each double buffered +// tensor. It must not be parallelized as it isn't possible to double +// buffer parallelized loops. Also, an unrolled axis expands the +// allocation and is intended to make the loop completely unrolled, +// which also conflicts with double buffering. So, basically, the double +// buffering axis is the inner-most axis within the axes left +// of the CA position. However, when it is parallelized or unrolled, a +// further left axis is picked. +// +// Once the double buffer axis is determined, the main task is to +// replicate the corresponding double buffer loop as illustrated +// above. The Prologue loop is to just fetch the first element to +// populate the buffer. The main loop is mostly the same as the +// original loop, except for the indexing change to switch the two +// buffers. When used as a consumer, an offset of (1 - i % 2) * S is +// added, whereas (i % 2) * S is added when used as a producer. Here, +// i is the index of the double buffer loop. The Epilogue loop is just +// for the last iteration of the loop. Since the main loop reads one +// element ahead of the producer of the double buffered tensor, it +// would require an additional guard to prevent buffer overruns with +// the producer if the main loop were also used for the last +// iteration. However, the value loaded by the invalid load would not +// be used, so instead of adding the additional predicate, the Epilogue +// loop is replicated from the original loop, except for the load +// expression since it's not used. Note that this overrun does not +// happen when the producer is on gmem, so in that case, this +// additional replication is not done. +// +// When creating those three types of loops, additional care must be +// taken when multiple tensors are double buffered. When multiple +// tensors use the same loop as their double buffer loop, one pass of +// replication takes care of them at once, meaning the same Prologue, +// Main, Epilogue loops are used for the multiple tensors. +// +// Other tasks to do for a double buffer tensor include: +// - Move allocation to outside of the double buffer loop +// - Double the allocation size +// - Omit the RAW sync in the Main and Epilogue loops + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +unsigned int getDoubleBufferAxisPosition(const TensorView* tv); + +IterDomain* getDoubleBufferAxis(const TensorView* tv); + +void validateDoubleBufferedTensor(const TensorView* tv); + +class TORCH_CUDA_CU_API DoubleBufferPass { + public: + //! Apply double buffering transformations + static std::vector run(const std::vector& exprs); +}; + +class TORCH_CUDA_CU_API DoubleBufferInfo { + // Lowering information of double buffered tensors. + struct TvInfo { + IterDomain* double_buffer_axis = nullptr; + Val* original_alloc_size = nullptr; + }; + + public: + void build(Fusion* fusion); + + void setDoubleBufferAxis(const TensorView* tv, IterDomain* id); + + IterDomain* getDoubleBufferAxis(const TensorView* tv); + + //! Get a loop that matches with a given double-buffer axis. If + //! ignore_prologue is true, a matched loop is ignored if it's a + //! prologue loop. + static kir::ForLoop* getDoubleBufferLoop( + IterDomain* axis, + const std::vector& loops, + bool ignore_prologue = false); + + //! Get a loop that matches with the double-buffer axis of a given + //! 
double-buffered tensor. If ignore_prologue is true, a matched + //! loop is ignored if it's a prologue loop. + kir::ForLoop* getDoubleBufferLoop( + const TensorView* tv, + const std::vector& loops, + bool ignore_prologue = false); + + void setOriginalAllocSize(const TensorView* tv, Val* size); + + Val* getOriginalAllocSize(const TensorView* tv); + + private: + TvInfo& getTvInfo(const TensorView* tv); + + private: + //! Keeps track of information for lowering double buffered tensors + std::unordered_map map_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp index 2353ea9bbf5..84c72c08185 100644 --- a/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_expr_sort.cpp @@ -541,7 +541,7 @@ ExprGroup* ExprSegmentationSorter::makeEmptyGroup() { ExprGroup* ExprSegmentationSorter::makeEmptyGroup(Expr* expr) { auto group = makeEmptyGroup(); group->exprs().push_back(expr); - if (ir_utils::isTVOp(expr)) { + if (ir_utils::isTvOp(expr)) { auto out_tv = expr->outputs()[0]->as(); // Grab all id's that are shared with other tensors. for (const auto tv_i : c10::irange(out_tv->getComputeAtPosition())) { @@ -721,7 +721,7 @@ std::vector getLocalDomainOrdering( std::unordered_set domains; for (auto expr : exprs) { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { continue; } diff --git a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp new file mode 100644 index 00000000000..fa84d1006a1 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.cpp @@ -0,0 +1,119 @@ +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +// Replace trivial reductions with unary ops. +class TrivialReductionReplacement : private OptOutMutator { + public: + TrivialReductionReplacement( + Fusion* fusion, + const TrivialReductionInfo& trivial_reduction_info) + : trivial_reduction_info_(trivial_reduction_info) { + FusionGuard fg(fusion); + auto exprs = StmtSort::getExprs(fusion); + for (auto expr : exprs) { + mutate(expr); + } + } + + private: + using OptOutMutator::mutate; + void mutate(ReductionOp* rop) final { + if (rop->out()->isA()) { + auto out_tv = rop->out()->as(); + if (std::all_of( + out_tv->domain()->domain().begin(), + out_tv->domain()->domain().end(), + [&](IterDomain* id) { + // If id is a reduction axis, is it a trivial reduction? + if (id->isReduction()) { + return trivial_reduction_info_.isDerived(id); + } else { + return true; + } + })) { + auto out = rop->out(); + auto in = rop->in(); + auto container = out->container(); + removeExpr(container, rop); + IrBuilder::create(container, UnaryOpType::Set, out, in); + } + } + } + + const TrivialReductionInfo& trivial_reduction_info_; +}; + +// Replaces Transpose, Shift, Gather, and View Ops with Unary Ops. 
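The rewrite condition used by TrivialReductionReplacement above boils down to a single predicate over the output axes; a toy restatement is given below (the Axis struct and function name are invented for illustration, not nvFuser types). The class that follows applies the analogous Op-to-Set rewrite to transpose, shift, gather, and view expressions.

#include <algorithm>
#include <vector>

struct Axis {
  bool is_reduction; // axis is a reduction domain of the output
  bool is_trivial;   // known to be a trivial (size-1) reduction
};

// A ReductionOp may be replaced with a plain Set when every reduction axis of
// its output is trivial; non-reduction axes are ignored.
bool reducesOnlyTrivialAxes(const std::vector<Axis>& axes) {
  return std::all_of(axes.begin(), axes.end(), [](const Axis& a) {
    return !a.is_reduction || a.is_trivial;
  });
}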
+class UnaryOpInserter : private kir::ExprMutator { + public: + static std::vector insert(const std::vector& exprs) { + UnaryOpInserter inserter(exprs); + return inserter.exprs_; + } + + private: + using kir::ExprMutator::handle; + + UnaryOpInserter(const std::vector& exprs) { + kir::ExprMutator::traverseAndInsert(exprs); + } + + void handle(TransposeOp* top) final { + auto out = top->out(); + auto in = top->in(); + auto container = out->container(); + registerReplace( + top, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } + + void handle(ShiftOp* sop) final { + auto out = sop->out(); + auto in = sop->in(); + auto container = out->container(); + registerReplace( + sop, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } + + void handle(GatherOp* gop) final { + auto out = gop->out(); + auto in = gop->in(); + auto container = out->container(); + registerReplace( + gop, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } + + void handle(ViewOp* vop) final { + auto out = vop->out(); + auto in = vop->in(); + auto container = out->container(); + registerReplace( + vop, IrBuilder::create(container, UnaryOpType::Set, out, in)); + } +}; + +} // namespace + +void trivialReductionReplacement( + Fusion* fusion, + const TrivialReductionInfo& trivial_reduction_info) { + TrivialReductionReplacement replacement(fusion, trivial_reduction_info); +} + +// Transpose, Shift, Gather, and View Ops with Unary Set Ops +std::vector unarySetOpInserter(const std::vector& exprs) { + return UnaryOpInserter::insert(exprs); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h new file mode 100644 index 00000000000..e18f4a8f077 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_fusion_simplifier.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +// Replaces trivial reductions with Unary Set Ops +void trivialReductionReplacement(Fusion*, const TrivialReductionInfo&); + +// Transpose, Shift, Gather, and View Ops with Unary Set Ops +std::vector unarySetOpInserter(const std::vector& exprs); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_index.cpp b/torch/csrc/jit/codegen/cuda/lower_index.cpp index d92dd279b17..b0ef14079c4 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_index.cpp @@ -1,7 +1,6 @@ #include #include #include -#include #include #include #include @@ -13,30 +12,24 @@ namespace jit { namespace fuser { namespace cuda { -IndexLowering::IndexLowering() : ir_builder_(GpuLower::current()->kernel()) {} - -kir::Val* IndexLowering::lowerSrcIndex(kir::Val* src, kir::Val* dst) const { - if (auto tv = dynamic_cast(src)) { - TORCH_INTERNAL_ASSERT(dst->isA()); - return Index::getProducerIndex( - tv->fuserTv(), - dst->as()->fuserTv(), - scope_utils::getLoops(active_scope_expr_)); +Val* IndexLowering::lowerSrcIndex(Val* src, Val* dst) const { + if (auto tv = dynamic_cast(src)) { + TORCH_INTERNAL_ASSERT(dst->isA()); + return Index::getProducerIndex(tv, dst->as(), for_loops_); } else { return src; } } -kir::Val* IndexLowering::lowerDstIndex(kir::Val* dst) const { - if (auto tv = dynamic_cast(dst)) { - return Index::getConsumerIndex( - tv->fuserTv(), 
scope_utils::getLoops(active_scope_expr_)); +Val* IndexLowering::lowerDstIndex(Val* dst) const { + if (auto tv = dynamic_cast(dst)) { + return Index::getConsumerIndex(tv, for_loops_); } else { return dst; } } -void IndexLowering::pushBack(kir::Expr* expr) { +void IndexLowering::pushBack(Expr* expr) { if (active_scope_ == nullptr) { lowered_exprs_.push_back(expr); } else { @@ -44,78 +37,71 @@ void IndexLowering::pushBack(kir::Expr* expr) { } } -void IndexLowering::visit(const kir::IfThenElse* ite) { - const auto prev_scope_expr = active_scope_expr_; +void IndexLowering::handle(const kir::IfThenElse* ite) { const auto prev_scope = active_scope_; - // TODO(kir): try to avoid recreating new nodes and leaving old ones around - auto new_ite = ir_builder_.create(ite->predicate()); + auto new_ite = IrBuilder::create(ite->predicate()); pushBack(new_ite); - active_scope_expr_ = new_ite; active_scope_ = &new_ite->thenBody(); for (auto expr : ite->thenBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } active_scope_ = &new_ite->elseBody(); for (auto expr : ite->elseBody().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } active_scope_ = prev_scope; - active_scope_expr_ = prev_scope_expr; } -void IndexLowering::visit(const kir::ForLoop* for_loop) { - const auto prev_scope_expr = active_scope_expr_; +void IndexLowering::handle(const kir::ForLoop* for_loop) { const auto prev_scope = active_scope_; - auto new_for_loop = ir_builder_.create(for_loop); + auto new_for_loop = IrBuilder::create(for_loop); pushBack(new_for_loop); - active_scope_expr_ = new_for_loop; active_scope_ = &new_for_loop->body(); + for_loops_.push_back(new_for_loop); for (auto expr : for_loop->body().exprs()) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } + for_loops_.pop_back(); active_scope_ = prev_scope; - active_scope_expr_ = prev_scope_expr; } -void IndexLowering::visit(const kir::UnaryOp* uop) { +void IndexLowering::handle(const UnaryOp* uop) { const auto in = lowerSrcIndex(uop->in(), uop->out()); const auto out = lowerDstIndex(uop->out()); - pushBack(ir_builder_.create(uop->operation(), out, in)); + pushBack(IrBuilder::create(uop->getUnaryOpType(), out, in)); } -void IndexLowering::visit(const kir::BinaryOp* bop) { +void IndexLowering::handle(const BinaryOp* bop) { const auto lhs = lowerSrcIndex(bop->lhs(), bop->out()); const auto rhs = lowerSrcIndex(bop->rhs(), bop->out()); const auto out = lowerDstIndex(bop->out()); - pushBack(ir_builder_.create(bop->operation(), out, lhs, rhs)); + pushBack(IrBuilder::create(bop->getBinaryOpType(), out, lhs, rhs)); } -void IndexLowering::visit(const kir::TernaryOp* top) { +void IndexLowering::handle(const TernaryOp* top) { const auto in1 = lowerSrcIndex(top->in1(), top->out()); const auto in2 = lowerSrcIndex(top->in2(), top->out()); const auto in3 = lowerSrcIndex(top->in3(), top->out()); const auto out = lowerDstIndex(top->out()); - pushBack( - ir_builder_.create(top->operation(), out, in1, in2, in3)); + pushBack(IrBuilder::create( + top->getTernaryOpType(), out, in1, in2, in3)); } namespace { // Get the size of the temporary work buffer for grid communication, this can be // grid reduction, broadcast, or grid welford. -kir::Val* getGridCommWorkBufferSize( - kir::IrBuilder& ir_builder, - const kir::TensorDomain* td) { +Val* getGridCommWorkBufferSize(const TensorDomain* td) { // The buffer size is the number of thread blocks multiplied by the // number of threads not used for reduction domains. 
// Note: Previously it was calculated based on the shape of the @@ -125,7 +111,7 @@ kir::Val* getGridCommWorkBufferSize( // size if the parallel dimensions are exact, but otherwise, just // computing the buffer size based on the tensor shape isn't // sufficient since there could be extra threads/blocks. - kir::Val* buffer_size = ir_builder.create(1); + Val* buffer_size = GpuLower::current()->kernel()->oneVal(); for (auto pt : kParallelTypeThreads) { auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); if (pt_dim == nullptr || pt_dim->isOneInt()) { @@ -133,33 +119,31 @@ kir::Val* getGridCommWorkBufferSize( } if (isParallelTypeThreadDim(pt) && std::any_of(td->domain().begin(), td->domain().end(), [&](auto out_id) { - return out_id->parallelType() == pt && + return out_id->getParallelType() == pt && (out_id->isReduction() || out_id->isBroadcast()); })) { continue; } - buffer_size = ir_builder.mulExpr(buffer_size, pt_dim); + buffer_size = IrBuilder::mulExpr(buffer_size, pt_dim); } return buffer_size; } -kir::Val* getGridSyncBufferSize( - kir::IrBuilder& ir_builder, - const kir::TensorDomain* td) { +Val* getGridSyncBufferSize(const TensorDomain* td) { // See the comment above for getGridCommWorkBufferSize. - kir::Val* buffer_size = ir_builder.create(1); + Val* buffer_size = GpuLower::current()->kernel()->oneVal(); for (auto pt : kParallelTypeBIDs) { auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); if (pt_dim == nullptr || pt_dim->isOneInt()) { continue; } if (std::any_of(td->domain().begin(), td->domain().end(), [&](auto out_id) { - return out_id->parallelType() == pt && + return out_id->getParallelType() == pt && (out_id->isReduction() || out_id->isBroadcast()); })) { continue; } - buffer_size = ir_builder.mulExpr(buffer_size, pt_dim); + buffer_size = IrBuilder::mulExpr(buffer_size, pt_dim); } return buffer_size; } @@ -167,26 +151,25 @@ kir::Val* getGridSyncBufferSize( // Allocate global buffer for a grid communication calls, i.e. grid reduce, grid // welford reduce, grid broadcast. 
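The sizing rule above can be restated with a toy model (the ParallelDim struct and function name are assumptions for illustration, not nvFuser types): the work buffer needs one slot per participating block and per thread that the reduction does not already fold away. The allocation helper below then materializes a global-memory buffer of whatever size that computation yields.

#include <cstdint>
#include <vector>

struct ParallelDim {
  int64_t extent;            // launch extent of this parallel type
  bool is_thread_dim;        // TIDx/y/z rather than BIDx/y/z
  bool reduced_or_broadcast; // output maps this type to a reduction/broadcast axis
};

int64_t gridCommWorkBufferSize(const std::vector<ParallelDim>& dims) {
  int64_t size = 1;
  for (const auto& d : dims) {
    if (d.extent <= 1)
      continue; // unused or trivial dimension
    if (d.is_thread_dim && d.reduced_or_broadcast)
      continue; // threads consumed by the reduction need no extra slots
    size *= d.extent;
  }
  return size;
}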
kir::Allocate* allocGlobalBufferForGridComm( - kir::IrBuilder& ir_builder, - kir::Val* buffer_size, + Val* buffer_size, DataType dtype, bool zero_init) { - const std::vector new_buffer_ids = { - ir_builder.create(ir_builder.zeroVal(), buffer_size)}; - const auto buffer_domain = - ir_builder.create(new_buffer_ids); - const auto buffer_tv = ir_builder.create( - dtype, buffer_domain, MemoryType::Global); - return ir_builder.create( - buffer_tv, buffer_tv->memoryType(), nullptr, zero_init); + const std::vector new_buffer_ids = { + IrBuilder::create( + GpuLower::current()->kernel()->zeroVal(), buffer_size)}; + const auto buffer_domain = IrBuilder::create(new_buffer_ids); + const auto buffer_tv = + IrBuilder::create(buffer_domain, dtype, MemoryType::Global); + return IrBuilder::create( + buffer_tv, buffer_tv->getMemoryType(), nullptr, zero_init); } } // namespace -void IndexLowering::visit(const kir::ReductionOp* rop) { - TORCH_INTERNAL_ASSERT(ir_utils::isTVOp(rop)); +void IndexLowering::handle(const ReductionOp* rop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(rop)); - const auto out_tv = rop->out()->as(); + const auto out_tv = rop->out()->as(); const auto out_domain = out_tv->domain(); const bool is_block_reduce = out_domain->hasBlockReduction(); @@ -199,7 +182,7 @@ void IndexLowering::visit(const kir::ReductionOp* rop) { std::none_of( out_domain->domain().begin(), out_domain->domain().end(), - [](kir::IterDomain* id) { + [](IterDomain* id) { return !id->isThread() && id->isReduction() && !id->extent()->isOneInt(); }), @@ -212,11 +195,11 @@ void IndexLowering::visit(const kir::ReductionOp* rop) { const auto out = lowerDstIndex(rop->out()); const auto in = lowerSrcIndex(rop->in(), rop->out()); - kir::ReductionOp* block_reduction_op = nullptr; + ReductionOp* block_reduction_op = nullptr; if (is_block_reduce) { - block_reduction_op = ir_builder_.create( - rop->operation(), rop->init(), out, in); + block_reduction_op = IrBuilder::create( + rop->getReductionOpType(), rop->init(), out, in); if (rop->predicate()) { block_reduction_op->setPredicate(rop->predicate()); } @@ -228,29 +211,22 @@ void IndexLowering::visit(const kir::ReductionOp* rop) { if (is_grid_reduce) { const auto reduce_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridCommWorkBufferSize(ir_builder_, out_domain), - out->dtype(), - false); + getGridCommWorkBufferSize(out_domain), out->dtype(), false); const auto sync_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridSyncBufferSize(ir_builder_, out_domain), - DataType::Int, - true); + getGridSyncBufferSize(out_domain), DataType::Int, true); const auto grid_reduction_op = (block_reduction_op == nullptr) - ? ir_builder_.create( - rop->operation(), rop->init(), out, in) + ? IrBuilder::create( + rop->getReductionOpType(), rop->init(), out, in) : block_reduction_op; // The thread predicate for GridReduction needs to be set // separately from the main predicate. Do not combine them like // other expressions. const auto& thread_pred = - GpuLower::current()->threadPredMap().getPredicatedParallelTypes( - out_tv->fuserTv()); - auto grid_reduction = ir_builder_.create( + GpuLower::current()->threadPredMap().getPredicatedParallelTypes(out_tv); + auto grid_reduction = IrBuilder::create( grid_reduction_op, reduce_buffer, sync_buffer); grid_reduction->setThreadPredicate(thread_pred); @@ -260,8 +236,8 @@ void IndexLowering::visit(const kir::ReductionOp* rop) { // predicate does not work when the write predicate of the // blockReduce is different from the read predicate. 
if (is_block_reduce) { - grid_reduction->setPredicate( - ir_builder_.create(ir_builder_.trueVal())); + grid_reduction->setPredicate(IrBuilder::create( + GpuLower::current()->kernel()->trueVal())); } else { grid_reduction->setPredicate(rop->predicate()); } @@ -277,15 +253,15 @@ void IndexLowering::visit(const kir::ReductionOp* rop) { } if (!is_block_reduce && !is_grid_reduce) { - // TODO(kir): this breaks our "SSA" form - pushBack(ir_builder_.create(rop->operation(), out, out, in)); + pushBack( + IrBuilder::create(rop->getReductionOpType(), out, out, in)); } } -void IndexLowering::visit(const kir::WelfordOp* wop) { - TORCH_INTERNAL_ASSERT(ir_utils::isTVOp(wop)); +void IndexLowering::handle(const WelfordOp* wop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(wop)); - const auto out_tv = wop->outAvg()->as(); + const auto out_tv = wop->outAvg()->as(); const auto out_domain = out_tv->domain(); const bool is_block_reduce = out_domain->hasBlockReduction(); @@ -298,7 +274,7 @@ void IndexLowering::visit(const kir::WelfordOp* wop) { std::none_of( out_domain->domain().begin(), out_domain->domain().end(), - [](kir::IterDomain* id) { + [](IterDomain* id) { return !id->isThread() && id->isReduction(); }), "Found a reduction stage that has both a non-parallelized ", @@ -322,18 +298,18 @@ void IndexLowering::visit(const kir::WelfordOp* wop) { auto out_var = lowerDstIndex(wop->outVar()); auto out_N = lowerDstIndex(wop->outN()); - kir::WelfordOp* welford_op = ir_builder_.create( - out_var, + WelfordOp* welford_op = IrBuilder::create( out_avg, + out_var, out_N, - wop->initVar(), wop->initAvg(), + wop->initVar(), wop->initN(), - in_var, in_avg, + in_var, in_N); - kir::WelfordOp* block_welford_op = nullptr; + WelfordOp* block_welford_op = nullptr; if (is_block_reduce) { block_welford_op = welford_op; @@ -348,21 +324,17 @@ void IndexLowering::visit(const kir::WelfordOp* wop) { if (is_grid_reduce) { // Buffer allocation - const auto work_buffer_size = - getGridCommWorkBufferSize(ir_builder_, out_domain); + const auto work_buffer_size = getGridCommWorkBufferSize(out_domain); - const auto out_var_buffer = allocGlobalBufferForGridComm( - ir_builder_, work_buffer_size, out_var->dtype(), false); - const auto out_avg_buffer = allocGlobalBufferForGridComm( - ir_builder_, work_buffer_size, out_avg->dtype(), false); - const auto out_N_buffer = allocGlobalBufferForGridComm( - ir_builder_, work_buffer_size, out_N->dtype(), false); + const auto out_var_buffer = + allocGlobalBufferForGridComm(work_buffer_size, out_var->dtype(), false); + const auto out_avg_buffer = + allocGlobalBufferForGridComm(work_buffer_size, out_avg->dtype(), false); + const auto out_N_buffer = + allocGlobalBufferForGridComm(work_buffer_size, out_N->dtype(), false); const auto sync_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridSyncBufferSize(ir_builder_, out_domain), - DataType::Int, - true); + getGridSyncBufferSize(out_domain), DataType::Int, true); // Grid Welford instantiation const auto grid_welford_op = @@ -372,10 +344,9 @@ void IndexLowering::visit(const kir::WelfordOp* wop) { // separately from the main predicate. Do not combine them like // other expressions. 
const auto& thread_pred = - GpuLower::current()->threadPredMap().getPredicatedParallelTypes( - out_tv->fuserTv()); + GpuLower::current()->threadPredMap().getPredicatedParallelTypes(out_tv); - auto grid_welford = ir_builder_.create( + auto grid_welford = IrBuilder::create( grid_welford_op, out_var_buffer, out_avg_buffer, @@ -400,18 +371,18 @@ void IndexLowering::visit(const kir::WelfordOp* wop) { } } -void IndexLowering::visit(const kir::BroadcastOp* bop) { - TORCH_INTERNAL_ASSERT(ir_utils::isTVOp(bop)); +void IndexLowering::handle(const BroadcastOp* bop) { + TORCH_INTERNAL_ASSERT(ir_utils::isTvOp(bop)); - const auto out_tv = bop->out()->as(); + const auto out_tv = bop->out()->as(); const auto out = lowerDstIndex(bop->out()); const auto in = lowerSrcIndex(bop->in(), bop->out()); - auto indexed_expr = ir_builder_.create(out, in); + auto indexed_expr = + IrBuilder::create(out, in, bop->getBroadcastDimFlags()); const ParallelTypeBitmap parallel_bitmap = - GpuLower::current()->threadPredMap().getParallelBroadcastDomains( - out_tv->fuserTv()); + GpuLower::current()->threadPredMap().getParallelBroadcastDomains(out_tv); const bool block_x = parallel_bitmap.get(ParallelType::BIDx); const bool block_y = parallel_bitmap.get(ParallelType::BIDy); @@ -430,18 +401,12 @@ void IndexLowering::visit(const kir::BroadcastOp* bop) { // Grid broadcast const auto out_domain = out_tv->domain(); const auto broadcast_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridCommWorkBufferSize(ir_builder_, out_domain), - out->dtype(), - false); + getGridCommWorkBufferSize(out_domain), out->dtype(), false); const auto sync_buffer = allocGlobalBufferForGridComm( - ir_builder_, - getGridSyncBufferSize(ir_builder_, out_domain), - DataType::Int, - true); + getGridSyncBufferSize(out_domain), DataType::Int, true); - auto grid_broadcast = ir_builder_.create( + auto grid_broadcast = IrBuilder::create( indexed_expr, broadcast_buffer, sync_buffer); if (bop->predicate()) { @@ -453,19 +418,19 @@ void IndexLowering::visit(const kir::BroadcastOp* bop) { pushBack(grid_broadcast); } -void IndexLowering::visit(const kir::Allocate* allocate) { +void IndexLowering::handle(const kir::Allocate* allocate) { // TODO(kir): remove the need for const_cast pushBack(const_cast(allocate)); // NOLINT } -void IndexLowering::visit(const kir::Sync* sync) { +void IndexLowering::handle(const kir::Sync* sync) { // TODO(kir): remove the need for const_cast pushBack(const_cast(sync)); // NOLINT } -void IndexLowering::generate(const std::vector& exprs) { +void IndexLowering::generate(const std::vector& exprs) { for (auto expr : exprs) { - expr->accept(this); + OptOutConstDispatch::handle(expr); } } diff --git a/torch/csrc/jit/codegen/cuda/lower_index.h b/torch/csrc/jit/codegen/cuda/lower_index.h index 5eb27c78f28..2f3af0061e1 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index.h +++ b/torch/csrc/jit/codegen/cuda/lower_index.h @@ -1,10 +1,10 @@ #pragma once -#include +#include #include #include -#include +#include #include #include @@ -14,10 +14,11 @@ namespace jit { namespace fuser { namespace cuda { -class TORCH_CUDA_CU_API IndexLowering : private kir::IrVisitor { +// TODO: Replace with mutator as IndexLowering is replacing expr's with +// versions that are doing indexing +class TORCH_CUDA_CU_API IndexLowering : private OptOutConstDispatch { public: - static std::vector getIndexedExprs( - std::vector incoming_exprs) { + static std::vector getIndexedExprs(std::vector incoming_exprs) { 
FUSER_PERF_SCOPE("GpuLower::Lower::IndexLowering::getIndexedExprs"); IndexLowering il; il.generate(incoming_exprs); @@ -25,28 +26,29 @@ class TORCH_CUDA_CU_API IndexLowering : private kir::IrVisitor { } private: - IndexLowering(); + IndexLowering() = default; - void pushBack(kir::Expr*); + void pushBack(Expr*); - void visit(const kir::ForLoop*) final; - void visit(const kir::IfThenElse*) final; - void visit(const kir::UnaryOp*) final; - void visit(const kir::BinaryOp*) final; - void visit(const kir::TernaryOp*) final; - void visit(const kir::ReductionOp*) final; - void visit(const kir::WelfordOp*) final; - void visit(const kir::BroadcastOp*) final; - void visit(const kir::Allocate*) final; - void visit(const kir::Sync*) final; + void handle(const UnaryOp*) final; + void handle(const BinaryOp*) final; + void handle(const TernaryOp*) final; + void handle(const ReductionOp*) final; + void handle(const WelfordOp*) final; + void handle(const BroadcastOp*) final; - void generate(const std::vector& exprs); + void handle(const kir::ForLoop*) final; + void handle(const kir::IfThenElse*) final; + void handle(const kir::Allocate*) final; + void handle(const kir::Sync*) final; - kir::Val* lowerSrcIndex(kir::Val* val, kir::Val* dst) const; - kir::Val* lowerDstIndex(kir::Val* dst) const; + void generate(const std::vector& exprs); + + Val* lowerSrcIndex(Val* val, Val* dst) const; + Val* lowerDstIndex(Val* dst) const; private: - std::vector lowered_exprs_; + std::vector lowered_exprs_; // This is a slight work around as scope has a couple definitions, we have the // Scope that's in ForLoop/IfThenElse which is really just a wrapper around @@ -55,9 +57,10 @@ class TORCH_CUDA_CU_API IndexLowering : private kir::IrVisitor { // could be either the body or else body of the IfThenElse. However, we want // to understand the nesting of IfThenElse/ForLoop nodes. kir::Scope* active_scope_ = nullptr; - kir::Expr* active_scope_expr_ = nullptr; - kir::IrBuilder ir_builder_; + // Track for loops to send to indexing. Similar to what's done in + // kir::IrVisitor + std::vector for_loops_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp index 0947ef0f579..77be88183ec 100644 --- a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.cpp @@ -1,8 +1,8 @@ #include #include +#include #include -#include -#include +#include #include #include @@ -33,8 +33,8 @@ class SmemAllocMap { public: //! Insert a new node if it's a SMEM allocation void insert(kir::Allocate* alloc) { - if (auto tv = dynamic_cast(alloc->buffer())) { - if (tv->memoryType() == MemoryType::Shared) { + if (auto tv = dynamic_cast(alloc->buffer())) { + if (tv->getMemoryType() == MemoryType::Shared) { // Note that a TensorView can have two allocations due to // unswitch. auto p = map_.insert({tv, alloc}); @@ -50,290 +50,298 @@ class SmemAllocMap { } } - //! Get the buffer that is actually allocated for a given TV - kir::TensorView* getRealBuffer(kir::TensorView* tv) const { + //! Run through aliases to get the buffer that is actually allocated for a + //! 
given TV + TensorView* getRealBuffer(TensorView* tv) const { auto it = map_.find(tv); TORCH_INTERNAL_ASSERT( - it != map_.end(), "Allocation not found for ", kir::toString(tv)); + it != map_.end(), "Allocation not found for ", tv->toString()); const kir::Allocate* alloc = it->second; while (alloc->alias()) { alloc = alloc->alias(); } auto buf = alloc->buffer(); - TORCH_INTERNAL_ASSERT(buf->isA()); - return buf->as(); + TORCH_INTERNAL_ASSERT(buf->isA()); + return buf->as(); } private: - std::unordered_map map_; + std::unordered_map map_; }; -//! Insert WAR sync for a given ForLoop -class LocalSyncInserterForLoop { - using TvSet = std::unordered_set; +struct WarMemoryInfo { + // True if there's a sync after the last read within the alloc loop. + bool sync_after_read = false; + // True if there's a sync before the first write. There can be multiple writes + // from memory aliasing. + bool sync_before_write = false; + + // Has there been a read of this memory location + bool read_hit = false; + + // Has there been *the* write to this memory location, assumes single write + // instruction (needs to be before conditionals added to code) + bool write_hit = false; + + // For loop this TV is compute_at'ed in. + kir::ForLoop* ca_loop = nullptr; +}; + +// To prevent shared memory from being over written before it is read, a +// synchronization point has to be inserted either between the allocation of an +// SMEM buffer and where we write into it, or after the buffer's last read +// before exiting the allocation's scope. +// +// e.g. +// for i: +// "alloc A" in shared memory - This is really marked by the compute_at point +// sync_loc_0 +// for j: +// sync_loc_1 +// for k: +// sync_loc_2 +// A = ... +// for k: +// ... = ... A +// for j: +// for k: +// ... = ... A +// sync_loc_3 +// sync_loc_4 +// sync_loc_5 +// +// All sync locations here provide valid protection that memory in A is finished +// being read before it is over written in the next iteration +// +// Insertion of sync threads will be done from the inner most position to the +// outer most. If a sync protecting the buffer is not already placed, the +// location prefered for the sync threads is the last possible position. One +// future optimization could be to not sync on the last iteration of the loop +// the sync is placed in. +class WarSyncInserter : private kir::ExprMutator { public: + static std::vector insert(const std::vector& exprs) { + WarSyncInserter inserter(exprs); + return inserter.exprs_; + } + + private: //! Insert Sync nodes at the end of a given for-loop when a WAR //! hazard may happen. 
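The hazard described above is easiest to see in a hand-written CUDA kernel; the example below is illustrative only (it assumes a 256-thread block and is not code produced by this pass). Each iteration writes a shared-memory tile and then reads it from a different lane, so without the trailing __syncthreads() a fast thread could start overwriting the tile for iteration i + 1 while a slower thread is still reading the tile for iteration i.

__global__ void warExample(const float* in, float* out, int n_tiles) {
  __shared__ float smem[256];
  for (int i = 0; i < n_tiles; ++i) {
    smem[threadIdx.x] = in[i * 256 + threadIdx.x]; // write into the tile
    __syncthreads(); // RAW sync: all writes land before any read starts
    out[i * 256 + threadIdx.x] = smem[255 - threadIdx.x]; // read another lane
    __syncthreads(); // WAR sync: all reads done before the next overwrite
  }
}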
- LocalSyncInserterForLoop(kir::ForLoop* fl, SmemAllocMap& alloc_map) - : alloc_map_(alloc_map) { - for (auto expr : fl->body().exprs()) { - handle(expr); - } - - // No need to insert sync when the loop is not actually generated - if (fl->iter_domain()->isThread() || fl->iter_domain()->isBroadcast()) { - return; - } - - // Determine if any smem TV is written to at beginning of the for-loop - // and whether that smem TV is read from at the end of the for-loop - // Insert new SyncThreads at end of for-loop to prevent WAR race condition - // - // TODO: replace __syncthreads with __threadfence for alias ops - // - if (detectIntersection(initial_, final_) && - !fl->body().exprs().back()->isA() && !is_last_op_sync_) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - fl->body().push_back(ir_builder.create(true)); - initial_sync_ = true; - is_last_op_sync_ = true; - final_.clear(); + WarSyncInserter(const std::vector& exprs) { + auto& lower_alloc_info_map = GpuLower::current()->localAllocationInfoMap(); + for (const auto& entry : lower_alloc_info_map) { + alloc_map_.insert(entry.first); } + kir::ExprMutator::traverseAndInsert(exprs); } - const auto& initial() const { - return initial_; + void handle(kir::IfThenElse* ite) final { + TORCH_INTERNAL_ASSERT( + ite->elseBody().empty(), + "Pass does not support conditional flow,", + " needs to be done before conditional execution is lowered."); + kir::ExprMutator::handle(ite); } - const auto& final() const { - return final_; - } - - const auto& all_smem_inputs() const { - return all_smem_inputs_; - } - - const auto& all_smem_outputs() const { - return all_smem_outputs_; - } - - void handle(kir::Expr* expr) { - if (ir_utils::isTVOp(expr)) { - is_last_op_sync_ = false; - - // For this SyncInserter - if (initial_sync_) { - addInputSmemTvs(expr, final_); - } else { - addInputSmemTvs(expr, final_); - addOutputSmemTvs(expr, initial_); + void handle(kir::Sync* sync) final { + // Register the sync for the active for loop + sync_hit_.back() = true; + // Run through the active allocations, if a read was hit, register there was + // a sync after the read. If there's subsequent reads on this buffer the + // sync_after_read will be cleared. 
+ for (auto& entry : smem_allocations_) { + auto& alloc_stack = entry.second; + if (alloc_stack.back().read_hit) { + alloc_stack.back().sync_after_read = true; } - - // For parent SyncInserter - addOutputSmemTvs(expr, all_smem_outputs_); - addInputSmemTvs(expr, all_smem_inputs_); - } else if (auto sync = dynamic_cast(expr)) { - handle(sync); - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - } else if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } else if (auto alloc = dynamic_cast(expr)) { - alloc_map_.insert(alloc); } } - void handle(kir::Sync* sync) { - is_last_op_sync_ = true; - initial_sync_ = true; - final_.clear(); - } - - void handle(kir::IfThenElse* ite) { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); - } - } - - void handle(kir::ForLoop* fl) { - LocalSyncInserterForLoop child_sync_inserter(fl, alloc_map_); - - const auto& child_inputs = child_sync_inserter.all_smem_inputs(); - const auto& child_outputs = child_sync_inserter.all_smem_outputs(); - const bool maybe_skipped = !fl->start()->isZeroInt() && - !isParallelTypeThread(fl->iter_domain()->parallelType()); - - // Default - Track all smem inputs / outputs - all_smem_inputs_.insert(child_inputs.begin(), child_inputs.end()); - all_smem_outputs_.insert(child_outputs.begin(), child_outputs.end()); - - // Propagate the last_op_sync flag from the child loop. If the - // child is deterministically executed at least once, just set the - // flag with the child flag. Otherwise, conservatively set the - // flag, i.e., if the current flag is true and the child flag is - // also true, we can say the last op is still sync. - if (!maybe_skipped) { - is_last_op_sync_ = child_sync_inserter.is_last_op_sync_; - } else { - is_last_op_sync_ = - is_last_op_sync_ && child_sync_inserter.is_last_op_sync_; + // Checks if fl or loops within it have hit a sync + bool syncWithin(kir::ForLoop* fl) { + // If outer most scope check the first sync_hit_ position + if (fl == nullptr) { + return sync_hit_[0]; } - // When the child is not guaranteed to have sync. - if (!child_sync_inserter.initial_sync_) { - // If no sync is yet found, add the child outputs to - // initial. - if (!initial_sync_) { - initial_.insert(child_outputs.begin(), child_outputs.end()); - } - // Add the child inputs to final even when inital_sync is false, - // which only means sync may not be found yet. - final_.insert(child_inputs.begin(), child_inputs.end()); - } else { - // Similar to the above case, but here, the child is guaranteed - // to have sync, so we only need to look at initial and final. 
- if (!initial_sync_) { - initial_.insert( - child_sync_inserter.initial().begin(), - child_sync_inserter.initial().end()); - } - if (!maybe_skipped) { - initial_sync_ = true; - final_.clear(); - } - final_.insert( - child_sync_inserter.final().begin(), - child_sync_inserter.final().end()); - } - } + // Find the for loop we want to look within + auto fl_it = std::find(for_loops_.begin(), for_loops_.end(), fl); - static bool detectIntersection(const TvSet& left, const TvSet& right) { - for (auto item : left) { - if (right.find(item) != right.end()) { + // Convert it to an index, but add one for the outer most scope + auto fl_i = std::distance(for_loops_.begin(), fl_it) + 1; + + // Start at that index and see if there's syncs within that for loop + for (auto i : c10::irange(fl_i, sync_hit_.size())) { + if (sync_hit_[i]) { return true; } } return false; } - void addOutputSmemTvs(const kir::Expr* expr, TvSet& set) { - for (auto out : expr->outputs()) { - if (auto tv = dynamic_cast(out)) { - if (tv->memoryType() == MemoryType::Shared) { - auto real_tv = alloc_map_.getRealBuffer(tv); - set.insert(real_tv); + void handle(Expr* expr) final { + // If not a tensor view expression continue with dispatch + if (!ir_utils::isTvOp(expr)) { + kir::ExprMutator::handle(expr); + return; + } + + // Mark write has been hit for all output tvs + auto out_tvs = ir_utils::filterByType(expr->outputs()); + for (auto out_tv : out_tvs) { + if (out_tv->getMemoryType() != MemoryType::Shared) { + continue; + } + auto& entry = getMemInfo(out_tv); + + // If this is the first write and there's a sync in one of the loops after + // the compute at loop, then this buffer is protected. + if (syncWithin(entry.ca_loop) && !entry.write_hit) { + entry.sync_before_write = true; + } + entry.write_hit = true; + } + + // Mark read was hit, if sync_after_read was set, clear it. + auto inp_tvs = ir_utils::filterByType(expr->inputs()); + for (auto inp_tv : inp_tvs) { + if (inp_tv->getMemoryType() != MemoryType::Shared) { + continue; + } + auto& entry = getMemInfo(inp_tv); + entry.read_hit = true; + // Clear the sync_after_read if it was set because there was another write + entry.sync_after_read = false; + } + } + + void handle(kir::ForLoop* for_loop) final { + // Push loop scope information + auto prev_within_iter_loop_ = within_iter_loop_; + sync_hit_.push_back(false); + + // If there is no real iterating loop WAR syncs aren't necessary + within_iter_loop_ = within_iter_loop_ || + !(for_loop->iter_domain()->isThread() || + for_loop->iter_domain()->isBroadcast() || + for_loop->iter_domain()->extent()->isOneInt()); + + // Process the expressions in the for loop + kir::ExprMutator::handle(for_loop); + + // Sync analysis and cleanup: + // + // Pop for loop stack inside WarMemoryInfo structs if they match this one. 
+ // Erase empty entries so we don't continue to search over them + // + // Insert sync at end of this for loop if any of the entries require + std::vector to_erase; + bool insert_sync = false; + for (auto& entry : smem_allocations_) { + auto& alloc_stack = entry.second; + if (alloc_stack.size() && alloc_stack.back().ca_loop == for_loop) { + if (!alloc_stack.back().sync_after_read && + !alloc_stack.back().sync_before_write) { + insert_sync = within_iter_loop_; + } + + alloc_stack.pop_back(); + if (alloc_stack.empty()) { + to_erase.push_back(entry.first); } } } - } - void addInputSmemTvs(const kir::Expr* expr, TvSet& set) { - for (auto in : expr->inputs()) { - if (auto tv = dynamic_cast(in)) { - if (tv->memoryType() == MemoryType::Shared) { - auto real_tv = alloc_map_.getRealBuffer(tv); - set.insert(real_tv); - } - } + for (auto tv : to_erase) { + smem_allocations_.erase(tv); } - } - private: - //! Allocation map of SMEM buffers - SmemAllocMap& alloc_map_; - - //! Track Shared Memory Inputs (Reads) for parent for-loop - TvSet all_smem_inputs_; - - //! Track Shared Memory Outputs (Writes) for parent for-loop - TvSet all_smem_outputs_; - - //! Shared Memory Writes at beginning of the for-loop - //! before first SyncThreads - TvSet initial_; - - //! Shared Memory Reads at end of the for-loop - //! Cleared after each SyncThreads - TvSet final_; - - //! Track first sync deterministically found in for-loop. Even when a - //! child loop has a sync, if it may not be executed due to non-zero - //! start value, this flag remains false. - bool initial_sync_ = false; - - //! Track if last op is sync - bool is_last_op_sync_ = false; -}; - -class LocalSyncInserter { - public: - //! Write-After-Read race conditions are only found within for-loops. - //! Sync nodes are inserted directly into the for-loops. - //! The expressions are modified in-place and exprs is const. - static void insertSyncs(const std::vector& exprs) { - LocalSyncInserter inserter; - inserter.insert(exprs); - } - - private: - void insert(const std::vector& exprs) { - for (auto expr : exprs) { - if (auto fl = dynamic_cast(expr)) { - LocalSyncInserterForLoop sync_inserter(fl, alloc_map_); - } else if (auto ite = dynamic_cast(expr)) { - insert(ite->thenBody().exprs()); - insert(ite->elseBody().exprs()); - } else if (auto alloc = dynamic_cast(expr)) { - alloc_map_.insert(alloc); - } + // WAR Sync is necessary in this loop, register its insertion. 
+ if (insert_sync) { + auto sync_expr = IrBuilder::create(true); + kir::ExprMutator::registerInsertAfter( + for_loop->body().exprs().back(), sync_expr, &for_loop->body()); + handle(sync_expr); } + + // Pop for loop scope information + sync_hit_.pop_back(); + within_iter_loop_ = prev_within_iter_loop_; } - private: + // Create a new WarMemoryInfo entry if required and return a reference to it, + // else return the WarMemoryInfo associated with tv + WarMemoryInfo& getMemInfo(TensorView* tv) { + auto maybe_aliased_tv = alloc_map_.getRealBuffer(tv); + auto alloc_it = smem_allocations_.find(maybe_aliased_tv); + auto ca_loop = + loop_utils::getAllocInformation(tv, for_loops_).init_for_loop; + if (alloc_it == smem_allocations_.end()) { + WarMemoryInfo mem_info; + mem_info.ca_loop = ca_loop; + auto entry_it = + smem_allocations_ + .insert(std::make_pair( + maybe_aliased_tv, std::vector({mem_info}))) + .first; + return entry_it->second.back(); + } else if ( + maybe_aliased_tv != tv && alloc_it->second.back().ca_loop != ca_loop) { + WarMemoryInfo mem_info; + mem_info.ca_loop = ca_loop; + auto& alloc_stack = alloc_it->second; + alloc_stack.push_back(mem_info); + return alloc_stack.back(); + } + return alloc_it->second.back(); + } + + //! Allocation map of SMEM buffers. Needed because of SMEM buffer aliasing, + //! need to track the root of the alias to properly insert WAR hazard syncs SmemAllocMap alloc_map_; + + //! Is there a loop nest that has a non-trivial iteration (extent != 1) and + //! not bound to a block/thread. This indicates if a WAR sync is necessary, + //! otherwise the Expr is not in an iterating for loop. + bool within_iter_loop_ = false; + + // Track which loops have hit a sync. Used to see if there's a sync before + // write. + std::vector sync_hit_ = {false}; + + // Keep track of the active allocations we need to protect. Key is the + // "getRealBuffer", not the raw tv. There can be multiple WarMemoryInfo's + // because of aliasing. If the "getRealBuffer" tv has a compute at outside the + // alias tv, each aliased tv in a unique ca_loop has to be tracked separately + // for WAR insertion. + std::unordered_map> smem_allocations_; }; class ExprFlattener : private kir::IrVisitor { private: - void handle(kir::Expr* expr) { + using kir::IrVisitor::handle; + + void handle(Expr* expr) final { if (expr->isA() || expr->isA()) { - expr->accept(this); + kir::IrVisitor::handle(expr); } else { - exprs_.push_back(expr); - } - } - - void visit(const kir::ForLoop* fl) final { - for (auto expr : fl->body().exprs()) { - handle(expr); - } - } - - void visit(const kir::IfThenElse* ite) final { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + flat_exprs_.push_back(expr); } } private: - std::vector exprs_; + std::vector flat_exprs_; public: //! Flattens scopes extracting out a single ordered list of exprs. - static std::vector flatten( - const std::vector& loop_nests) { + static std::vector flatten(const std::vector& loop_nests) { ExprFlattener flattener; for (auto expr : loop_nests) { flattener.handle(expr); } - return flattener.exprs_; + return flattener.flat_exprs_; } }; @@ -342,53 +350,42 @@ class ValidatePlacementAfterWrites : private kir::IrVisitor { //! 
Validate no expr in writes found under loop static void validate( kir::ForLoop* loop, - const std::unordered_set& writes) { + const std::unordered_set& writes) { ValidatePlacementAfterWrites validator(writes); validator.handle(loop); } private: - ValidatePlacementAfterWrites(const std::unordered_set& writes) + using kir::IrVisitor::handle; + + ValidatePlacementAfterWrites(const std::unordered_set& writes) : writes_(writes) {} - void handle(kir::Expr* expr) { + void handle(Expr* expr) final { if (expr->isA() || expr->isA()) { - expr->accept(this); + kir::IrVisitor::handle(expr); } else { TORCH_INTERNAL_ASSERT( writes_.find(expr) == writes_.end(), "Block sync must be placed after ", - kir::toString(expr)); - } - } - - void visit(const kir::ForLoop* fl) final { - for (auto expr : fl->body().exprs()) { - handle(expr); - } - } - - void visit(const kir::IfThenElse* ite) final { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + expr->toString()); } } private: - const std::unordered_set& writes_; + const std::unordered_set& writes_; }; -class ReadAfterWriteSyncs : public kir::MutableIrVisitor { +class ReadAfterWriteSyncs : public kir::ExprMutator { private: + using kir::ExprMutator::handle; + //! Traverse up the loop stack from loops_it and if a halo loop is //! found, place a given sync expr before the outer-most halo loop. bool insertBeforeHaloLoop( std::vector::iterator loops_it, kir::Sync* sync_expr, - const std::unordered_set& writes) { + const std::unordered_set& writes) { std::vector::iterator halo_loop_it; bool halo_loop_found = false; @@ -420,21 +417,21 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { if (halo_loop_it == for_loops_.begin()) { // place in global scope - auto place_before_it = - std::find(loop_nests_.begin(), loop_nests_.end(), halo_loop); - TORCH_INTERNAL_ASSERT(place_before_it != loop_nests_.end()); - loop_nests_.insert(place_before_it, sync_expr); + auto place_before_it = std::find(exprs_.begin(), exprs_.end(), halo_loop); + TORCH_INTERNAL_ASSERT(place_before_it != exprs_.end()); + exprs_.insert(place_before_it, sync_expr); } else { auto place_in = *(halo_loop_it - 1); - place_in->body().insert_before(halo_loop, sync_expr); + kir::ExprMutator::registerInsertBefore( + halo_loop, sync_expr, &place_in->body()); } return true; } - void handle(kir::Expr* expr) { - if (!ir_utils::isTVOp(expr) || expr->isA()) { - expr->accept(this); + void handle(Expr* expr) final { + if (!ir_utils::isTvOp(expr) || expr->isA()) { + kir::ExprMutator::handle(expr); return; } @@ -443,8 +440,8 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { auto last_writes = last_writes_.front(); last_writes_.pop_front(); // Found that a sync is needed - TORCH_INTERNAL_ASSERT(expr->outputs()[0]->isA()); - auto out_tv = expr->outputs()[0]->as(); + TORCH_INTERNAL_ASSERT(expr->outputs()[0]->isA()); + auto out_tv = expr->outputs()[0]->as(); // Find where a sync needs to be inserted // This is very similar to how allocations are placed, simply place sync @@ -454,39 +451,35 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { // out of or saving state for tensor view ID -> for loop // TODO: Explicitly test the 3 cases below - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto sync_expr = ir_builder.create(); - if (out_tv->fuserTv()->getComputeAtPosition() == 0) { + auto sync_expr = IrBuilder::create(); + if (out_tv->getComputeAtPosition() == 0) { // Sync should be placed at global scope, 
after its outer most loop if // it has one. - kir::Expr* place_after = for_loops_.size() > 0 ? for_loops_[0] : expr; - // Find location in loop_nests_ + Expr* place_after = for_loops_.size() > 0 ? for_loops_[0] : expr; + // Find location in exprs_ auto place_after_it = - std::find(loop_nests_.begin(), loop_nests_.end(), place_after); + std::find(exprs_.begin(), exprs_.end(), place_after); TORCH_INTERNAL_ASSERT( - place_after_it != loop_nests_.end(), + place_after_it != exprs_.end(), "Could not figure out where to place synchronization. ", "Tried to place after, ", - toString(place_after), + place_after->toString(), ", but could not find this expression at the global scope."); - loop_nests_.insert(place_after_it + 1, sync_expr); + + registerInsertAfter(*(place_after_it + 1), sync_expr, nullptr); } else { // Find the last loop in computeAt of out_tv, this is the loop where we // would place an allocation for out_tv - auto fuser_tv = out_tv->fuserTv(); - auto lowered_local_id = - GpuLower::current() - ->lowerValue(fuser_tv->axis( - (int)out_tv->fuserTv()->getComputeAtPosition() - 1)) - ->as(); + auto local_id = out_tv->axis((int)out_tv->getComputeAtPosition() - 1); auto loops_it = std::find_if( for_loops_.begin(), for_loops_.end(), - [&lowered_local_id](const auto& loop) { + [&local_id](const auto& loop) { return GpuLower::current()->caLoopMap().areMapped( - loop->iter_domain(), lowered_local_id) || - loop->iter_domain()->parallelType() == ParallelType::Unroll; + loop->iter_domain(), local_id) || + loop->iter_domain()->getParallelType() == + ParallelType::Unroll; }); TORCH_INTERNAL_ASSERT(loops_it != for_loops_.end()); @@ -497,7 +490,7 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { } auto place_in = *loops_it; - kir::Expr* place_after = nullptr; + Expr* place_after = nullptr; if (loops_it + 1 == for_loops_.end()) { // Inline allocation, place after expr @@ -509,22 +502,12 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { place_after = *(loops_it + 1); } - place_in->body().insert_after(place_after, sync_expr); + registerInsertAfter(place_after, sync_expr, &place_in->body()); } } } - void visit(kir::ForLoop* fl) final { - for_loops_.push_back(fl); - // Modifying in place, make a copy of the vector - const std::vector exprs = fl->body().exprs(); - for (auto expr : exprs) { - handle(expr); - } - for_loops_.pop_back(); - } - - void visit(kir::IfThenElse*) final { + void handle(kir::IfThenElse*) final { TORCH_INTERNAL_ASSERT( false, "Pass does not support conditional statements, ", @@ -532,18 +515,17 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { } // Clear the modify status for all shared memory buffers - static void cleanSharedMemory( - std::unordered_map& smem) { + static void cleanSharedMemory(std::unordered_map& smem) { smem.clear(); } // Return a set of expressions that modify shared-memory // tensors. Expressions are excluded when syncthreads are already // placed. 
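  // Illustrative example of the RAW case this pass handles (tensor names are
  // hypothetical; the sync node is the kir::Sync used elsewhere in this pass):
  //
  //   T_smem[threadIdx.x] = T_in[i];          // last write to shared memory
  //   __syncthreads();                        // kir::Sync inserted here
  //   T_out[i] = T_smem[threadIdx.x ^ 1];     // reads another thread's write
  //
  // The helper below returns those "last write" expressions for the shared
  // memory tensors an expression reads, so the inserted sync can be checked
  // to sit after all of them.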
- std::unordered_set isModifiedSharedMemory( - const std::unordered_map& smem, - const std::vector& tvs) const { - std::unordered_set last_writes; + std::unordered_set isModifiedSharedMemory( + const std::unordered_map& smem, + const std::vector& tvs) const { + std::unordered_set last_writes; for (auto tv : tvs) { auto it = smem.find(tv); if (it != smem.end()) { @@ -553,18 +535,17 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { return last_writes; } - ReadAfterWriteSyncs(std::vector _loop_nests) - : loop_nests_(std::move(_loop_nests)) { + ReadAfterWriteSyncs(const std::vector& _exprs) { // Fusion shared_memory values // Tracks if shared memory is modified - std::unordered_map smem; + std::unordered_map smem; // Flatten all the expressions - auto flattened_exprs = ExprFlattener::flatten(loop_nests_); + auto flattened_exprs = ExprFlattener::flatten(_exprs); - kir::Expr* prev_tv_expr = nullptr; + Expr* prev_tv_expr = nullptr; for (auto expr : flattened_exprs) { - if (!ir_utils::isTVOp(expr) || expr->isA()) { + if (!ir_utils::isTvOp(expr) || expr->isA()) { continue; } @@ -578,22 +559,20 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { cleanSharedMemory(smem); } - for (auto out : expr->outputs()) { - if (out->isA()) { - if (out->as()->memoryType() == MemoryType::Shared) { - smem[out] = expr; - } + for (auto tv : ir_utils::filterByType(expr->outputs())) { + // Double buffered tensors do not need RAW sync to be inserted + // here, except for the initial load part, which is taken care + // separately by DoubleBufferInserter. + if (tv->getMemoryType() == MemoryType::Shared && + !tv->isDoubleBuffered()) { + smem[tv] = expr; } } prev_tv_expr = expr; } - // Insert read after write syncs - const std::vector exprs = loop_nests_; - for (auto expr : exprs) { - handle(expr); - } + kir::ExprMutator::traverseAndInsert(_exprs); TORCH_INTERNAL_ASSERT( sync_after_.empty(), "Didn't place all required syncs."); @@ -601,7 +580,7 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { private: //! Keep track of expressions that must be followed by syncthreads - std::deque sync_after_; + std::deque sync_after_; //! Keep track of write expressions that must be placed before //! syncthreads. @@ -611,35 +590,27 @@ class ReadAfterWriteSyncs : public kir::MutableIrVisitor { //! be placed before that. last_writes_ keeps track of expressions //! modifying the smem buffer each syncthreads is used for so that //! it is not placed before those write expressions. - std::deque> last_writes_; - - //! Keep track of for loops while inserting syncthreads - std::vector for_loops_; - - //! 
Loop-nests where syncthreads are inserted - std::vector loop_nests_; + std::deque> last_writes_; public: - static std::vector insert( - const std::vector& loop_nests) { + static std::vector insert(const std::vector& loop_nests) { ReadAfterWriteSyncs inserter(loop_nests); - return inserter.loop_nests_; + return inserter.exprs_; } }; } // namespace -std::vector insertRawThreadSynchronization( - const std::vector& exprs) { +std::vector insertRawThreadSynchronization( + const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertRawThreadSynchronization"); return ReadAfterWriteSyncs::insert(exprs); } -std::vector insertWarThreadSynchronization( - const std::vector& exprs) { +std::vector insertWarThreadSynchronization( + const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertWarThreadSynchronization"); - LocalSyncInserter::insertSyncs(exprs); - return exprs; + return WarSyncInserter::insert(exprs); } } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h index 50618373448..756462f0bd7 100644 --- a/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h +++ b/torch/csrc/jit/codegen/cuda/lower_insert_syncs.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -16,40 +16,14 @@ namespace cuda { //! //! WAR race condition occurs when the next iteration of the loop overwrites //! shared memory value before a previous operation has finished reading it. -//! -//! WAR Race Check: -//! Track all output shared memory TVs before first sync -//! Track all input shared memory TVs after last sync -//! If the intersection is non-empty, then there is a WAR race condition. -//! Recursively check each nested for-loop -//! -//! Parent-Child For-Loop Recursive Relationship -//! Notation: -//! None - Zero Syncs -//! 1+ - One or more Syncs -//! End - Sync is last op in for-loop to prevent WAR race condition -//! -//! Default: Track all shared memory inputs and outputs -//! -//! Parent - None -//! Child - None => Append All Child Outputs to Parent Initial -//! Child - 1+ => Parent first sync => Inherit Child Initial + Final -//! Child - End => Parent first sync => Keep Child Initial / Clear Parent Final -//! -//! Parent - 1+ -//! Child - None => Append All Child to Parent Last -//! Child - 1+ => Child Final to Parent Final / Discard Child Initial -//! Child - End => Clear Parent Last / Discard Child Initial -//! -//! If Child - End and Parent has zero remaining operations, then -//! Parent inherits Child End. -//! -std::vector insertWarThreadSynchronization( - const std::vector& exprs); +std::vector insertWarThreadSynchronization( + const std::vector& exprs); //! Insert syncs between writing to shared memory and then reading it. -std::vector insertRawThreadSynchronization( - const std::vector& exprs); +//! RAW pass is run before indexing, unrolling (loop duplication), memory +//! 
aliasing, and index (grid/block bcast/reduction) +std::vector insertRawThreadSynchronization( + const std::vector& exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp index e4396f9a864..12c7d33e077 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -19,7 +18,7 @@ namespace jit { namespace fuser { namespace cuda { -std::vector LoopNestGenerator::loweredExprs( +std::vector LoopNestGenerator::loweredExprs( const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::LoopNestGenerator::loweredExprs"); TORCH_INTERNAL_ASSERT(FusionGuard::getCurFusion() != nullptr); @@ -33,22 +32,20 @@ LoopNestGenerator::LoopNestGenerator(const std::vector& exprs) { namespace { -kir::ForLoop* openForHelper(kir::ForLoop* scope, kir::IterDomain* kir_id) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto extent_with_halo = gpu_lower->haloInfo().getExtent(kir_id); +kir::ForLoop* openForHelper(kir::ForLoop* scope, IterDomain* id) { + auto extent_with_halo = GpuLower::current()->haloInfo().getExtent(id); kir::ForLoop* new_scope = nullptr; if (extent_with_halo) { // When an axis is extended with halo, unrolling and vectorization // are assumed to not be used for now. TORCH_INTERNAL_ASSERT( - kir_id->parallelType() != ParallelType::Unroll && - !isParallelTypeVectorize(kir_id->parallelType())); + id->getParallelType() != ParallelType::Unroll && + !isParallelTypeVectorize(id->getParallelType())); // Use the extent that's extended by halo - new_scope = ir_builder.create( - kir_id, - kir_id->isBroadcast() ? ir_builder.zeroVal() - : ir_builder.create(c10::nullopt), + new_scope = IrBuilder::create( + id, + id->isBroadcast() ? 
GpuLower::current()->kernel()->zeroVal() + : IrBuilder::create(c10::nullopt), nullptr, extent_with_halo, nullptr, @@ -56,7 +53,7 @@ kir::ForLoop* openForHelper(kir::ForLoop* scope, kir::IterDomain* kir_id) { nullptr, false); } else { - new_scope = ir_builder.create(kir_id); + new_scope = IrBuilder::create(id); } if (scope != nullptr) { scope->body().insert(0, new_scope); @@ -66,13 +63,13 @@ kir::ForLoop* openForHelper(kir::ForLoop* scope, kir::IterDomain* kir_id) { } // namespace -void LoopNestGenerator::openFor(kir::IterDomain* kir_iter_domain) { +void LoopNestGenerator::openFor(IterDomain* id) { if (for_loops_.size() > 0) { - const auto new_scope = openForHelper(for_loops_.back(), kir_iter_domain); + const auto new_scope = openForHelper(for_loops_.back(), id); // for_loop_allocations_.insert({new_scope, 0}); for_loops_.push_back(new_scope); } else { - for_loops_.push_back(openForHelper(nullptr, kir_iter_domain)); + for_loops_.push_back(openForHelper(nullptr, id)); lowered_exprs_.insert(lowered_exprs_.begin(), for_loops_.back()); } } @@ -82,7 +79,7 @@ void LoopNestGenerator::closeFor() { for_loops_.pop_back(); } -void LoopNestGenerator::pushFront(kir::Expr* expr) { +void LoopNestGenerator::pushFront(Expr* expr) { if (for_loops_.size() == 0) { lowered_exprs_.insert(lowered_exprs_.begin(), expr); } else { @@ -91,18 +88,15 @@ void LoopNestGenerator::pushFront(kir::Expr* expr) { } void LoopNestGenerator::handle(Expr* expr) { - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - // Check if it's a tensor view expression we need to place in the loop nest // structure - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { // Close all the loops, scalar operations cannot be inside for loops based // on expr sorting. while (!for_loops_.empty()) { closeFor(); } - pushFront(gpu_lower->lowerExpr(expr)); + pushFront(expr); for (auto out : expr->outputs()) { TORCH_INTERNAL_ASSERT( @@ -112,10 +106,8 @@ void LoopNestGenerator::handle(Expr* expr) { " cannot lower ", out->getValType().value()); - pushFront(ir_builder.create( - gpu_lower->lowerValue(out), - MemoryType::Local, - ir_builder.create(1))); + pushFront(IrBuilder::create( + out, MemoryType::Local, GpuLower::current()->kernel()->oneVal())); } return; } @@ -130,27 +122,19 @@ void LoopNestGenerator::handle(Expr* expr) { // Figure out what the entire loop structure should look like. std::vector loop_structure = loop_structures_.at(out_tv); - std::vector kir_loop_structure; - std::transform( - loop_structure.begin(), - loop_structure.end(), - std::back_inserter(kir_loop_structure), - [&gpu_lower](IterDomain* id) { - return gpu_lower->lowerValue(id)->as(); - }); // Ordering of loop_structure is global, so simply close loops we don't need, // and open the ones we do. 
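  // Illustrative example: if the active nest is {i0, i1, i2} and this
  // expression's loop_structure is {i0, i3}, the loop below closes i2 and i1
  // (they are not in loop_structure), and the loop after it opens i3, leaving
  // the active nest as {i0, i3}.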
while (!for_loops_.empty() && std::find( - kir_loop_structure.begin(), - kir_loop_structure.end(), - for_loops_.back()->iter_domain()) == kir_loop_structure.end()) { + loop_structure.begin(), + loop_structure.end(), + for_loops_.back()->iter_domain()) == loop_structure.end()) { closeFor(); } - for (auto loop : kir_loop_structure) { + for (auto loop : loop_structure) { auto find_it = std::find_if( for_loops_.begin(), for_loops_.end(), [loop](kir::ForLoop* fl) { return fl->iter_domain() == loop; @@ -160,7 +144,7 @@ void LoopNestGenerator::handle(Expr* expr) { } } - pushFront(gpu_lower->lowerExpr(expr)); + pushFront(expr); } namespace { diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.h b/torch/csrc/jit/codegen/cuda/lower_loops.h index fbbdf079e89..9b480d7eb6f 100644 --- a/torch/csrc/jit/codegen/cuda/lower_loops.h +++ b/torch/csrc/jit/codegen/cuda/lower_loops.h @@ -1,13 +1,12 @@ #pragma once -#include +#include #include #include #include #include -#include #include namespace torch { @@ -30,20 +29,20 @@ namespace cuda { //! nests to initialize reduction buffers. class TORCH_CUDA_CU_API LoopNestGenerator { public: - static std::vector loweredExprs(const std::vector& exprs); + static std::vector loweredExprs(const std::vector& exprs); private: LoopNestGenerator(const std::vector& exprs); // Open a new inner most for loop, track which TV it was constructed from // according to the computeAt chain. - void openFor(kir::IterDomain*); + void openFor(IterDomain*); // Close the inner most for loop void closeFor(); // Appends an expression to the current scope - void pushFront(kir::Expr* expr); + void pushFront(Expr* expr); void handle(Expr* expr); @@ -52,7 +51,7 @@ class TORCH_CUDA_CU_API LoopNestGenerator { private: // Lowered exprs to return - std::vector lowered_exprs_; + std::vector lowered_exprs_; // Keep all for loops conveniently to make unrolling easier, basically just a // stack of the active for_loops diff --git a/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp b/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp index f5f5c72676a..f17f91806d6 100644 --- a/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_magic_zero.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include namespace torch { @@ -12,11 +12,11 @@ namespace cuda { namespace { -class MagicZeroInserter : public kir::MutableIrVisitor { +class MagicZeroInserter : public kir::ExprMutator { public: - static std::vector insert(const std::vector& exprs) { + static std::vector insert(const std::vector& exprs) { MagicZeroInserter inserter(exprs); - return inserter.loop_nests_; + return inserter.exprs_; } private: @@ -25,94 +25,43 @@ class MagicZeroInserter : public kir::MutableIrVisitor { kir::ForLoop* fl = nullptr; }; - MagicZeroInserter(const std::vector& exprs) - : loop_nests_(exprs), ir_builder(GpuLower::current()->kernel()) { - loop_nests_.insert( - loop_nests_.begin(), ir_builder.create()); - for (auto expr : exprs) { - handle(expr); - } - insertAll(); + MagicZeroInserter(const std::vector& exprs) { + TORCH_INTERNAL_ASSERT(exprs.size()); + kir::ExprMutator::registerInsertBefore( + exprs.front(), IrBuilder::create(), nullptr); + kir::ExprMutator::traverseAndInsert(exprs); } - void handle(kir::Expr* expr) { - if (auto ite = dynamic_cast(expr)) { - handle(ite); - } else if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } - } - - void handle(kir::IfThenElse* ite) { - scope_nest_.push_back(&ite->thenBody()); - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - 
scope_nest_.pop_back(); - scope_nest_.push_back(&ite->elseBody()); - for (auto expr : ite->elseBody().exprs()) { - handle(expr); - } - scope_nest_.pop_back(); - } - - void handle(kir::ForLoop* fl) { + void handle(kir::ForLoop* fl) final { if (fl->isUnrolled()) { - kir::Scope* scope = nullptr; - if (!scope_nest_.empty()) { - scope = scope_nest_.back(); - } - insertion_list_.push_back({scope, fl}); - } else { - scope_nest_.push_back(&fl->body()); - for (auto expr : fl->body().exprs()) { - handle(expr); - } - scope_nest_.pop_back(); - } - } - - void insertAll() { - for (const auto& info : insertion_list_) { - auto fl = info.fl; - auto scope = info.scope; - if (scope == nullptr) { - // place in global scope - auto loop_it = std::find(loop_nests_.begin(), loop_nests_.end(), fl); - TORCH_INTERNAL_ASSERT(loop_it != loop_nests_.end()); - // Place after the loop - loop_it++; - loop_nests_.insert(loop_it, ir_builder.create()); + if (scope_.empty()) { + kir::ExprMutator::registerInsertAfter( + fl, IrBuilder::create()); } else { - scope->insert_after(fl, ir_builder.create()); + TORCH_INTERNAL_ASSERT( + scope_.back()->exprs().size(), "Not expecting an empty loop."); + kir::ExprMutator::registerInsertAfter( + fl, IrBuilder::create(), scope_.back()); } + } else { + kir::ExprMutator::handle(fl); } } - //! Keep track for loop structure - std::vector scope_nest_; - - // Keep a copy of the expressions provided - std::vector loop_nests_; - - kir::IrBuilder ir_builder; - std::vector insertion_list_; }; } // namespace -std::vector insertMagicZero(const std::vector& exprs) { +std::vector insertMagicZero(const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::insertMagicZero"); // Check if magic zero was even used, if not we don't have to define it or // update it. const auto gpu_lower = GpuLower::current(); auto kernel = gpu_lower->kernel(); - const bool has_magic_zero = std::any_of( - kernel->irNodes().begin(), - kernel->irNodes().end(), - [](const std::unique_ptr& ir_node) { - return ir_node->isA() && isMagicZero(ir_node->as()); + const bool has_magic_zero = + std::any_of(kernel->vals().begin(), kernel->vals().end(), [](Val* val) { + return isMagicZero(val); }); if (!has_magic_zero) { @@ -122,19 +71,21 @@ std::vector insertMagicZero(const std::vector& exprs) { return MagicZeroInserter::insert(exprs); } -bool isMagicZero(kir::Val* val) { - auto ns = dynamic_cast(val); - if (ns == nullptr) { +bool isMagicZero(const Val* val) { + if (!val->isA()) { return false; } + auto ns = val->as(); return ns->dtype() == DataType::Int && ns->name() == std::string(kMagicZeroName); } -bool isProtectedWithMagicZero(kir::Val* val) { - auto def = dynamic_cast(val->definition()); - return def && def->operation() == BinaryOpType::Add && - isMagicZero(def->rhs()); +bool isProtectedWithMagicZero(const Val* val) { + if (val->definition() == nullptr || !val->definition()->isA()) { + return false; + } + auto bop = val->definition()->as(); + return bop->getBinaryOpType() == BinaryOpType::Add && isMagicZero(bop->rhs()); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_magic_zero.h b/torch/csrc/jit/codegen/cuda/lower_magic_zero.h index 03a37a46813..942a3302801 100644 --- a/torch/csrc/jit/codegen/cuda/lower_magic_zero.h +++ b/torch/csrc/jit/codegen/cuda/lower_magic_zero.h @@ -14,15 +14,15 @@ namespace cuda { //! zero update after every (outer most) loop nest with a compile time extent. //! //! This will make sure nvrtc does not aggressively save predicate and indices. 
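//!
//! Minimal sketch of what "protected" means here (index names are
//! hypothetical; addExpr is the IrBuilder helper used throughout lowering):
//!
//!   Val* protected_idx = IrBuilder::addExpr(raw_idx, magic_zero);
//!   isMagicZero(magic_zero);                  // true
//!   isProtectedWithMagicZero(protected_idx);  // true: defined as x + magic_zero
//!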
-std::vector insertMagicZero(const std::vector& exprs); +std::vector insertMagicZero(const std::vector& exprs); //! Check if val is a reference to the magic zero variable -bool isMagicZero(kir::Val* val); +bool isMagicZero(const Val* val); //! Check if val is protected with magic zero. //! //! Specifically, this returns true if val is defined as "x + magic_zero". -bool isProtectedWithMagicZero(kir::Val* val); +bool isProtectedWithMagicZero(const Val* val); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp index b94c12c27c8..66b405ac8e2 100644 --- a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.cpp @@ -5,8 +5,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -18,85 +17,64 @@ namespace cuda { namespace { -class MisalignedVectorizationModifier { +class MisalignedVectorizationModifier : public kir::ExprMutator { public: - void process(const std::vector& exprs) { - FUSER_PERF_SCOPE( - "GpuLower::Lower::MisalignedVectorizationModifier::process"); - // Run through loop nests - // Find for-loops with misaligned vectorization domains - for (auto* expr : exprs) { - handle(expr); - } - } + MisalignedVectorizationModifier() = delete; - const std::unordered_map& replacementMap() const { - return expr_replacement_map_; + static std::vector processMisalignedVectorization( + const std::vector& exprs) { + FUSER_PERF_SCOPE("GpuLower::Lower::processMisalignedVectorization"); + MisalignedVectorizationModifier mvm(exprs); + return mvm.exprs_; } private: - void handle(kir::Expr* expr) { - if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - } + MisalignedVectorizationModifier(const std::vector& exprs) { + FUSER_PERF_SCOPE("GpuLower::Lower::MisalignedVectorizationModifier"); + // Run through loop nests + // Find for-loops with misaligned vectorization domains + kir::ExprMutator::traverseAndInsert(exprs); } - void handle(kir::ForLoop* fl) { - for_loops_structure_.push_back(fl); - - // Make copy of exprs because we replace them inplace in fl - const auto exprs_copy = fl->body().exprs(); - + void handle(kir::ForLoop* fl) final { + kir::Scope* scope = scope_.empty() ? 
nullptr : scope_.back(); if (containsAnyDirectChildMisalignedVectorize(fl)) { - auto new_fl = handleMisalignedVectorize(for_loops_structure_, fl); - expr_replacement_map_.insert({fl, new_fl}); + for_loops_.push_back(fl); + auto new_fl = handleMisalignedVectorize(for_loops_, fl); + for_loops_.pop_back(); + + kir::ExprMutator::registerReplace(fl, new_fl, scope); } else { - for (auto expr : exprs_copy) { - handle(expr); - } - } - - for_loops_structure_.pop_back(); - } - - void handle(kir::IfThenElse* ite) { - for (auto expr : ite->thenBody().exprs()) { - handle(expr); - } - for (auto expr : ite->elseBody().exprs()) { - handle(expr); + kir::ExprMutator::handle(fl); } } struct ReferenceTensors { // Input TensorView to Vectorize Set operation - kir::TensorView* in_tv = nullptr; + TensorView* in_tv = nullptr; // Output TensorView to Vectorize Set operation - kir::TensorView* out_tv = nullptr; + TensorView* out_tv = nullptr; // TensorView in global memory - kir::TensorView* global_tv = nullptr; + TensorView* global_tv = nullptr; // TensorView with vectorize IterDomain and not in global memory - kir::TensorView* vec_tv = nullptr; + TensorView* vec_tv = nullptr; }; - ReferenceTensors getReferenceTensors(kir::Expr* vectorized_expr) { + ReferenceTensors getReferenceTensors(Expr* vectorized_expr) { TORCH_INTERNAL_ASSERT(vectorized_expr != nullptr); TORCH_INTERNAL_ASSERT( - vectorized_expr->outputs().front()->isA()); - TORCH_INTERNAL_ASSERT( - vectorized_expr->inputs().front()->isA()); + vectorized_expr->outputs().front()->isA()); + TORCH_INTERNAL_ASSERT(vectorized_expr->inputs().front()->isA()); - auto in_tv = vectorized_expr->inputs().front()->as(); - auto out_tv = vectorized_expr->outputs().front()->as(); + auto in_tv = vectorized_expr->inputs().front()->as(); + auto out_tv = vectorized_expr->outputs().front()->as(); const bool global_vectorize_write_op = - (out_tv->memoryType() == MemoryType::Global && - in_tv->memoryType() == MemoryType::Local); + (out_tv->getMemoryType() == MemoryType::Global && + in_tv->getMemoryType() == MemoryType::Local); const bool global_vectorize_read_op = - (out_tv->memoryType() == MemoryType::Local && - in_tv->memoryType() == MemoryType::Global); + (out_tv->getMemoryType() == MemoryType::Local && + in_tv->getMemoryType() == MemoryType::Global); TORCH_INTERNAL_ASSERT( global_vectorize_write_op || global_vectorize_read_op, "Unsupported vectorize memory configuration detected."); @@ -104,25 +82,26 @@ class MisalignedVectorizationModifier { // TensorView on global memory. This is the tensor that may have // a non-aligned base address. auto global_tv = - (out_tv->memoryType() == MemoryType::Global) ? out_tv : in_tv; + (out_tv->getMemoryType() == MemoryType::Global) ? out_tv : in_tv; // TensorView with the misaligned vec iterDomain. It is the consumer // of vectorized load or the producer of vectorized store. It is // assumed that when the output TV is not on global memory, this // expression is a vectorized load, so the output TV is vec_tv. - auto vec_tv = (out_tv->memoryType() != MemoryType::Global) ? out_tv : in_tv; + auto vec_tv = + (out_tv->getMemoryType() != MemoryType::Global) ? 
out_tv : in_tv; return {in_tv, out_tv, global_tv, vec_tv}; } struct VectorizeData { - kir::Val* vector_size = nullptr; - kir::Val* shift = nullptr; - kir::Val* extent = nullptr; - kir::Val* remainder = nullptr; - kir::Val* extent_minus_remainder = nullptr; - kir::Val* last_root_domain_index = nullptr; - kir::Val* last_root_domain_index_shift = nullptr; + Val* vector_size = nullptr; + Val* shift = nullptr; + Val* extent = nullptr; + Val* remainder = nullptr; + Val* extent_minus_remainder = nullptr; + Val* last_root_domain_index = nullptr; + Val* last_root_domain_index_shift = nullptr; }; // Create constants for handling misaligned addresses @@ -130,48 +109,43 @@ class MisalignedVectorizationModifier { const std::vector& for_loop_structure, const ReferenceTensors& tensors, kir::IfThenElse* parent_scope_ite) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - // Generate vectorize index - auto indices = (tensors.out_tv->memoryType() == MemoryType::Global) - ? Index::getConsumerStridedIndices( - tensors.out_tv->fuserTv(), for_loop_structure) + auto indices = (tensors.out_tv->getMemoryType() == MemoryType::Global) + ? Index::getConsumerStridedIndices(tensors.out_tv, for_loop_structure) : Index::getProducerStridedIndices( - tensors.in_tv->fuserTv(), - tensors.out_tv->fuserTv(), - for_loop_structure); + tensors.in_tv, tensors.out_tv, for_loop_structure); // >>>>>>>>>>>>> // Number of elements in vectorize access auto vector_size = - tensors.vec_tv->domain()->domain().back()->extent()->as(); + tensors.vec_tv->domain()->domain().back()->extent()->as(); // Size of memory type for the elements - kir::Int* data_size_in_bytes = - ir_builder.create(dataTypeSize(tensors.vec_tv->dtype())); + Int* data_size_in_bytes = + IrBuilder::create(dataTypeSize(tensors.vec_tv->dtype())); // The number of bytes in the vectorize access auto vector_size_in_bytes = - ir_builder.mulExpr(vector_size, data_size_in_bytes); + IrBuilder::mulExpr(vector_size, data_size_in_bytes); - auto index = ir_builder.create( - tensors.global_tv->fuserTv(), indices); + auto index = + IrBuilder::create(tensors.global_tv, indices); auto address = createNamedScalarFromValue( parent_scope_ite->thenBody(), index, "address", true); // offset_size = (address % vector_size_bytes) / data_type_size_bytes // shift_init = vector_size - offset_size - auto a = ir_builder.modExpr(address, vector_size_in_bytes); - auto b = ir_builder.divExpr(a, data_size_in_bytes); - auto c = ir_builder.subExpr(vector_size, b); + auto a = IrBuilder::modExpr(address, vector_size_in_bytes); + auto b = IrBuilder::divExpr(a, data_size_in_bytes); + auto c = IrBuilder::subExpr(vector_size, b); auto shift_init = createNamedScalarFromValue( parent_scope_ite->thenBody(), c, "shift_val"); // shift = (shift_init == vector_size) ? 
0 : shift_init // The number of elements until the first aligned address - auto shift_pred = ir_builder.eqExpr(shift_init, vector_size); - auto shift_val = - ir_builder.whereExpr(shift_pred, ir_builder.zeroVal(), shift_init); + auto shift_pred = IrBuilder::eqExpr(shift_init, vector_size); + auto shift_val = IrBuilder::whereExpr( + shift_pred, GpuLower::current()->kernel()->zeroVal(), shift_init); // >>>>>>>>>>>>> auto shift = createNamedScalarFromValue( @@ -183,13 +157,13 @@ class MisalignedVectorizationModifier { // remainder = (extent - shift) % vector_size // The number of elements remaining not accessed by vectorized operations - auto remaining_extent = ir_builder.subExpr(extent, shift); - auto remainder_val = ir_builder.modExpr(remaining_extent, vector_size); + auto remaining_extent = IrBuilder::subExpr(extent, shift); + auto remainder_val = IrBuilder::modExpr(remaining_extent, vector_size); auto remainder = createNamedScalarFromValue( parent_scope_ite->thenBody(), remainder_val, "remainder"); // (extent - remainder) is the upper-bound for the vectorize section - auto extent_remainder_val = ir_builder.subExpr(extent, remainder); + auto extent_remainder_val = IrBuilder::subExpr(extent, remainder); // >>>>>>>>>>>>> auto extent_minus_remainder = createNamedScalarFromValue( @@ -203,7 +177,7 @@ class MisalignedVectorizationModifier { // >>>>>>>>>>>>> auto last_root_domain_index_shift = - ir_builder.addExpr(last_root_domain_index, shift); + IrBuilder::addExpr(last_root_domain_index, shift); return { vector_size, @@ -220,20 +194,18 @@ class MisalignedVectorizationModifier { kir::IfThenElse* createVectorizeSection( const std::vector& child_loops, const VectorizeData& params) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto vectorized_child_loops = cloneForLoops( child_loops, params.vector_size, nullptr, true, params.shift); // Vectorize Range: [shift - (extent-remainder)) // (last_root_domain_index + shift) < (extent - remainder) - kir::Val* vectorize_cond = ir_builder.ltExpr( + Val* vectorize_cond = IrBuilder::ltExpr( params.last_root_domain_index_shift, params.extent_minus_remainder); kir::Predicate* vectorize_pred = - ir_builder.create(vectorize_cond->as()); + IrBuilder::create(vectorize_cond->as()); kir::IfThenElse* vectorize_ite = - ir_builder.create(vectorize_pred); + IrBuilder::create(vectorize_pred); for (auto cloned_loop : vectorized_child_loops) { vectorize_ite->thenBody().push_back(cloned_loop); @@ -247,20 +219,19 @@ class MisalignedVectorizationModifier { kir::IfThenElse* createInitialSection( const std::vector& child_loops, const VectorizeData& params) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto pre_child_loops = cloneForLoops( child_loops, params.vector_size, params.shift, false, nullptr); // Initial Range: [0 - shift) // last_root_domain_index == 0 - kir::Val* initial_cond = - ir_builder.eqExpr(params.last_root_domain_index, ir_builder.zeroVal()); + Val* initial_cond = IrBuilder::eqExpr( + params.last_root_domain_index, + GpuLower::current()->kernel()->zeroVal()); kir::Predicate* initial_pred = - ir_builder.create(initial_cond->as()); + IrBuilder::create(initial_cond->as()); kir::IfThenElse* initial_ite = - ir_builder.create(initial_pred); + IrBuilder::create(initial_pred); for (auto cloned_loop : pre_child_loops) { initial_ite->thenBody().push_back(cloned_loop); @@ -274,23 +245,21 @@ class MisalignedVectorizationModifier { kir::IfThenElse* createRemainderSection( const std::vector& child_loops, const VectorizeData& params) { - 
kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto post_child_loops = cloneForLoops( child_loops, params.vector_size, params.remainder, false, params.shift); // Remainder Range: [(extent-remainder) - extent) // (extent - remainder) <= last_root_domain_index + shift < extent - kir::Val* lower_bound = ir_builder.geExpr( + Val* lower_bound = IrBuilder::geExpr( params.last_root_domain_index_shift, params.extent_minus_remainder); - kir::Val* upper_bound = - ir_builder.ltExpr(params.last_root_domain_index_shift, params.extent); - kir::Val* remainder_cond = ir_builder.andExpr(lower_bound, upper_bound); + Val* upper_bound = + IrBuilder::ltExpr(params.last_root_domain_index_shift, params.extent); + Val* remainder_cond = IrBuilder::andExpr(lower_bound, upper_bound); kir::Predicate* remainder_pred = - ir_builder.create(remainder_cond->as()); + IrBuilder::create(remainder_cond->as()); kir::IfThenElse* remainder_ite = - ir_builder.create(remainder_pred); + IrBuilder::create(remainder_pred); for (auto cloned_loop : post_child_loops) { remainder_ite->thenBody().push_back(cloned_loop); @@ -302,8 +271,6 @@ class MisalignedVectorizationModifier { kir::ForLoop* handleMisalignedVectorize( std::vector for_loop_structure, const kir::ForLoop* parent_for_loop) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto child_loops = findChildForLoops(parent_for_loop); // Assumption: All vectorize operations have the same shift @@ -315,17 +282,19 @@ class MisalignedVectorizationModifier { // The parent_for_loop contains allocate, read, compute, write operations const auto new_parent_for_loop = - ir_builder.create(parent_for_loop); + IrBuilder::create(parent_for_loop); // Transfer all expressions except for-loops to new parent for-loop // All expressions are placed at the beginning of the new for-loop - moveExprsExceptForLoops(parent_for_loop, new_parent_for_loop); + copyExprsExceptForLoops(parent_for_loop, new_parent_for_loop); // Get the predicate for all but the last root domain - auto pred_except_last_root_domain = ir_builder.create( - PredicateType::Misaligned, vectorized_expr, ir_builder.trueVal()); + auto pred_except_last_root_domain = IrBuilder::create( + PredicateType::Misaligned, + vectorized_expr, + GpuLower::current()->kernel()->trueVal()); kir::IfThenElse* pred_ite = - ir_builder.create(pred_except_last_root_domain); + IrBuilder::create(pred_except_last_root_domain); new_parent_for_loop->body().push_back(pred_ite); auto constants = createVectorizeConstants( @@ -351,17 +320,17 @@ class MisalignedVectorizationModifier { // Determine that the expression is UnaryOpType::Set AND // the output TensorView domain is vectorized - bool isVectorizeSetOp(kir::ForLoop* fl, kir::Expr* expr) { - if (fl->iter_domain()->parallelType() != + bool isVectorizeSetOp(kir::ForLoop* fl, Expr* expr) { + if (fl->iter_domain()->getParallelType() != ParallelType::MisalignedVectorize) { return false; } - if (expr->isA()) { - auto unaryOp = expr->as(); - if (unaryOp->out()->isA()) { - auto out_tv = unaryOp->out()->as(); - return unaryOp->operation() == UnaryOpType::Set && + if (expr->isA()) { + auto unaryOp = expr->as(); + if (unaryOp->out()->isA()) { + auto out_tv = unaryOp->out()->as(); + return unaryOp->getUnaryOpType() == UnaryOpType::Set && out_tv->domain()->hasVectorize(); } } @@ -374,15 +343,14 @@ class MisalignedVectorizationModifier { // vectorize flag - Do not generate for loop header // shift value - Add shift to global indices generated within for loop std::vector cloneForLoops( - const 
std::vector& for_loops, - kir::Val* loop_stop, - kir::Val* pred_stop, + const std::vector& for_loops_, + Val* loop_stop, + Val* pred_stop, bool vectorize, - kir::Val* vectorize_shift) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); + Val* vectorize_shift) { std::vector cloned_for_loops; - for (auto fl : for_loops) { + for (auto fl : for_loops_) { auto first_expr = fl->body().exprs().front(); bool has_vectorize_op = isVectorizeSetOp(fl, first_expr); @@ -391,12 +359,12 @@ class MisalignedVectorizationModifier { TORCH_INTERNAL_ASSERT( !has_vectorize_op || fl->body().exprs().size() == 1); - const auto new_loop = ir_builder.create( + const auto new_loop = IrBuilder::create( fl->iter_domain(), fl->index(), - ir_builder.zeroVal(), + GpuLower::current()->kernel()->zeroVal(), loop_stop, - ir_builder.oneVal(), + GpuLower::current()->kernel()->oneVal(), vectorize && has_vectorize_op, vectorize_shift, fl->isUnrollRequired()); @@ -406,9 +374,9 @@ class MisalignedVectorizationModifier { // Predicate the loop body if pred_stop is not null. This is to // make sure the loop itself is completely unrollable. if (pred_stop != nullptr) { - auto body_pred = ir_builder.create( - ir_builder.ltExpr(new_loop->index(), pred_stop)->as()); - auto body_ite = ir_builder.create(body_pred); + auto body_pred = IrBuilder::create( + IrBuilder::ltExpr(new_loop->index(), pred_stop)->as()); + auto body_ite = IrBuilder::create(body_pred); body->push_back(body_ite); body = &body_ite->thenBody(); } @@ -423,7 +391,7 @@ class MisalignedVectorizationModifier { } // Add all expressions except for loops to new parent for loop - void moveExprsExceptForLoops( + void copyExprsExceptForLoops( const kir::ForLoop* for_loop, kir::ForLoop* new_loop) { std::vector loops; @@ -448,10 +416,10 @@ class MisalignedVectorizationModifier { // Find the first vectorize set - either read or write // Add child For-Loop to for_loop_structure // Enable vectorize flag in child For-Loop - kir::Expr* findFirstVectorizedSetOp( + Expr* findFirstVectorizedSetOp( std::vector& for_loop_structure, - const std::vector& for_loops) { - for (auto fl : for_loops) { + const std::vector& for_loops_) { + for (auto fl : for_loops_) { auto first_expr = fl->body().exprs().front(); bool has_vectorize_op = isVectorizeSetOp(fl, first_expr); if (has_vectorize_op) { @@ -463,38 +431,31 @@ class MisalignedVectorizationModifier { } // Get full extent for the inner-most, merged root domain - kir::Val* getVectorizeExtent( - kir::TensorView* producer_tv, - kir::TensorView* consumer_tv) { + Val* getVectorizeExtent(TensorView* producer_tv, TensorView* consumer_tv) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto consumer_fuser_tv = consumer_tv->fuserTv(); - auto producer_fuser_tv = producer_tv->fuserTv(); - - auto p2c = - PairwiseRootDomainMap(producer_fuser_tv, consumer_fuser_tv) - .mapProducerToConsumer( - producer_fuser_tv->domain(), consumer_fuser_tv->domain()); + auto p2c = PairwiseRootDomainMap(producer_tv, consumer_tv) + .mapProducerToConsumer( + producer_tv->domain(), consumer_tv->domain()); auto consumer_root_right_of_ca_domains = IterVisitor::getInputsTo( - {consumer_fuser_tv->domain()->domain().begin() + - consumer_fuser_tv->getComputeAtPosition(), - consumer_fuser_tv->domain()->domain().end()}); + {consumer_tv->domain()->domain().begin() + + consumer_tv->getComputeAtPosition(), + consumer_tv->domain()->domain().end()}); auto producer_root_right_of_ca_domains = IterVisitor::getInputsTo( - 
{producer_fuser_tv->domain()->domain().begin() + - producer_fuser_tv->getComputeAtPosition(), - producer_fuser_tv->domain()->domain().end()}); + {producer_tv->domain()->domain().begin() + + producer_tv->getComputeAtPosition(), + producer_tv->domain()->domain().end()}); - const auto& consumer_contig = consumer_fuser_tv->domain()->contiguity(); - const auto& producer_contig = producer_fuser_tv->domain()->contiguity(); + const auto& consumer_contig = consumer_tv->domain()->contiguity(); + const auto& producer_contig = producer_tv->domain()->contiguity(); - auto producer_root_domain = producer_fuser_tv->getMaybeRFactorDomain(); + auto producer_root_domain = producer_tv->getMaybeRFactorDomain(); // Calculate extent of merged root domains - kir::Val* extent = nullptr; + Val* extent = nullptr; auto consumer_root_idx = - int(consumer_fuser_tv->getMaybeRFactorDomain().size()) - 1; + int(consumer_tv->getMaybeRFactorDomain().size()) - 1; for (int i = int(producer_root_domain.size()) - 1; i >= 0; --i) { auto producer_root_id = producer_root_domain.at(i); @@ -533,11 +494,10 @@ class MisalignedVectorizationModifier { // We now know it's safe to extend the vectorization domain to these // axes. It shouldn't matter whether producer or consumer is used. - auto consumer_extent = gpu_lower->lowerValue(consumer_root_id->extent()); if (extent == nullptr) { - extent = consumer_extent; + extent = consumer_root_id->extent(); } else { - extent = ir_builder.mulExpr(extent, consumer_extent); + extent = IrBuilder::mulExpr(extent, consumer_root_id->extent()); } // If it's not contiguous, extending the vectorization domain @@ -554,57 +514,37 @@ class MisalignedVectorizationModifier { return extent; } - kir::Val* createNamedScalarFromValue( + Val* createNamedScalarFromValue( kir::Scope& body, - kir::Val* val, + Val* val, const std::string& name, bool address = false) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto namedScalar = (address) ? ir_builder.addressExprNamedScalar(name, val) - : ir_builder.setExprNamedScalar(name, val); + auto namedScalar = (address) ? 
IrBuilder::addressExprNamedScalar(name, val) + : IrBuilder::setExprNamedScalar(name, val); TORCH_INTERNAL_ASSERT(namedScalar->definition() != nullptr); - auto alloc = ir_builder.create( - namedScalar, MemoryType::Local, ir_builder.oneVal()); + auto alloc = IrBuilder::create( + namedScalar, + MemoryType::Local, + GpuLower::current()->kernel()->oneVal()); body.push_back(alloc); body.push_back(namedScalar->definition()); return namedScalar; } - - private: - // We will track which loops in the incoming IR will be replaced and by what - std::unordered_map expr_replacement_map_; - - // A depth-first ordering of nested for loops - // It is used for indexing and predicate generation - std::vector for_loops_structure_; }; } // namespace -std::vector processMisalignedVectorization( - Fusion* fusion, - const std::vector& exprs) { - FUSER_PERF_SCOPE("GpuLower::Lower::processMisalignedVectorization"); - - MisalignedVectorizationModifier mvm; - mvm.process(exprs); - - std::vector mutated_exprs; - mutated_exprs.reserve(exprs.size()); - for (auto expr : exprs) { - mutated_exprs.push_back( - ir_utils::applyReplacements(mvm.replacementMap(), expr)); - } - - return mutated_exprs; +std::vector processMisalignedVectorization( + const std::vector& exprs) { + return MisalignedVectorizationModifier::processMisalignedVectorization(exprs); } bool containsAnyDirectChildMisalignedVectorize(const kir::ForLoop* fl) { for (auto expr : fl->body().exprs()) { if (expr->isA()) { auto child_fl = expr->as(); - if (child_fl->iter_domain()->parallelType() == + if (child_fl->iter_domain()->getParallelType() == ParallelType::MisalignedVectorize) { return true; } diff --git a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h index 588d3787752..bd7ae19d93a 100644 --- a/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h +++ b/torch/csrc/jit/codegen/cuda/lower_misaligned_vectorization.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include @@ -106,9 +106,8 @@ namespace cuda { //! } //! } //! 
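//!
//! Worked example of the shift/remainder arithmetic used by this pass
//! (illustrative numbers: float tensor, vector width 4, i.e. 16-byte access):
//!
//!   address % 16 == 8      ->  offset    = 8 / sizeof(float) = 2
//!   shift_init = 4 - 2 = 2 ->  shift     = 2   (0 if already aligned)
//!   extent = 19            ->  remainder = (19 - 2) % 4 = 1
//!
//!   initial section     [0, 2)    2 scalar elements
//!   vectorized section  [2, 18)   4 aligned vector accesses
//!   remainder section   [18, 19)  1 scalar element
//!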
-std::vector processMisalignedVectorization( - Fusion* fusion, - const std::vector& exprs); +std::vector processMisalignedVectorization( + const std::vector& exprs); bool containsAnyDirectChildMisalignedVectorize(const kir::ForLoop* fl); diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp b/torch/csrc/jit/codegen/cuda/lower_predicate.cpp index 838d5d85d9e..cd34c56b510 100644 --- a/torch/csrc/jit/codegen/cuda/lower_predicate.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_predicate.cpp @@ -7,8 +7,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -23,27 +22,26 @@ namespace cuda { namespace { -class ConditionalFromPredicateModifier { +class ConditionalFromPredicateModifier : public kir::IrVisitor { public: - ConditionalFromPredicateModifier(const std::vector& exprs) { - FUSER_PERF_SCOPE( - "GpuLower::Lower::ConditionalFromPredicateModifier::process"); - for (auto* expr : exprs) { - handle(expr); - } - } + ConditionalFromPredicateModifier() = delete; - const std::unordered_map& replacementMap() const { - return expr_replacement_map_; + static std::vector fillPredicates(const std::vector& exprs) { + ConditionalFromPredicateModifier cfpm(exprs); + return cfpm.exprs_; } private: - void handle(kir::Expr* expr) { - if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - } else if (expr != nullptr && expr->predicate() != nullptr) { + ConditionalFromPredicateModifier(const std::vector& exprs) { + FUSER_PERF_SCOPE( + "GpuLower::Lower::ConditionalFromPredicateModifier::process"); + kir::IrVisitor::handle(exprs); + } + + using kir::IrVisitor::handle; + + void handle(Expr* expr) final { + if (expr != nullptr && expr->predicate() != nullptr) { // Replace expr predicate with bool conditional auto conditional = generateConditional(expr->predicate()); TORCH_INTERNAL_ASSERT(conditional != nullptr); @@ -51,9 +49,11 @@ class ConditionalFromPredicateModifier { TORCH_INTERNAL_ASSERT(expr->predicate()->value() != nullptr); setWritePredicate(expr, conditional); } + + kir::IrVisitor::handle(expr); } - void setWritePredicate(kir::Expr* expr, kir::Bool* read_cond) { + void setWritePredicate(Expr* expr, Bool* read_cond) { if (expr->writePredicate() != nullptr) { auto write_cond = generateConditional(expr->writePredicate()); if (write_cond) { @@ -66,46 +66,25 @@ class ConditionalFromPredicateModifier { } } - void handle(kir::ForLoop* fl) { - for_loops_structure_.push_back(fl); - - const auto exprs_copy = fl->body().exprs(); - for (auto expr : exprs_copy) { - handle(expr); - } - - for_loops_structure_.pop_back(); - } - - void handle(kir::IfThenElse* ite) { + void handle(kir::IfThenElse* ite) final { TORCH_INTERNAL_ASSERT(ite->predicate() != nullptr); // If ite already has Bool conditional, handle internal expressions // Otherwise, generate conditional and update predicate - if (ite->predicate()->hasValue()) { - const auto then_exprs_copy = ite->thenBody().exprs(); - for (auto expr : then_exprs_copy) { - handle(expr); - } - - const auto else_exprs_copy = ite->elseBody().exprs(); - for (auto expr : else_exprs_copy) { - handle(expr); - } - } else { + if (!ite->predicate()->hasValue()) { auto conditional = generateConditional(ite->predicate()); TORCH_INTERNAL_ASSERT(conditional != nullptr); - TORCH_INTERNAL_ASSERT(conditional->isA()); + TORCH_INTERNAL_ASSERT(conditional->isA()); // Update bool conditional in-place ite->predicate()->setValue(conditional); - handle(ite); 
TORCH_INTERNAL_ASSERT(ite->predicate()->value() != nullptr); } + kir::IrVisitor::handle(ite); } // Generate conditional according to PredicateType - kir::Bool* generateConditional(kir::Predicate* pred) { + Bool* generateConditional(kir::Predicate* pred) { switch (pred->predicate_type()) { case PredicateType::Inline: case PredicateType::ReductionWrite: @@ -114,15 +93,16 @@ class ConditionalFromPredicateModifier { case PredicateType::Padding: { return PredicateCompute::getInlinePredicate( pred->expr(), - for_loops_structure_, + for_loops_, pred->thread_pred(), pred->predicate_type()); } case PredicateType::Vectorize: { std::vector outer_loops; kir::ForLoop* vectorized_loop = nullptr; - for (auto loop : for_loops_structure_) { - if (loop->iter_domain()->parallelType() == ParallelType::Vectorize) { + for (auto loop : for_loops_) { + if (loop->iter_domain()->getParallelType() == + ParallelType::Vectorize) { vectorized_loop = loop; break; } else { @@ -134,8 +114,7 @@ class ConditionalFromPredicateModifier { return UnswitchPredicate::get(outer_loops, vectorized_loop); } case PredicateType::Unswitch: { - return UnswitchPredicate::get( - for_loops_structure_, pred->unrolled_loop()); + return UnswitchPredicate::get(for_loops_, pred->unrolled_loop()); } case PredicateType::Manual: { return pred->value(); @@ -145,33 +124,13 @@ class ConditionalFromPredicateModifier { } return nullptr; } - - private: - // We will track which loops in the incoming IR will be replaced and by what - std::unordered_map expr_replacement_map_; - - // A depth-first ordering of nested for loops - // It is used for indexing and predicate generation - std::vector for_loops_structure_; }; } // namespace -std::vector generateConditionalFromPredicate( - Fusion* fusion, - const std::vector& exprs) { - FUSER_PERF_SCOPE("GpuLower::Lower::generateConditionalFromPredicate"); - - ConditionalFromPredicateModifier p2cm(exprs); - - std::vector mutated_exprs; - mutated_exprs.reserve(exprs.size()); - for (auto expr : exprs) { - mutated_exprs.push_back( - ir_utils::applyReplacements(p2cm.replacementMap(), expr)); - } - - return mutated_exprs; +std::vector generateConditionalFromPredicate( + const std::vector& exprs) { + return ConditionalFromPredicateModifier::fillPredicates(exprs); } namespace { @@ -225,17 +184,14 @@ class PredicateAnalyzer : public OptOutDispatch { return needs_predicate_; } - using OptOutDispatch::handle; - void handle(IterDomain* consumer_id) override { // The traversal should have ended if needs_predicate_ was true TORCH_INTERNAL_ASSERT(!needs_predicate_); // If consumer_id is not going to be materialized as a loop (e.g., // broadcast), no need to predicate - const auto gpu_lower = GpuLower::current(); if (consumer_id->isBroadcast() || - gpu_lower->trivialReductionInfo().isDerived(consumer_id)) { + GpuLower::current()->trivialReductionInfo().isDerived(consumer_id)) { return; } @@ -250,7 +206,7 @@ class PredicateAnalyzer : public OptOutDispatch { return; } - handle(consumer_id->definition()); + OptOutDispatch::handle(consumer_id->definition()); } // If it splits the input axis evenly, proceeds to check the input @@ -291,7 +247,7 @@ class PredicateAnalyzer : public OptOutDispatch { } // namespace bool PredicateElimination::needsPredicate(Expr* expr) const { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { return false; } @@ -394,7 +350,7 @@ bool PredicateElimination::needsPredicate(Expr* expr) const { } void PredicateElimination::handle(Expr* expr) { - if (!ir_utils::isTVOp(expr)) { + if 
(!ir_utils::isTvOp(expr)) { return; } @@ -491,7 +447,7 @@ bool PredicateElimination::setReductionInitValue( bool PredicateElimination::canOmitPredicate(const Expr* expr) const { TORCH_INTERNAL_ASSERT(expr != nullptr); - const auto out_tv = ir_utils::getTVOutput(expr); + const auto out_tv = ir_utils::getTvOutput(expr); TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Not a tensor expression"); // No need to predicate local tensors to which a scalar is assigned if (out_tv->getMemoryType() == MemoryType::Local) { @@ -508,38 +464,17 @@ bool PredicateElimination::canOmitPredicate(const Expr* expr) const { return false; } -bool PredicateElimination::canOmitPredicate(const kir::Expr* kir_expr) const { - TORCH_INTERNAL_ASSERT(kir_expr != nullptr); - const auto out_tv = ir_utils::getTVOutput(kir_expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Not a tensor expression"); - // No need to predicate local tensors to which a scalar is assigned - if (out_tv->memoryType() == MemoryType::Local) { - if (auto uop = dynamic_cast(kir_expr)) { - if (uop->operation() == UnaryOpType::Set && uop->in()->isScalar()) { - return true; - } - } - } - const auto fuser_tv = out_tv->fuserTv(); - if (fuser_tv == nullptr) { - return false; - } - return canOmitPredicate(fuser_tv->definition()); -} - -kir::Val* PredicateElimination::getInitValue(TensorView* tv) const { +Val* PredicateElimination::getInitValue(TensorView* tv) const { auto it = init_value_map_.find(tv); if (it == init_value_map_.end()) { return nullptr; } - const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); auto init_val = it->second; if (init_val == nullptr) { // No reduction restriction. Just use zero - return ir_builder.zeroVal(); + return GpuLower::current()->kernel()->zeroVal(); } else { - return gpu_lower->lowerValue(init_val); + return init_val; } } diff --git a/torch/csrc/jit/codegen/cuda/lower_predicate.h b/torch/csrc/jit/codegen/cuda/lower_predicate.h index 393d0fa5c18..c0a1f702f7b 100644 --- a/torch/csrc/jit/codegen/cuda/lower_predicate.h +++ b/torch/csrc/jit/codegen/cuda/lower_predicate.h @@ -1,5 +1,5 @@ #pragma once -#include +#include #include #include @@ -13,9 +13,8 @@ namespace cuda { //! Update predicates with valid bool conditionals //! -std::vector generateConditionalFromPredicate( - Fusion* fusion, - const std::vector& exprs); +std::vector generateConditionalFromPredicate( + const std::vector& exprs); class TORCH_CUDA_CU_API PredicateElimination : public IterVisitor { public: @@ -26,13 +25,8 @@ class TORCH_CUDA_CU_API PredicateElimination : public IterVisitor { //! \param expr Tensor expression bool canOmitPredicate(const Expr* expr) const; - //! True if expr does not need a predicate - //! - //! \param expr KIR tensor expr - bool canOmitPredicate(const kir::Expr* expr) const; - //! Value to initialize out-of-bound regions - kir::Val* getInitValue(TensorView* tv) const; + Val* getInitValue(TensorView* tv) const; //! Dump to string for debugging std::string toString() const; @@ -40,7 +34,7 @@ class TORCH_CUDA_CU_API PredicateElimination : public IterVisitor { private: using IterVisitor::handle; - void handle(Expr* expr) override; + void handle(Expr* expr) final; //! 
Set a value to initialize out-of-bound regions bool setDefaultInitValue(TensorView* tv); diff --git a/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp b/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp new file mode 100644 index 00000000000..582b6d91d06 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_replace_size.cpp @@ -0,0 +1,288 @@ +#include +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { +// Going to generate a map of tensor view root domain extents to reduce the +// number used during lowering. For example if we have: +// +// T2[i0, i1] = T1[i0, i1] + T2[i2, i3] +// +// We know it would be safe to use: +// +// T2[i0, i1] = T1[i0, i1] + T2[i0, i1] +// +// And that way we don't generate T2.size[0] and T2.size[1], instead we will +// reuse T1.size[0] and T1.size[1] +// This is important when doing CSE as T2 and T1 would otherwise look like +// they're using different values, even though we know they're the same +// +// There's some duplicate logic here that's in computeAt map, but it's not so +// concice there to pull out. May want to consider making this mapping its own +// class especially as it may be useful during scheduling. +std::unordered_map getSimplificationMap(Fusion* fusion) { + std::list> disjoint_root_sets; + std::unordered_map*> + id_to_disjoint_root_set; + + auto map_root_ids = [&disjoint_root_sets, &id_to_disjoint_root_set]( + IterDomain* id0, IterDomain* id1) { + if (id0->isBroadcast() || id1->isBroadcast()) { + return; + } + + auto disjoint_set_0_it = id_to_disjoint_root_set.find(id0); + auto disjoint_set_1_it = id_to_disjoint_root_set.find(id1); + bool set_0_found = disjoint_set_0_it != id_to_disjoint_root_set.end(); + bool set_1_found = disjoint_set_1_it != id_to_disjoint_root_set.end(); + + if (set_0_found && set_1_found) { + if (disjoint_set_0_it->second == disjoint_set_1_it->second) { + return; + } + // merge second disjoint set into first + auto* set_0 = disjoint_set_0_it->second; + auto* set_1 = disjoint_set_1_it->second; + for (auto id : *set_1) { + set_0->emplace(id); + id_to_disjoint_root_set[id] = set_0; + } + // remove second set from disjoint_root_sets + disjoint_root_sets.erase(std::find( + disjoint_root_sets.begin(), disjoint_root_sets.end(), *set_1)); + } else if (set_0_found || set_1_found) { + auto existing_set = + set_0_found ? disjoint_set_0_it->second : disjoint_set_1_it->second; + auto to_add_id = set_0_found ? 
id1 : id0; + existing_set->emplace(to_add_id); + id_to_disjoint_root_set[to_add_id] = existing_set; + // add entry into existing set + } else { + // create new set entry + disjoint_root_sets.emplace_back(std::unordered_set()); + auto* new_set = &disjoint_root_sets.back(); + new_set->emplace(id0); + new_set->emplace(id1); + id_to_disjoint_root_set[id0] = new_set; + id_to_disjoint_root_set[id1] = new_set; + } + }; + + auto fusion_vals = fusion->usedMathVals(); + for (auto producer_tv : ir_utils::filterByType(fusion_vals)) { + auto consumer_tvs = ir_utils::consumerTvsOf(producer_tv); + for (auto consumer_tv : consumer_tvs) { + auto pairwise_map = PairwiseRootDomainMap(producer_tv, consumer_tv); + auto c2p_root_map = pairwise_map.mapConsumerToProducer( + consumer_tv->domain(), producer_tv->domain()); + for (auto entry : c2p_root_map) { + auto c_id = entry.first; + auto p_id = entry.second; + map_root_ids(p_id, c_id); + } + } + } + + // Map each set to an input ID (if it exists) that has the smallest ->name() + // entry value + std::unordered_map*, IterDomain*> + set_to_input_id; + + // Loop over the root domains, of the inputs to the fusion. Pick an input ID + // to use as the representative ID of the collected sets. Only consider inputs + // as those are the ones that map to values like "T0.size[1]". They are he + // ID's that propagated their extents into the problem. We could also check + // the outputs as we do have C++ examples of using output dimensions for the + // problem size instead of inputs. However, we don't do anything where we can + // translate to those kinds of kernels integrated into PyTorch. + for (auto input_tv : ir_utils::filterByType(fusion->inputs())) { + for (auto id : + TensorDomain::noReductions(input_tv->getMaybeRFactorDomain())) { + auto id_set_it = id_to_disjoint_root_set.find(id); + if (id_set_it == id_to_disjoint_root_set.end()) { + continue; + } + auto* id_set = id_set_it->second; + if (set_to_input_id.find(id_set) == set_to_input_id.end()) { + set_to_input_id[id_set] = id; + } else { + auto input_id_of_set = set_to_input_id.at(id_set); + // Swap id's if new name is less than previously set + bool swap_ids = id->name() < input_id_of_set->name(); + // If new id is a const scalar but previously was'nt use the const + // scalar + swap_ids = swap_ids || + (id->extent()->isConstScalar() && + !input_id_of_set->extent()->isConstScalar()); + // If previous scalar was const and new isn't, don't swap + swap_ids = swap_ids && + !(input_id_of_set->extent()->isConstScalar() && + !id->extent()->isConstScalar()); + + if (swap_ids) { + set_to_input_id[id_set] = id; + } + } + } + } + + // Finally make map from ID extents to the representitive ID extent. 
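  // Illustrative example: if {i0, i2} formed one disjoint set and i0 was
  // chosen as the representative input ID above, then both i0->extent() and
  // i2->extent() map to i0->extent() here, so a single size symbol is reused
  // for that dimension.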
+ std::unordered_map extent_to_min_input_id_extent; + for (auto entry : set_to_input_id) { + auto* set = entry.first; + auto input_id = entry.second; + for (auto id : *set) { + extent_to_min_input_id_extent[id->extent()] = input_id->extent(); + } + } + return extent_to_min_input_id_extent; +} + +std::vector allLeafOuts(Fusion* fusion) { + auto exprs = StmtSort::getExprs(fusion, true); + std::unordered_set inputs; + std::unordered_set outputs; + std::vector ordered_outputs; + for (auto expr : exprs) { + inputs.insert(expr->inputs().begin(), expr->inputs().end()); + outputs.insert(expr->outputs().begin(), expr->outputs().end()); + ordered_outputs.insert( + ordered_outputs.end(), expr->outputs().begin(), expr->outputs().end()); + } + for (auto input : inputs) { + outputs.erase(input); + } + + std::vector ordered_leaf_outs; + for (auto out : ordered_outputs) { + if (outputs.find(out) != outputs.end()) { + ordered_leaf_outs.push_back(out); + } + } + return ordered_leaf_outs; +} + +class ValReplacementMutator : private OptOutMutator { + public: + ValReplacementMutator( + Fusion* fusion, + const std::unordered_map& replacement_map) + : replacement_map_(replacement_map) { + FusionGuard fg(fusion); + + // Welford makes this a little annoying since it holds a count which is + // typically not used by anything else. If we don't grab that count, then it + // would be a tensorview that doesn't get updated extents. Therefore, first + // grab all leaves towards outputs and grab stmts from there. + auto stmts = StmtSort::getStmts(fusion, allLeafOuts(fusion), true); + for (auto stmt : stmts) { + mutate(stmt); + } + } + + private: + using OptOutMutator::mutate; + void mutate(Val* val) final { + if (replacement_map_.find(val) == replacement_map_.end()) { + return OptOutMutator::mutate(val); + } + auto replaced_val = replacement_map_.at(val); + registerMutation(val, replaced_val); + } + + const std::unordered_map& replacement_map_; +}; + +} // namespace + +void replaceSymbolicSizes(Fusion* fusion) { + FUSER_PERF_SCOPE("GpuLower::Lower::replaceSymbolicSizes"); + std::unordered_map tensor_dim_map; + + // Grab inputs and outputs + std::vector inputs_and_outputs; + for (auto val : fusion->inputs()) { + if (ir_utils::isTV(val)) { + inputs_and_outputs.push_back(val->as()); + } + } + // Symbolic size is necessary for outputs if there are no inputs. + // Otherwise infer output sizes from the inputs via expression evaluation. + if (fusion->inputs().empty()) { + for (auto val : fusion->outputs()) { + if (ir_utils::isTV(val)) { + inputs_and_outputs.push_back(val->as()); + } + } + } + + // Generate map for all tensorview root domain values to map them to symbolic + // values. i.e. T0->getRootDomain()[0] would map to a named scalar + // "T0.size[0]". This map will be used when lowering fusion ir to kernel ir. + for (TensorView* tv : inputs_and_outputs) { + // Replace the domain with one based on Ti.size[j] + const std::vector& root_td = tv->getRootDomain(); + + size_t dim = 0; + for (auto id : root_td) { + Val* orig_size = id->extent(); + + // Output sizes could have reduction axes, which isn't what gets output. 
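The axis-skipping and naming convention used by this loop ("T<name>.size[<dim>]") can be illustrated in isolation. The helper below is hypothetical and only mirrors the shape of the logic: reduction axes are skipped without consuming a dimension index, constant extents keep their literal value, and every remaining symbolic extent gets a named scalar:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    struct RootAxis {
      bool is_reduction = false;
      bool is_const = false;  // constant extents keep their literal value
    };

    // Build symbolic size names for one tensor, e.g. "T3.size[0]", "T3.size[2]".
    std::vector<std::string> symbolicSizes(int tensor_name,
                                           const std::vector<RootAxis>& root) {
      std::vector<std::string> names;
      size_t dim = 0;
      for (const auto& axis : root) {
        if (axis.is_reduction) {
          continue;  // reductions do not contribute an output dimension
        }
        if (!axis.is_const) {
          std::stringstream ss;
          ss << "T" << tensor_name << ".size[" << dim << "]";
          names.push_back(ss.str());
        }
        ++dim;
      }
      return names;
    }

    int main() {
      const std::vector<RootAxis> root = {
          {false, false}, {true, false}, {false, true}, {false, false}};
      for (const auto& n : symbolicSizes(3, root)) {
        std::cout << n << "\n";  // prints T3.size[0] and T3.size[2]
      }
      return 0;
    }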
+ // NOLINTNEXTLINE(bugprone-branch-clone) + if (id->isReduction() || + (id->getIterType() == IterType::BroadcastWithoutStride)) { + continue; + } else if ( + id->isRFactorProduct() || + // NOLINTNEXTLINE(bugprone-branch-clone) + (id->getIterType() == IterType::BroadcastWithStride) || + orig_size->isConstScalar()) { + dim++; + continue; + } + + // Currently turn off this part for inputs of segmented fusion, + // since FusionKernelRuntime will provide these as integer inputs + if (tensor_dim_map.find(orig_size) == tensor_dim_map.end() && + !orig_size->isFusionInput() && !orig_size->isConstScalar()) { + std::stringstream ss; + ss << "T" << tv->name() << ".size[" << dim++ << "]"; + tensor_dim_map[orig_size] = IrBuilder::create( + ss.str(), orig_size->getDataType().value()); + } else { + dim++; + } + } + } + + // Use a minimal number of sizes from provided tensors. + auto extent_simplification_map = getSimplificationMap(fusion); + for (auto extent_entry : extent_simplification_map) { + auto orig_extent = extent_entry.first; + auto simplified_extent = extent_entry.second; + if (tensor_dim_map.count(orig_extent)) { + if (tensor_dim_map.count(simplified_extent)) { + tensor_dim_map[orig_extent] = tensor_dim_map[simplified_extent]; + } else { + tensor_dim_map[orig_extent] = simplified_extent; + } + } + } + + // Run mutation on the fusion with the tensor_dim_map + ValReplacementMutator(fusion, tensor_dim_map); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_replace_size.h b/torch/csrc/jit/codegen/cuda/lower_replace_size.h new file mode 100644 index 00000000000..81cee9f6ffe --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_replace_size.h @@ -0,0 +1,25 @@ +#pragma once + +#include + +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +// TensorViews are all based on symbolic sizes. When we first initialize them +// we don't know if they're inputs or outputs which would mean that they have +// runtime shapes. Intermediate tensors (those not going to global memory) do +// not have this information. Since we need to have the correct information in +// the kernel being fetched for shapes, we want to replace input and output +// tensors to reference the runtime structure containing sizes. 
+void replaceSymbolicSizes(Fusion*); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.cpp b/torch/csrc/jit/codegen/cuda/lower_shift.cpp index 8a4f6980e01..ca451ee5f97 100644 --- a/torch/csrc/jit/codegen/cuda/lower_shift.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_shift.cpp @@ -5,8 +5,6 @@ #include #include #include -#include -#include #include #include #include @@ -19,19 +17,17 @@ namespace fuser { namespace cuda { void ShiftPredicateInserter::insert( - kir::Expr* expr, + Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, bool within_unswitch) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - kir::TensorView* out_tv = ir_utils::getTVOutput(expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing kir::TensorView output"); + TensorView* out_tv = ir_utils::getTvOutput(expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output"); - TensorView* out_fuser_tv = out_tv->fuserTv(); const bool needs_shift_predicate = - gpu_lower->haloInfo().needsShiftPredicate(out_fuser_tv->definition()); + gpu_lower->haloInfo().needsShiftPredicate(out_tv->definition()); if (!needs_shift_predicate) { return; } @@ -48,12 +44,12 @@ void ShiftPredicateInserter::insert( kir::Predicate* thread_pred_expr = nullptr; if (within_unswitch) { - thread_pred_expr = ir_builder.create(thread_pred); + thread_pred_expr = IrBuilder::create(thread_pred); } kir::Predicate* shift_pred = within_unswitch ? thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::Shift, expr, thread_pred); // If the expr involves a thread-block barrier, set the predicate of @@ -64,7 +60,7 @@ void ShiftPredicateInserter::insert( return; } - auto shift_ite = ir_builder.create(shift_pred); + auto shift_ite = IrBuilder::create(shift_pred); auto& scope = loops.back()->body(); @@ -83,56 +79,33 @@ void ShiftPredicateInserter::insert( } // Padding by zero - kir::Predicate* padding_pred = ir_builder.create( + kir::Predicate* padding_pred = IrBuilder::create( PredicateType::Padding, expr, thread_pred); - auto bounds_ite = ir_builder.create(padding_pred); + auto bounds_ite = IrBuilder::create(padding_pred); const int pad_value = 0; - auto pad_expr = ir_builder.create( - UnaryOpType::Set, out_tv, ir_builder.create(pad_value)); + auto pad_expr = IrBuilder::create( + UnaryOpType::Set, out_tv, IrBuilder::create(pad_value)); bounds_ite->thenBody().push_back(pad_expr); // Insert the else block shift_ite->elseBody().push_back(bounds_ite); } -AxisHaloInfo::AxisHaloInfo() { - auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - setWidth(0, ir_builder.zeroVal()); - setWidth(1, ir_builder.zeroVal()); +int AxisHaloInfo::width() const { + return width(0) + width(1); } -kir::Int* AxisHaloInfo::width() const { - auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - return ir_builder.addExpr(width(0), width(1))->as(); -} - -kir::Int* AxisHaloInfo::width(int pos) const { +int AxisHaloInfo::width(int pos) const { TORCH_INTERNAL_ASSERT(pos >= 0 && pos < 2); - TORCH_INTERNAL_ASSERT(widths_[pos] != nullptr); return widths_[pos]; } -void AxisHaloInfo::setWidth(int pos, kir::Int* width) { +void AxisHaloInfo::setWidth(int pos, int width) { TORCH_INTERNAL_ASSERT(pos >= 0 && pos < 2); widths_[pos] = width; } -void AxisHaloInfo::merge(int pos, kir::Int* other) { - auto gpu_lower = GpuLower::current(); 
- kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto cur = width(pos); - kir::Int* new_width = nullptr; - if (cur->isConst() && other->isConst()) { - new_width = ir_builder.create( - std::max(cur->value().value(), other->value().value())); - } else if (cur->isZeroInt()) { - new_width = other; - } else if (other->isZeroInt()) { - new_width = cur; - } else { - new_width = ir_builder.maxExpr(width(pos), other)->as(); - } +void AxisHaloInfo::merge(int pos, int other) { + auto new_width = std::max(width(pos), other); setWidth(pos, new_width); } @@ -144,13 +117,12 @@ void AxisHaloInfo::merge(const AxisHaloInfo& other) { bool AxisHaloInfo::hasHalo() const { return std::any_of( - widths_.begin(), widths_.end(), [](auto w) { return !w->isZeroInt(); }); + widths_.begin(), widths_.end(), [](auto w) { return w != 0; }); } std::string AxisHaloInfo::toString() const { std::stringstream ss; - ss << "<" << kir::toString(width(0)) << ", " << kir::toString(width(1)) - << ">"; + ss << "<" << width(0) << ", " << width(1) << ">"; return ss.str(); } @@ -158,14 +130,17 @@ bool HaloInfo::hasRootAxisInfo(IterDomain* id) const { return root_axis_map_.find(id) != root_axis_map_.end(); } -bool HaloInfo::hasRootAxisInfo(kir::IterDomain* id) const { - return kir_root_axis_map_.find(id) != kir_root_axis_map_.end(); -} - const AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) const { + // TODO: Enable this check, was failing in many tests + // TORCH_INTERNAL_ASSERT( + // id->definition() == nullptr || id->isRFactorProduct(), + // "Invalid IterDomain: ", + // id); auto it = root_axis_map_.find(id); TORCH_INTERNAL_ASSERT( - it != root_axis_map_.end(), "Halo root axis info not found for ", id); + it != root_axis_map_.end(), + "Halo root axis info not found for ", + id->toString()); return it->second; } @@ -176,33 +151,10 @@ AxisHaloInfo& HaloInfo::getRootAxisInfo(IterDomain* id) { const_cast(this)->getRootAxisInfo(id)); } -const AxisHaloInfo& HaloInfo::getRootAxisInfo(kir::IterDomain* id) const { - TORCH_INTERNAL_ASSERT( - id->definition() == nullptr || id->isRFactorProduct(), - "Invalid IterDomain: ", - id); - auto it = kir_root_axis_map_.find(id); - TORCH_INTERNAL_ASSERT( - it != kir_root_axis_map_.end(), - "Halo root axis info not found for ", - kir::toString(id)); - return it->second; -} - -AxisHaloInfo& HaloInfo::getRootAxisInfo(kir::IterDomain* id) { - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - return const_cast( - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - const_cast(this)->getRootAxisInfo(id)); -} - void HaloInfo::setRootAxisInfo( IterDomain* id, const AxisHaloInfo& root_axis_info) { root_axis_map_[id] = root_axis_info; - kir_root_axis_map_ - [GpuLower::current()->lowerValue(id)->as()] = - root_axis_info; initializeFromRootAxisInfo(id); return; @@ -283,9 +235,6 @@ void HaloInfo::propagateRootAxisInfo( const auto& c_root = consumer->getRootDomain(); - auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); - for (const auto i : c10::irange(c_root.size())) { auto c_id = c_root[i]; auto it = c2p.find(c_id); @@ -332,31 +281,19 @@ void HaloInfo::propagateRootAxisInfo( p_info.merge(c_info); } else { int pos = (offset > 0) ? 
0 : 1; - p_info.merge( - pos, - ir_builder.addExpr(c_info.width(pos), std::abs(offset)) - ->as()); + p_info.merge(pos, c_info.width(pos) + std::abs(offset)); } } else if (auto gather_op = dynamic_cast(expr)) { - const auto window_dim = - gpu_lower->lowerValue(gather_op->windowShape()[i]); - if (window_dim->isOneInt()) { + const auto window_dim = gather_op->windowShape()[i]; + if (window_dim == 1) { p_info.merge(c_info); continue; } - const auto& pad_dim = gather_op->padWidth()[i]; - const auto pad_dim0 = gpu_lower->lowerValue(pad_dim[0])->as(); - p_info.merge( - 0, ir_builder.addExpr(c_info.width(0), pad_dim0)->as()); + const auto pad_dim0 = gather_op->padWidth()[i][0]; + p_info.merge(0, c_info.width(0) + pad_dim0); // The right-side halo is propagated as: // consumer_right_halo + (window_dim - 1 - left_padding) - p_info.merge( - 1, - ir_builder - .subExpr( - ir_builder.addExpr(c_info.width(1), window_dim), - ir_builder.addExpr(pad_dim0, 1)) - ->as()); + p_info.merge(1, c_info.width(1) + window_dim - 1 - pad_dim0); } else { p_info.merge(c_info); } @@ -390,29 +327,30 @@ void HaloInfo::initializeFromRootAxisInfo(IterDomain* id) { TORCH_INTERNAL_ASSERT(hasRootAxisInfo(id)); auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); const auto& halo_info = getRootAxisInfo(id); auto halo_width = halo_info.width(); if (!halo_info.hasHalo()) { - halo_width_map_[id] = ir_builder.zeroVal(); + setHaloWidth(id, 0); return; } auto expanded_extent = - ir_builder.addExpr(gpu_lower->lowerValue(id->extent()), halo_width); - kir_extent_map_[gpu_lower->lowerValue(id)->as()] = - expanded_extent; + IrBuilder::addExpr(id->extent(), IrBuilder::create(halo_width)); + extent_map_[id] = expanded_extent; halo_width_map_[id] = halo_width; inheritance_map_[id] = {id}; } +void HaloInfo::setHaloWidth(IterDomain* id, int halo_width) { + halo_width_map_[id] = halo_width; +} + // Propagate extent information from root axes to descendants void HaloInfo::build(TensorDomain* td) { auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); auto exprs = DependencyCheck::getAllExprsBetween( {td->getMaybeRFactorDomain().begin(), td->getMaybeRFactorDomain().end()}, @@ -459,33 +397,29 @@ void HaloInfo::build(TensorDomain* td) { auto in_id = split->in(); - const auto& halo_width_it = halo_width_map_.find(in_id); - // If no halo info is found, nothing needs to be done. This ID // must be an ancestor of a domain set by setRootAxisInfo. 
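The split handling that follows reduces to a small rule: the outer output carries no halo, the inner output inherits the full halo, and only the inner extent is expanded. A minimal sketch with plain integers (the pass itself builds IR expressions for the expanded extent):

    #include <cassert>
    #include <iostream>

    struct SplitHalo {
      int outer_halo;    // halo attached to the outer output of the split
      int inner_halo;    // halo attached to the inner output of the split
      int inner_extent;  // inner extent after halo expansion
    };

    // Propagate a halo width through split(in, factor): the outer axis gets
    // no halo, the inner axis inherits the halo and its extent grows by it.
    SplitHalo propagateThroughSplit(int in_halo, int split_factor) {
      if (in_halo == 0) {
        return {0, 0, split_factor};
      }
      return {0, in_halo, split_factor + in_halo};
    }

    int main() {
      auto r = propagateThroughSplit(/*in_halo=*/2, /*split_factor=*/32);
      assert(r.outer_halo == 0 && r.inner_halo == 2 && r.inner_extent == 34);
      std::cout << "inner extent with halo: " << r.inner_extent << "\n";
      return 0;
    }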
- if (halo_width_it == halo_width_map_.end()) { + if (!hasHaloWidth(in_id)) { continue; } - const auto halo_width = halo_width_it->second; + const auto halo_width = getHaloWidth(in_id); - if (halo_width->isZeroInt()) { - halo_width_map_.insert({split->outer(), halo_width}); - halo_width_map_.insert({split->inner(), halo_width}); + if (halo_width == 0) { + setHaloWidth(split->outer(), 0); + setHaloWidth(split->inner(), 0); continue; } // propagate to inner domain auto out_id = split->inner(); - auto expanded_extent = ir_builder.addExpr( - gpu_lower->lowerValue(out_id->extent()), halo_width); - kir_extent_map_.insert( - {gpu_lower->lowerValue(out_id)->as(), - expanded_extent}); + auto expanded_extent = + SimplifyingIrBuilder::addExpr(out_id->extent(), halo_width); + extent_map_.insert({out_id, expanded_extent}); - halo_width_map_.insert({split->outer(), ir_builder.zeroVal()}); - halo_width_map_.insert({split->inner(), halo_width}); + setHaloWidth(split->outer(), 0); + setHaloWidth(split->inner(), halo_width); insertToInheritanceMap(td, in_id, split->inner()); } else if (auto merge = dynamic_cast(expr)) { @@ -495,25 +429,24 @@ void HaloInfo::build(TensorDomain* td) { auto outer_extent = getExtent(merge->outer()); if (inner_extent != nullptr || outer_extent != nullptr) { if (inner_extent == nullptr) { - inner_extent = gpu_lower->lowerValue(merge->inner()->extent()); + inner_extent = merge->inner()->extent(); } else { insertToInheritanceMap(td, merge->inner(), merge->out()); } if (outer_extent == nullptr) { - outer_extent = gpu_lower->lowerValue(merge->outer()->extent()); + outer_extent = merge->outer()->extent(); } else { insertToInheritanceMap(td, merge->outer(), merge->out()); } - auto expanded_extent = ir_builder.mulExpr(outer_extent, inner_extent); - kir_extent_map_.insert( - {gpu_lower->lowerValue(merge->out())->as(), - expanded_extent}); + auto expanded_extent = + SimplifyingIrBuilder::mulExpr(outer_extent, inner_extent); + extent_map_.insert({merge->out(), expanded_extent}); // Splitting the output of this merge is not allowed, so // remember it merged_shifted_ids.insert(merge->out()); // Note that halo_width_map_ is not updated } else { - halo_width_map_.insert({merge->out(), ir_builder.zeroVal()}); + setHaloWidth(merge->out(), 0); } } else { TORCH_INTERNAL_ASSERT(false, "Unsupported expr: ", expr); @@ -579,7 +512,7 @@ void HaloInfo::validate(TensorView* tv) const { bool shared_mem_needed = false; for (auto use : tv->uses()) { - if (!ir_utils::isTVOp(use)) { + if (!ir_utils::isTvOp(use)) { continue; } if (use->isA() || use->isA()) { @@ -629,21 +562,16 @@ void HaloInfo::validate(TensorView* tv) const { return; } -kir::Val* HaloInfo::getExtent(IterDomain* id) const { - auto kir_id = GpuLower::current()->lowerValue(id)->as(); - return getExtent(kir_id); -} - -kir::Val* HaloInfo::getExtent(kir::IterDomain* id) const { - auto it = kir_extent_map_.find(id); - if (it != kir_extent_map_.end()) { +Val* HaloInfo::getExtent(IterDomain* id) const { + auto it = extent_map_.find(id); + if (it != extent_map_.end()) { return it->second; } else { return nullptr; } } -kir::Int* HaloInfo::getHaloWidth(IterDomain* id) const { +int HaloInfo::getHaloWidth(IterDomain* id) const { auto it = halo_width_map_.find(id); TORCH_INTERNAL_ASSERT(it != halo_width_map_.end()); return it->second; @@ -736,63 +664,11 @@ bool extentCompare( } // namespace bool HaloInfo::extentLessEqual(IterDomain* id1, IterDomain* id2) const { - auto cmp = [](kir::Int* x, kir::Int* y) { - if (x == y) { - return true; - } - auto xv = 
x->value(); - auto yv = y->value(); - return xv.has_value() && yv.has_value() && xv.value() <= yv.value(); - }; - return extentCompare(*this, id1, id2, cmp); + return extentCompare(*this, id1, id2, std::less_equal<>()); } bool HaloInfo::extentEqual(IterDomain* id1, IterDomain* id2) const { - // Returns true only when x and y are proven to be the same. The - // analysis is not comprehensive and can prove in rather trivial - // cases only. Specifically: - // - x and y are the same pointers - // - Both have static values and they are the same - // - Both are defined by the same expression and the inputs are - // proven to be equal - std::function cmp = [&](kir::Int* x, - kir::Int* y) { - if (x == y) { - return true; - } - - auto xv = x->value(); - auto yv = y->value(); - if (xv.has_value() && yv.has_value() && xv.value() == yv.value()) { - return true; - } - - // Check if both are defined by an expression of the same type. If - // so, recursively check the input operands. - auto x_def = x->definition(); - auto y_def = y->definition(); - if (x_def && y_def && - ((x_def->isA() && y_def->isA() && - x_def->as()->operation() == - y_def->as()->operation()) || - (x_def->isA() && y_def->isA() && - x_def->as()->operation() == - y_def->as()->operation()))) { - for (const auto i : c10::irange(x_def->inputs().size())) { - auto x_input = dynamic_cast(x_def->inputs()[i]); - auto y_input = dynamic_cast(y_def->inputs()[i]); - // Both must be kir::Int - TORCH_INTERNAL_ASSERT(x_input && y_input); - if (!cmp(x_input, y_input)) { - return false; - } - } - return true; - } - - return false; - }; - return extentCompare(*this, id1, id2, cmp); + return extentCompare(*this, id1, id2, std::equal_to<>()); } std::string HaloInfo::toString() const { @@ -822,16 +698,19 @@ std::string HaloInfo::toString() const { } bool HaloInfo::needsShiftPredicate(Expr* expr) const { - auto consumer_td = ir_utils::getTVOutput(expr)->domain(); - auto shift_expr = dynamic_cast(expr); - auto gather_expr = dynamic_cast(expr); + // In lowering shift and gather turn into a unary op. We really need the shift + // expr. 
Do a round about trick to grab it: + auto tv_out = ir_utils::getTvOutput(expr); + auto consumer_td = tv_out->domain(); + auto shift_expr = dynamic_cast(tv_out->definition()); + auto gather_expr = dynamic_cast(tv_out->definition()); for (const auto i : c10::irange(consumer_td->getRootDomain().size())) { auto consumer_id = consumer_td->getRootDomain()[i]; const auto consumer_halo_info = getRootAxisInfo(consumer_id); if (consumer_halo_info.hasHalo() || (shift_expr != nullptr && shift_expr->offset(i) != 0 && !consumer_id->isBroadcast()) || - (gather_expr != nullptr && !gather_expr->windowShape()[i]->isOneInt() && + (gather_expr != nullptr && gather_expr->windowShape()[i] != 1 && !consumer_id->isBroadcast())) { return true; } @@ -839,13 +718,6 @@ bool HaloInfo::needsShiftPredicate(Expr* expr) const { return false; } -bool HaloInfo::needsShiftPredicate(kir::Expr* expr) const { - const auto out_tv = expr->outputs()[0]->as(); - auto fuser_expr = out_tv->fuserTv()->definition(); - TORCH_INTERNAL_ASSERT(fuser_expr != nullptr); - return needsShiftPredicate(fuser_expr); -} - } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_shift.h b/torch/csrc/jit/codegen/cuda/lower_shift.h index 378709ca443..c0fea8c1ead 100644 --- a/torch/csrc/jit/codegen/cuda/lower_shift.h +++ b/torch/csrc/jit/codegen/cuda/lower_shift.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -16,16 +16,14 @@ namespace cuda { //! Auxiliary class to represent information about halo of an axis class AxisHaloInfo { public: - AxisHaloInfo(); - //! Width of halo. //! //! pos is either 0 or 1. The width of halo at offset zero is set //! when pos is 0. - kir::Int* width(int pos) const; + int width(int pos) const; //! Sum of the widths of both widths - kir::Int* width() const; + int width() const; const auto& widths() const { return widths_; @@ -34,10 +32,10 @@ class AxisHaloInfo { //! Set the halo width of either side. //! pos is either 0 or 1. The width of halo at offset zero is set //! when pos is 0. - void setWidth(int pos, kir::Int* width); + void setWidth(int pos, int width); //! Extend the halo width to account for another axis. - void merge(int pos, kir::Int* other); + void merge(int pos, int other); //! Extend the halo width to account for another axis. void merge(const AxisHaloInfo& other); @@ -53,7 +51,7 @@ class AxisHaloInfo { //! widths_[0] is non-zero and designates the size of the //! halo. Similarly, non-zero widths_[1] means the axis has halo at //! the other end of the axis. - std::array widths_ = {nullptr, nullptr}; + std::array widths_ = {0, 0}; }; //! Helper class for lowering tensors with halo. Only valid at the @@ -77,7 +75,6 @@ class TORCH_CUDA_CU_API HaloInfo { //! Returns true if id has the root halo information set by //! setRootAxisInfo. bool hasRootAxisInfo(IterDomain* id) const; - bool hasRootAxisInfo(kir::IterDomain* id) const; //! Returns the registed AxisHaloInfo of a root axis. //! @@ -85,9 +82,6 @@ class TORCH_CUDA_CU_API HaloInfo { //! non-root axes. const AxisHaloInfo& getRootAxisInfo(IterDomain* id) const; AxisHaloInfo& getRootAxisInfo(IterDomain* id); - //! KIR version - const AxisHaloInfo& getRootAxisInfo(kir::IterDomain* id) const; - AxisHaloInfo& getRootAxisInfo(kir::IterDomain* id); //! Query if an axis has a halo width. //! @@ -98,12 +92,11 @@ class TORCH_CUDA_CU_API HaloInfo { //! //! It's an error if queried for an axis with no halo width //! information. 
- kir::Int* getHaloWidth(IterDomain* id) const; + int getHaloWidth(IterDomain* id) const; //! Returns an extent if id is extended for halo. Nullptr is //! returned otherwise. - kir::Val* getExtent(IterDomain* id) const; - kir::Val* getExtent(kir::IterDomain* id) const; + Val* getExtent(IterDomain* id) const; //! Returns all child domains of a root domain that inherits the //! halo of the root domain. @@ -135,7 +128,6 @@ class TORCH_CUDA_CU_API HaloInfo { //! interior and another for padding. Predicate insertion is done in //! the ShiftPredicateInserter class below. bool needsShiftPredicate(Expr* expr) const; - bool needsShiftPredicate(kir::Expr* expr) const; std::string toString() const; @@ -166,14 +158,14 @@ class TORCH_CUDA_CU_API HaloInfo { //! Validate shift usage void validate(TensorView* td) const; + void setHaloWidth(IterDomain* id, int halo_width); + private: //! Halo information of root axes std::unordered_map root_axis_map_; - //! KIR version - std::unordered_map kir_root_axis_map_; //! Halo-extended extents. No mapping for axes without halo extension - std::unordered_map kir_extent_map_; + std::unordered_map extent_map_; //! The halo width of an axis. //! @@ -209,7 +201,7 @@ class TORCH_CUDA_CU_API HaloInfo { //! inner axis is merged with another axis of extent M, we know that //! the extent of the resulting output axis is 5*M, but we don't //! create its mapping. - std::unordered_map halo_width_map_; + std::unordered_map halo_width_map_; //! Mappings from root domains to child domains that inherit halo std::unordered_map> @@ -224,9 +216,9 @@ class ShiftPredicateInserter { //! the usual predicated expression, so the insertion is also done //! here. static void insert( - kir::Expr* expr, + Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, bool within_unswitch); }; diff --git a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp index a7f8768883d..8721490feb7 100644 --- a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include @@ -17,55 +16,49 @@ namespace cuda { namespace { -kir::Bool* getPredicatePerParallelType( +Bool* getPredicatePerParallelType( ParallelType pt, const ThreadPredicateMap::PredicateInfo& pred_info) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - auto pt_dim = GpuLower::current()->parallelDimensionMap().get(pt); // If pt is not used or is proven to be one, no need to predicate. if (pt_dim == nullptr || pt_dim->isOneInt()) { - return ir_builder.trueVal(); + return GpuLower::current()->kernel()->trueVal(); } - // When BID needs to be predicated, that means it's an output of a grid // reduction and only the last block index in that dimension has the right // value from the grid reduce. 
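As a rough illustration of the two predicate shapes chosen here, built as strings rather than IR nodes (blockIdx.x/gridDim.x are the usual CUDA builtins, not names taken from this diff):

    #include <iostream>
    #include <string>

    // Guard for a parallel dimension: outputs of a grid reduction are only
    // valid in the last block along that dimension; everything else is
    // guarded so that only index 0 performs the write.
    std::string predicateFor(const std::string& index,
                             const std::string& dim,
                             bool is_block_dim,
                             bool limited_by_grid_reduction) {
      if (is_block_dim && limited_by_grid_reduction) {
        return index + " == (" + dim + " - 1)";
      }
      return index + " == 0";
    }

    int main() {
      std::cout << predicateFor("blockIdx.x", "gridDim.x", true, true) << "\n";
      std::cout << predicateFor("threadIdx.y", "blockDim.y", false, false) << "\n";
      return 0;
    }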
if (isParallelTypeBlockDim(pt) && pred_info.limited_types.get(pt)) { - return ir_builder - .eqExpr( - kir::NamedScalar::getParallelIndex(pt), - ir_builder.subExpr( - kir::NamedScalar::getParallelDim(pt), ir_builder.oneVal())) - ->as(); + return SimplifyingIrBuilder::eqExpr( + NamedScalar::getParallelIndex(pt), + SimplifyingIrBuilder::subExpr( + NamedScalar::getParallelDim(pt), + GpuLower::current()->kernel()->oneVal())) + ->as(); } // Otherwise, only thread of index 0 executes the computation - return ir_builder - .eqExpr(kir::NamedScalar::getParallelIndex(pt), ir_builder.zeroVal()) - ->as(); + return SimplifyingIrBuilder::eqExpr( + NamedScalar::getParallelIndex(pt), + GpuLower::current()->kernel()->zeroVal()) + ->as(); } } // namespace -kir::Bool* ThreadPredicateMap::getPredicateFromPredicateInfo( +Bool* ThreadPredicateMap::getPredicateFromPredicateInfo( const ThreadPredicateMap::PredicateInfo& pred_info) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - const auto pred_types = pred_info.limited_types | pred_info.redundant_types; if (pred_types.none()) { - return ir_builder.trueVal(); + return GpuLower::current()->kernel()->trueVal(); } - kir::Bool* pred = nullptr; - + Bool* pred = nullptr; for (const auto pt : pred_types) { const auto tp = getPredicatePerParallelType(pt, pred_info); - pred = ir_builder.andExpr(pred, tp)->as(); + pred = SimplifyingIrBuilder::andExpr(pred, tp)->as(); } - TORCH_INTERNAL_ASSERT(pred != nullptr); return pred; @@ -191,7 +184,9 @@ void ThreadPredicateMap::updateBitSet(const Expr* expr) { if (id->isReduction()) { id_reductions.set(id->getParallelType()); } - if (id->isBroadcast()) { + if (id->isBroadcast() && + GpuLower::current()->concretizedBroadcastDomains().isConcretized( + id)) { id_bcasts.set(id->getParallelType()); } } @@ -302,7 +297,7 @@ void ThreadPredicateMap::insert( thread_predicates_.insert({tv, pred_info}); } -kir::Bool* ThreadPredicateMap::getPredicate(const TensorView* tv) const { +Bool* ThreadPredicateMap::getPredicate(const TensorView* tv) const { TORCH_INTERNAL_ASSERT(find(tv) != end(), "Couldn't find ", tv); auto pred_info = getPredicateInfo(tv); return getPredicateFromPredicateInfo(pred_info); @@ -326,7 +321,8 @@ ParallelTypeBitmap ThreadPredicateMap::getParallelBroadcastDomains( const bool output_smem = tv->getMemoryType() == MemoryType::Shared; for (auto id : iter_domains) { - if (!id->isBroadcast()) { + if (!id->isBroadcast() || + !GpuLower::current()->concretizedBroadcastDomains().isConcretized(id)) { continue; } if (id->isBlockDim() || (!output_smem && id->isThreadDim())) { diff --git a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h index 256e0385aeb..0d7a2685b32 100644 --- a/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h +++ b/torch/csrc/jit/codegen/cuda/lower_thread_predicate.h @@ -1,7 +1,7 @@ #pragma once -#include +#include #include #include @@ -69,7 +69,7 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { ParallelTypeBitmap getPredicatedParallelTypes(const TensorView* tv) const; //! Returns a Bool predicate for a given TensorView. - kir::Bool* getPredicate(const TensorView* tv) const; + Bool* getPredicate(const TensorView* tv) const; //! Returns a ParallelTypeBitmap representing which domain needs //! blockBroadcast. @@ -81,7 +81,7 @@ class TORCH_CUDA_CU_API ThreadPredicateMap { void print() const; //! Generate a Bool value from PredicateInfo. 
- static kir::Bool* getPredicateFromPredicateInfo( + static Bool* getPredicateFromPredicateInfo( const ThreadPredicateMap::PredicateInfo& pred_info); private: diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp new file mode 100644 index 00000000000..ab62530591a --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +void ConcretizedBroadcastDomains::build(Fusion* fusion) { + // Initialize the origin map with input broadcast domains + for (const auto fusion_input_tv : + ir_utils::filterByType(fusion->inputs())) { + for (auto root_id : fusion_input_tv->getRootDomain()) { + if (root_id->isBroadcast()) { + broadcast_origin_map_.emplace( + root_id, std::unordered_set({root_id})); + } + } + } + traverse(fusion); +} + +bool ConcretizedBroadcastDomains::isConcretized(IterDomain* id) const { + auto it = concretized_domains_.find(id); + return it != concretized_domains_.end(); +} + +void ConcretizedBroadcastDomains::handle(BroadcastOp* bop) { + // Create a new entry for each of new broadcast domains + auto out = bop->out()->as(); + for (const auto i : c10::irange(out->getRootDomain().size())) { + if (bop->getBroadcastDimFlags().at(i)) { + auto new_bcast_id = out->getRootDomain().at(i); + broadcast_origin_map_.emplace( + new_bcast_id, std::unordered_set({new_bcast_id})); + } + } +} + +void ConcretizedBroadcastDomains::handle(Expr* expr) { + IterVisitor::handle(expr); + + // Propagate broadcast origin info from producers to consumers + for (auto producer : ir_utils::filterByType(expr->inputs())) { + std::unordered_set producer_broadcasts; + // This assumes there's no merged broadcast axes between root and rfactor + // domains which is not possible at the moment. If this assumption is ever + // invalidated we would need to manaually propagate root IDs to rfactor IDs. + for (auto producer_id : producer->getMaybeRFactorDomain()) { + if (producer_id->isBroadcast()) { + producer_broadcasts.insert(producer_id); + } + } + if (producer_broadcasts.empty()) { + continue; + } + + for (auto consumer : ir_utils::filterByType(expr->outputs())) { + auto p2c_map = + PairwiseRootDomainMap(producer, consumer) + .mapProducerToConsumer( + producer->domain(), consumer->domain(), producer_broadcasts); + for (const auto& kv : p2c_map) { + auto p_id = kv.first; + auto c_id = kv.second; + const bool is_concretized = !c_id->isBroadcast(); + auto it = broadcast_origin_map_.find(p_id); + TORCH_INTERNAL_ASSERT( + it != broadcast_origin_map_.end(), + "Broadcast origin info not found for producer broadcast domain: ", + p_id->toString(), + " of ", + producer->toString()); + const auto& producer_origins = it->second; + if (is_concretized) { + // Keep track of all the origin domains as concretized + for (auto origin : producer_origins) { + // concretized_root_domains_.insert(origin); + markAsConcretized(origin); + } + } else { + // Not concretized yet. Propagate forward the origin info. 
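A minimal sketch of this origin bookkeeping, assuming integer IDs in place of IterDomains and a precomputed producer-to-consumer pairing: when the mapped consumer domain is no longer a broadcast, every origin is marked concretized; otherwise the origin set is forwarded and extended with the consumer domain itself:

    #include <iostream>
    #include <unordered_map>
    #include <unordered_set>
    #include <utility>
    #include <vector>

    using Id = int;

    int main() {
      // Origins of each known broadcast domain (a domain can have several).
      std::unordered_map<Id, std::unordered_set<Id>> origin_map = {{10, {10}}};
      std::unordered_set<Id> concretized;

      // Producer->consumer root mapping plus whether the consumer side is
      // still a broadcast domain. Here domain 10 maps to non-broadcast 20.
      std::vector<std::pair<std::pair<Id, Id>, bool>> p2c = {{{10, 20}, false}};

      for (const auto& entry : p2c) {
        Id p_id = entry.first.first;
        Id c_id = entry.first.second;
        bool consumer_is_broadcast = entry.second;
        const auto& producer_origins = origin_map.at(p_id);
        if (!consumer_is_broadcast) {
          // Concretized: every original broadcast domain is now marked.
          concretized.insert(producer_origins.begin(), producer_origins.end());
        } else {
          // Still a broadcast: forward the origin set to the consumer domain.
          auto& consumer_origins = origin_map[c_id];
          consumer_origins.insert(producer_origins.begin(), producer_origins.end());
          consumer_origins.insert(c_id);
        }
      }

      std::cout << "domain 10 concretized: "
                << (concretized.count(10) ? "yes" : "no") << "\n";
      return 0;
    }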
+ auto& consumer_origins = broadcast_origin_map_[c_id]; + for (auto origin : producer_origins) { + consumer_origins.insert(origin); + } + consumer_origins.insert(c_id); + } + } + } + } +} + +void ConcretizedBroadcastDomains::markAsConcretized(IterDomain* root_domain) { + std::deque child_domains({root_domain}); + while (!child_domains.empty()) { + auto child = child_domains.front(); + child_domains.pop_front(); + if (!concretized_domains_.emplace(child).second) { + continue; + } + const auto& child_uses = child->uses(); + for (auto child_use : child_uses) { + for (auto out_id : + ir_utils::filterByType(child_use->outputs())) { + child_domains.push_back(out_id); + } + } + } +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h new file mode 100644 index 00000000000..9dd50e8afc1 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_broadcast.h @@ -0,0 +1,51 @@ +#pragma once + +#include + +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +//! Traverse and collect all concretized broadcast domains. +//! +//! The traversal first initializes the origin map with broadcast +//! domains in input tensors. Then, a new entry is added to the origin +//! map when a broadcast op is encountered during a forward traversal +//! of the given fusion. For non-broadcast ops, mappings are just +//! propagated forward using PairwiseRootDomainMap. +//! +//! When the mapped consumer domain is not broadcast, it means the +//! producer broadcast domain is concretized, and its origin broadcast +//! domains are marked as concretized. +class TORCH_CUDA_CU_API ConcretizedBroadcastDomains : private IterVisitor { + public: + void build(Fusion* fusion); + + bool isConcretized(IterDomain* id) const; + + private: + using IterVisitor::handle; + + void handle(BroadcastOp* bop) final; + + void handle(Expr* expr) final; + + void markAsConcretized(IterDomain* root_domain); + + private: + //! Maps each broadcast domain to its original broadcast + //! domains. Their can be multiple original domains due to, e.g., + //! binary ops with broadcast domains in both inputs. + std::unordered_map> + broadcast_origin_map_; + //! 
Set of all concretized original domains + std::unordered_set concretized_domains_; +}; + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp index 33651785d43..a8905b4d404 100644 --- a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.cpp @@ -74,7 +74,7 @@ bool analyzeIfDerivedFromTrivialReduction(TensorView* tv, IterDomain* id) { } // namespace -void TrivialReductionInfo::build(Fusion* fusion, GpuLower* gpu_lower) { +void TrivialReductionInfo::build(Fusion* fusion) { auto used_vals = fusion->usedMathVals(); for (auto tv : ir_utils::filterByType(used_vals)) { @@ -99,20 +99,6 @@ void TrivialReductionInfo::build(Fusion* fusion, GpuLower* gpu_lower) { } } } - - buildKir(fusion, gpu_lower); -} - -void TrivialReductionInfo::buildKir(Fusion* fusion, GpuLower* gpu_lower) { - for (auto id : domains_) { - auto kir_trivial_id = gpu_lower->lowerValue(id)->as(); - kir_domains_.insert(kir_trivial_id); - } - - for (auto id : domains_derived_from_root_) { - auto kir_trivial_id = gpu_lower->lowerValue(id)->as(); - kir_domains_derived_from_root_.insert(kir_trivial_id); - } } bool TrivialReductionInfo::isDerived(IterDomain* id) const { @@ -124,15 +110,6 @@ bool TrivialReductionInfo::isDerivedFromRoot(IterDomain* id) const { domains_derived_from_root_.end(); } -bool TrivialReductionInfo::isDerived(kir::IterDomain* id) const { - return kir_domains_.find(id) != kir_domains_.end(); -} - -bool TrivialReductionInfo::isDerivedFromRoot(kir::IterDomain* id) const { - return kir_domains_derived_from_root_.find(id) != - kir_domains_derived_from_root_.end(); -} - } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h index c16439ed4f0..9ccbc2f7828 100644 --- a/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h +++ b/torch/csrc/jit/codegen/cuda/lower_trivial_reductions.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -13,24 +13,15 @@ namespace jit { namespace fuser { namespace cuda { -class GpuLower; - //! Detect almost all IterDomains that are derived from trivial //! reductons. class TORCH_CUDA_CU_API TrivialReductionInfo { public: - void build(Fusion* fusion, GpuLower* gpu_lower); + void build(Fusion* fusion); bool isDerived(IterDomain* id) const; bool isDerivedFromRoot(IterDomain* id) const; - bool isDerived(kir::IterDomain* id) const; - bool isDerivedFromRoot(kir::IterDomain* id) const; - - private: - //! Convert the sets to KIR sets - void buildKir(Fusion* fusion, GpuLower* gpu_lower); - private: //! IterDomains that are derived only from trivial //! reductons. Included domains are not limited to reduction axes as @@ -48,9 +39,6 @@ class TORCH_CUDA_CU_API TrivialReductionInfo { //! trivial reductions. These domains do not need to manifest as //! for-loops. 
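For intuition, assuming "trivial" here means a reduction over an axis statically known to have extent one (so the loop body would run exactly once), the core check could be sketched as below; the real analysis also follows domains merged or split from such axes:

    #include <iostream>
    #include <optional>

    struct Axis {
      bool is_reduction = false;
      std::optional<long> constant_extent;  // nullopt if the extent is symbolic
    };

    // An axis that reduces over a single element can be dropped from the
    // loop nest entirely: the "reduction" is just a copy.
    bool isTrivialReduction(const Axis& axis) {
      return axis.is_reduction && axis.constant_extent.has_value() &&
          *axis.constant_extent == 1;
    }

    int main() {
      std::cout << isTrivialReduction({true, 1}) << "\n";    // 1: trivial
      std::cout << isTrivialReduction({true, 128}) << "\n";  // 0: real reduction
      std::cout << isTrivialReduction({false, 1}) << "\n";   // 0: not a reduction
      return 0;
    }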
std::unordered_set domains_derived_from_root_; - - std::unordered_set kir_domains_; - std::unordered_set kir_domains_derived_from_root_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp b/torch/csrc/jit/codegen/cuda/lower_unroll.cpp index 08f91ba59bd..c4f926131a8 100644 --- a/torch/csrc/jit/codegen/cuda/lower_unroll.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_unroll.cpp @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include #include @@ -22,8 +20,7 @@ namespace { // Provide a new for loop matching the one provided kir::ForLoop* cloneLoopNest(const kir::ForLoop* for_loop) { - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - const auto new_loop = ir_builder.create(for_loop); + const auto new_loop = IrBuilder::create(for_loop); for (auto expr : for_loop->body().exprs()) { if (auto nested_for_loop = dynamic_cast(expr)) { expr = cloneLoopNest(nested_for_loop); @@ -35,20 +32,20 @@ kir::ForLoop* cloneLoopNest(const kir::ForLoop* for_loop) { // Returns true if expr is an expression that initializes a reduction // buffer. -bool isReductionInitExpr(const kir::Expr* expr) { +bool isReductionInitExpr(const Expr* expr) { // False if its output isn't a TensorView - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { return false; } // False if it doesn't have any reduction axis - const auto out_tv = expr->outputs()[0]->as(); + const auto out_tv = expr->outputs()[0]->as(); if (!out_tv->domain()->hasReduction()) { return false; } // False if it has have TensorView inputs as initialization should // never use TensorViews const auto tv_filter_inp_view = - ir_utils::filterByType(expr->inputs()); + ir_utils::filterByType(expr->inputs()); if (tv_filter_inp_view.begin() != tv_filter_inp_view.end()) { return false; } @@ -57,28 +54,27 @@ bool isReductionInitExpr(const kir::Expr* expr) { } // namespace -void UnrollPass::handle(kir::Expr* expr) { - if (ir_utils::isTVOp(expr)) { +void UnrollPass::handle(Expr* expr) { + if (ir_utils::isTvOp(expr)) { // If tv op, predicate it - const auto out_tv = ir_utils::getTVOutput(expr); + const auto out_tv = ir_utils::getTvOutput(expr); const bool should_predicate = !for_loops_.empty() || - out_tv->memoryType() == MemoryType::Global || - out_tv->memoryType() == MemoryType::Shared; + out_tv->getMemoryType() == MemoryType::Global || + out_tv->getMemoryType() == MemoryType::Shared; if (!should_predicate) { return; } - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); const auto thread_pred = isReductionInitExpr(expr) - ? ir_builder.trueVal() - : GpuLower::current()->threadPredMap().getPredicate(out_tv->fuserTv()); + ? GpuLower::current()->kernel()->trueVal() + : GpuLower::current()->threadPredMap().getPredicate(out_tv); // When this expr is in an unswitched block, only attach the // thread predicate to the expr as thread predicates are not // grouped to the unswitch predicate. kir::Predicate* thread_pred_expr = nullptr; if (unswitched_loop_) { - thread_pred_expr = ir_builder.create(thread_pred); + thread_pred_expr = IrBuilder::create(thread_pred); } non_trivial_pred_found_ = true; @@ -95,7 +91,7 @@ void UnrollPass::handle(kir::Expr* expr) { if (!isReductionInitExpr(expr) && out_tv->domain()->hasReduction()) { const auto write_pred = unswitched_loop_ ? 
thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::ReductionWrite, expr, thread_pred); expr->setWritePredicate(write_pred); } @@ -105,7 +101,7 @@ void UnrollPass::handle(kir::Expr* expr) { if (ir_utils::hasBlockSync(expr, GpuLower::current()->threadPredMap())) { const auto pred = unswitched_loop_ ? thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::Inline, expr, thread_pred); expr->setPredicate(pred); return; @@ -116,28 +112,28 @@ void UnrollPass::handle(kir::Expr* expr) { if (!unswitched_loop_ && std::any_of( for_loops_.begin(), for_loops_.end(), [](const kir::ForLoop* fl) { - return fl->iter_domain()->parallelType() == + return fl->iter_domain()->getParallelType() == ParallelType::Vectorize; })) { - pred = ir_builder.create(PredicateType::Vectorize); + pred = IrBuilder::create(PredicateType::Vectorize); } if (pred == nullptr) { pred = unswitched_loop_ ? thread_pred_expr - : ir_builder.create( + : IrBuilder::create( PredicateType::Inline, expr, thread_pred); } // If we need a predicate, put expr inside an if then else - kir::IfThenElse* inline_ite = ir_builder.create(pred); + kir::IfThenElse* inline_ite = IrBuilder::create(pred); if (for_loops_.empty()) { // Special handling for top level output expressions that still // need predicates. One motivating example is a reduction op that // reduces to a scalar (issue #491) - expr_replacement_map_.insert({expr, inline_ite}); + kir::ExprMutator::registerReplace(expr, inline_ite, nullptr); } else { - for_loops_.back()->body().insert_before(expr, inline_ite); - for_loops_.back()->body().erase(expr); + kir::ExprMutator::registerReplace( + expr, inline_ite, &for_loops_.back()->body()); } inline_ite->thenBody().push_back(expr); } else if (auto for_loop = dynamic_cast(expr)) { @@ -150,8 +146,8 @@ void UnrollPass::handle(kir::Expr* expr) { void UnrollPass::handle(kir::ForLoop* fl) { // Setup for loop scoping const bool is_unroll = - fl->iter_domain()->parallelType() == ParallelType::Unroll || - fl->iter_domain()->parallelType() == ParallelType::Unswitch; + fl->iter_domain()->getParallelType() == ParallelType::Unroll || + fl->iter_domain()->getParallelType() == ParallelType::Unswitch; // If we're not looking for an unroll loop, or didn't find one, process as // normal. @@ -172,10 +168,9 @@ void UnrollPass::handle(kir::ForLoop* fl) { return; } - kir::IrBuilder ir_builder(GpuLower::current()->kernel()); - auto unroll_pred = ir_builder.create(fl); + auto unroll_pred = IrBuilder::create(fl); - kir::IfThenElse* unroll_ite = ir_builder.create(unroll_pred); + kir::IfThenElse* unroll_ite = IrBuilder::create(unroll_pred); // Get the loop nest for the unrolled path kir::ForLoop* unrolled_loop_nest = cloneLoopNest(fl); @@ -199,12 +194,18 @@ void UnrollPass::handle(kir::ForLoop* fl) { handle(inlined_loop); look_for_unroll_ = true; if (!non_trivial_pred_found_) { - expr_replacement_map_.insert({fl, inlined_loop}); + kir::ExprMutator::registerReplace( + fl, + inlined_loop, + for_loops_.empty() ? nullptr : &for_loops_.back()->body()); } else { if (!canOmitElseClause(fl)) { unroll_ite->elseBody().push_back(inlined_loop); } - expr_replacement_map_.insert({fl, unroll_ite}); + kir::ExprMutator::registerReplace( + fl, + unroll_ite, + for_loops_.empty() ? 
nullptr : &for_loops_.back()->body()); } } @@ -221,14 +222,14 @@ bool UnrollPass::canOmitElseClause(kir::ForLoop* fl) { // If there's any expression that requires barrier // synchronization, the else part can't be omitted for (auto expr : loop->body().exprs()) { - if (expr->isA()) { + if (expr->isA()) { const ParallelTypeBitmap domains = pred_map.getParallelBroadcastDomains( - expr->outputs()[0]->as()->fuserTv()); + expr->outputs()[0]->as()); if (domains.any()) { return false; } - } else if (expr->isA() || expr->isA()) { - auto td = ir_utils::getTVOutput(expr)->domain(); + } else if (expr->isA() || expr->isA()) { + auto td = ir_utils::getTvOutput(expr)->domain(); if (td->hasBlockReduction() || td->hasGridReduction()) { return false; } @@ -238,14 +239,14 @@ bool UnrollPass::canOmitElseClause(kir::ForLoop* fl) { // unswitch predicate is sufficient. // When the loop stop is the same as the extent of its IterDomain, // the per-thread visit count is guaranteed to be one at most (see - // CudaKernelGenerator::visit(kir::ForLoop*) as well. Also, when a + // CudaKernelGenerator::handle(kir::ForLoop*) as well. Also, when a // loop is vectorized (not misaligned), the count must be one at // most. Even if not parallelized nor vectoirzed, it is also // sufficient if the loop stop is in fact one. bool visit_once = false; auto id = loop->iter_domain(); if ((id->isThread() && (loop->stop() == id->extent())) || - id->parallelType() == ParallelType::Vectorize) { + id->getParallelType() == ParallelType::Vectorize) { visit_once = true; } if (!visit_once) { @@ -273,30 +274,18 @@ bool UnrollPass::canOmitElseClause(kir::ForLoop* fl) { } // Generate the loop nest structure and place it in lowered_exprs -UnrollPass::UnrollPass(const std::vector& exprs) { +UnrollPass::UnrollPass(const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::UnrollPass::computeMap"); - - // Run through loop nests and further lower the expressions - for (auto* expr : exprs) { - handle(expr); - } + kir::ExprMutator::traverseAndInsert(exprs); } -std::vector UnrollPass::runPass( +std::vector UnrollPass::runPass( Fusion* fusion, - const std::vector& exprs) { + const std::vector& exprs) { FUSER_PERF_SCOPE("GpuLower::Lower::UnrollPass::runPass"); UnrollPass unroll_pass(exprs); - - std::vector mutated_exprs; - mutated_exprs.reserve(exprs.size()); - for (auto expr : exprs) { - mutated_exprs.push_back( - ir_utils::applyReplacements(unroll_pass.replacementMap(), expr)); - } - - return mutated_exprs; + return unroll_pass.exprs_; } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/lower_unroll.h b/torch/csrc/jit/codegen/cuda/lower_unroll.h index bec4966dd94..14725c405b7 100644 --- a/torch/csrc/jit/codegen/cuda/lower_unroll.h +++ b/torch/csrc/jit/codegen/cuda/lower_unroll.h @@ -1,7 +1,8 @@ #pragma once -#include +#include #include +#include #include #include #include @@ -51,33 +52,32 @@ namespace cuda { //! predicate still in the inner most loop, making sure that we cover edges and //! corners. //! 
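The loop structure this pass emits can be mimicked by hand in plain C++ as a rough sketch of the idea: one coarse check guards an unrolled body with no per-element predicates, and a fully predicated copy of the loop covers the edges and corners:

    #include <cstdio>
    #include <vector>

    constexpr int kUnroll = 4;

    void scale(std::vector<float>& x, float factor) {
      const int n = static_cast<int>(x.size());
      for (int base = 0; base < n; base += kUnroll) {
        if (base + kUnroll <= n) {
          // "Unrolled" path: the single check above proves every access is
          // in bounds, so the body carries no per-element predicate.
          for (int j = 0; j < kUnroll; ++j) {
            x[base + j] *= factor;
          }
        } else {
          // Fallback path: keep the per-element predicate to cover the edge.
          for (int j = 0; j < kUnroll; ++j) {
            if (base + j < n) {
              x[base + j] *= factor;
            }
          }
        }
      }
    }

    int main() {
      std::vector<float> x = {1, 2, 3, 4, 5, 6};
      scale(x, 2.0f);
      for (float v : x) {
        std::printf("%g ", v);
      }
      std::printf("\n");
      return 0;
    }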
-class TORCH_CUDA_CU_API UnrollPass { +class TORCH_CUDA_CU_API UnrollPass : kir::ExprMutator { public: // Take the incoming exprs and run loop unrolling, returning the new IR - static std::vector runPass( + static std::vector runPass( Fusion* fusion, - const std::vector& exprs); + const std::vector& exprs); static bool canOmitElseClause(kir::ForLoop* fl); private: // Generate the for Expr replacement map - UnrollPass(const std::vector& exprs); + UnrollPass(const std::vector& exprs); - const std::unordered_map& replacementMap() const { + const std::unordered_map& replacementMap() const { return expr_replacement_map_; } - void handle(kir::ForLoop* fl); + using OptOutDispatch::handle; - void handle(kir::Expr* expr); + void handle(kir::ForLoop* fl) final; + + void handle(Expr* expr) final; private: // We will track which loops in the incoming IR will be replaced and by what - std::unordered_map expr_replacement_map_; - - // Keep all for loops conveniently to make unrolling easier - std::vector for_loops_; + std::unordered_map expr_replacement_map_; // keep track if we're within an unrolled loop bool look_for_unroll_ = true; diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.cpp b/torch/csrc/jit/codegen/cuda/lower_utils.cpp index 5d015c450d9..ba2f618efae 100644 --- a/torch/csrc/jit/codegen/cuda/lower_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_utils.cpp @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include #include @@ -23,38 +21,14 @@ namespace cuda { namespace scope_utils { -std::vector getLoops(kir::Expr* scope) { - std::vector loops; - while (scope != nullptr) { - if (auto loop = dynamic_cast(scope)) { - loops.push_back(loop); - } - scope = scope->parentScope(); - } - std::reverse(loops.begin(), loops.end()); - return loops; -} - -void insertBefore(kir::Expr* scope, kir::Expr* ref, kir::Expr* expr) { - if (auto ite = dynamic_cast(scope)) { - ite->thenBody().insert_before(ref, expr); - } else if (auto for_loop = dynamic_cast(scope)) { - for_loop->body().insert_before(ref, expr); - } else { - TORCH_INTERNAL_ASSERT(false, "Unexpected scope expression"); - } -} - //! Create an **empty** Forloop and copy the metadata. -kir::ForLoop* cloneForLoop(kir::IrBuilder& ir_builder, kir::ForLoop* for_loop) { - return ir_builder.create(for_loop); +kir::ForLoop* cloneForLoop(kir::ForLoop* for_loop) { + return IrBuilder::create(for_loop); } //! Create an **empty** IfThenElse and copy the metadata. -kir::IfThenElse* cloneIfThenElse( - kir::IrBuilder& ir_builder, - kir::IfThenElse* ite) { - return ir_builder.create(ite->predicate()); +kir::IfThenElse* cloneIfThenElse(kir::IfThenElse* ite) { + return IrBuilder::create(ite->predicate()); } } // namespace scope_utils @@ -103,17 +77,18 @@ std::vector iterDomainInputsOfOrderedAs( } bool isTV(const Val* val) { - return val->getValType().value() == ValType::TensorView; + return val->getValType().value() == ValType::TensorView || + val->getValType().value() == ValType::TensorIndex; } // Check if we're a TensorView op that we can generate code for. 
-bool isTVOp(const Expr* expr) { +bool isTvOp(const Expr* expr) { if (std::any_of( expr->outputs().begin(), expr->outputs().end(), [](Val* v) { return isTV(v); }) && - (expr->getExprType().value() == ExprType::BinaryOp || - expr->getExprType().value() == ExprType::UnaryOp || + (expr->getExprType().value() == ExprType::UnaryOp || + expr->getExprType().value() == ExprType::BinaryOp || expr->getExprType().value() == ExprType::TernaryOp || expr->getExprType().value() == ExprType::ReductionOp || expr->getExprType().value() == ExprType::WelfordOp || @@ -121,28 +96,26 @@ bool isTVOp(const Expr* expr) { expr->getExprType().value() == ExprType::TransposeOp || expr->getExprType().value() == ExprType::ShiftOp || expr->getExprType().value() == ExprType::GatherOp || - expr->getExprType().value() == ExprType::ViewOp)) { + expr->getExprType().value() == ExprType::ViewOp || + expr->getExprType().value() == ExprType::GridReduction || + expr->getExprType().value() == ExprType::GridBroadcast || + expr->getExprType().value() == ExprType::GridWelford)) { return true; } return false; } -bool isTVOp(const kir::Expr* expr) { - const auto& outputs = expr->outputs(); - return outputs.size() >= 1 && outputs[0]->isA(); -} - -kir::TensorView* getTv(kir::Val* val) { - if (auto tv = dynamic_cast(val)) { - return tv; - } else if (auto ti = dynamic_cast(val)) { - return ti->view(); +TensorView* getTv(Val* val) { + if (val->isA()) { + return val->as(); + } else if (val->isA()) { + return val->as()->view(); } return nullptr; } -std::vector getTvs(const std::vector& vals) { - std::vector tvs; +std::vector getTvs(const std::vector& vals) { + std::vector tvs; for (auto val : vals) { auto tv = ir_utils::getTv(val); if (tv) { @@ -152,32 +125,7 @@ std::vector getTvs(const std::vector& vals) { return tvs; } -kir::TensorView* asTv(kir::Val* val) { - auto tv = getTv(val); - TORCH_INTERNAL_ASSERT(tv != nullptr, "Neigher TensorView nor TensorIndex"); - return tv; -} - -std::vector asTvs(const std::vector vals) { - std::vector tvs; - for (auto val : vals) { - auto tv = ir_utils::asTv(val); - tvs.emplace_back(tv); - } - return tvs; -} - -// TODO: why do we assume there's a single TV output? 
-TensorView* getTVOutput(const Expr* expr) { - for (auto out : expr->outputs()) { - if (out->getValType().value() == ValType::TensorView) { - return out->as(); - } - } - return nullptr; -} - -kir::TensorView* getTVOutput(const kir::Expr* expr) { +TensorView* getTvOutput(const Expr* expr) { for (auto out : expr->outputs()) { if (auto tv = getTv(out)) { return tv; @@ -193,25 +141,20 @@ bool isScalarOp(const Expr* expr) { return true; } -Expr* asExpr(Statement* stmt) { - TORCH_INTERNAL_ASSERT(stmt->isExpr()); - return stmt->as(); -} - -TensorView* asTV(Val* val) { - TORCH_INTERNAL_ASSERT(isTV(val)); - return val->as(); -} - bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map) { - if (!isTVOp(expr)) { + if (!isTvOp(expr)) { return false; } - auto tv = getTVOutput(expr); + if (!(expr->isA() || expr->isA() || + expr->isA() || expr->isA() || + expr->isA() || expr->isA())) { + return false; + } - if ((expr->isA() || expr->isA()) && - (tv->hasBlockReduction() || tv->hasGridReduction())) { + auto tv = getTvOutput(expr); + + if (tv->hasBlockReduction() || tv->hasGridReduction()) { return true; } else if (expr->isA()) { const ParallelTypeBitmap pt_map = @@ -222,64 +165,22 @@ bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map) { return false; } -bool hasBlockSync(const kir::Expr* expr, const ThreadPredicateMap& pred_map) { - if (expr->isA() || expr->isA() || - expr->isA() || expr->isA() || - expr->isA() || expr->isA()) { - auto fuser_tv = getTVOutput(expr)->fuserTv(); - auto fuser_expr = fuser_tv->definition(); - TORCH_INTERNAL_ASSERT(fuser_expr != nullptr); - return hasBlockSync(fuser_expr, pred_map); - } - - return false; -} - -kir::Expr* applyReplacements( - const std::unordered_map& expr_replacement_map, - kir::Expr* expr) { - auto handle_scope = [&](kir::Scope& scope) { - for (const auto i : c10::irange(scope.size())) { - scope[i] = applyReplacements(expr_replacement_map, scope[i]); - } - }; - - const auto it = expr_replacement_map.find(expr); - if (it != expr_replacement_map.end()) { - return it->second; - } else { - if (auto for_loop = dynamic_cast(expr)) { - handle_scope(for_loop->body()); - } else if (auto ite = dynamic_cast(expr)) { - handle_scope(ite->thenBody()); - handle_scope(ite->elseBody()); - } - return expr; - } -} - -c10::optional getMaybeWarpReductionDim( - const kir::ReductionOp* node) { - auto kir_tv = ir_utils::getTVOutput(node); - if (!kir_tv) { +c10::optional getMaybeWarpReductionDim(const ReductionOp* node) { + auto tv_out = getTv(node->out()); + if (tv_out == nullptr) { return c10::nullopt; } - auto fuser_reduction = kir_tv->fuserTv()->definition()->as(); - return getMaybeWarpReductionDim(fuser_reduction); -} -c10::optional getMaybeWarpReductionDim(const ReductionOp* node) { - auto fuser_tv_out = node->out()->as(); - auto fuser_tv_in = node->in()->as(); + auto tv_in = getTv(node->in()); // only support reducing to registers for now. 
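The eligibility test applied below boils down to a divisibility check against the warp size; a minimal sketch assuming a fixed warp size of 32 and covering only the constant-extent case (the real code queries the device warp size and handles padded dimensions as well):

    #include <iostream>

    constexpr int kWarpSize = 32;  // assumed; queried from the device in practice

    // A reduction along threadIdx.x can use warp shuffles when the reduced
    // extent is statically known and fills whole warps.
    bool canUseWarpReduction(bool parallelized_on_tidx, bool extent_is_constant,
                             long extent) {
      return parallelized_on_tidx && extent_is_constant &&
          extent % kWarpSize == 0;
    }

    int main() {
      std::cout << canUseWarpReduction(true, true, 128) << "\n";   // 1
      std::cout << canUseWarpReduction(true, true, 100) << "\n";   // 0
      std::cout << canUseWarpReduction(false, true, 128) << "\n";  // 0
      return 0;
    }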
- if (fuser_tv_in->getMemoryType() != MemoryType::Local || - fuser_tv_out->getMemoryType() != MemoryType::Local) { + if (tv_in->getMemoryType() != MemoryType::Local || + tv_out->getMemoryType() != MemoryType::Local) { return c10::nullopt; } IterDomain* reduction_on_xdim = nullptr; - for (auto id : fuser_tv_out->domain()->domain()) { + for (auto id : tv_out->domain()->domain()) { // Currently warp reduction only allows // serial and block.x parallel reductions if (id->isReduction() && id->isParallelized()) { @@ -302,7 +203,7 @@ c10::optional getMaybeWarpReductionDim(const ReductionOp* node) { return c10::optional(reduction_on_xdim); } - if (reduction_on_xdim->extent()->isConstScalar()) { + if (reduction_on_xdim->extent()->isConst()) { auto extent_value = reduction_on_xdim->extent()->getInt().value(); if (extent_value % at::cuda::warp_size() == 0) { return c10::optional(reduction_on_xdim); @@ -329,22 +230,22 @@ bool derivedFromRootCAAxes(const TensorView* tv, IterDomain* axis) { }); } -std::unordered_map getParallelDomains( - kir::Val* val) { - kir::TensorView* kir_tv = nullptr; - if (val->isA()) { - kir_tv = val->as(); +std::unordered_map getParallelDomains( + Val* val) { + TensorView* tv = nullptr; + if (val->isA()) { + tv = val->as(); } else if (val->isA()) { - kir_tv = val->as()->view(); + tv = val->as()->view(); } else { TORCH_INTERNAL_ASSERT( false, "Provided val is not TensorIndex or TensorView."); } - std::unordered_map parallel_domains; - for (auto d : kir_tv->domain()->domain()) { + std::unordered_map parallel_domains; + for (auto d : tv->domain()->domain()) { if (d->isThread()) { - parallel_domains.insert(std::make_pair(d->parallelType(), d)); + parallel_domains.insert(std::make_pair(d->getParallelType(), d)); } } return parallel_domains; @@ -354,29 +255,60 @@ std::unordered_map getParallelDomains( namespace loop_utils { -// TODO: Clean this up, Naoya added a mechanism we should be able to reuse. -std::pair getAllocPoint( +BasicAllocInfo getAllocInformation( const TensorView* tv, - const std::vector& loops, + const std::vector& for_loops, const std::unordered_map& id_map, bool use_id_map) { - const auto gpu_lower = GpuLower::current(); + BasicAllocInfo info; + auto gpu_lower = GpuLower::current(); + const auto& loop_map = gpu_lower->caLoopMap(); - // If in global memory, it can be all the way outside the loops. - if (tv->getMemoryType() == MemoryType::Global) { - return {nullptr, 0}; - } + bool outer_alloc_found = false; - // Figure out where we want to place alloc/reduction initialization. We want - // outside an unroll loop, or inside our computeAt point. - kir::ForLoop* alloc_loop = nullptr; + for (auto fl : for_loops) { + if (info.alloc_pos == tv->getComputeAtPosition()) { + break; + } - auto loops_it = loops.begin(); - // Look at each axis individually in out's domain - for (const auto tv_i : c10::irange((int64_t)tv->getComputeAtPosition())) { - // Grab the axis ID + if (tv->axis(info.alloc_pos)->isReduction()) { + const auto outputs = FusionGuard::getCurFusion()->getTerminatingOutputs(); + TORCH_INTERNAL_ASSERT( + std::find(outputs.begin(), outputs.end(), tv) != outputs.end(), + "Invalid computeAt of T", + tv->name(), + ". A reducation axis is detected outside computeAt point even though it is not an output tensor."); + break; + } + + auto fl_id = fl->iter_domain(); + + if (fl_id->getParallelType() == ParallelType::Unroll) { + break; + } + + // Shared memory must be allocated outside of unswitched + // domains. See issue #1133. 
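A simplified model of this loop walk, using integer positions and per-loop flags instead of IR types: advance the allocation position until the computeAt position or an unrolled loop is reached, and once a condition like the one above fires, stop advancing the allocation scope while the initialization scope keeps moving inward:

    #include <iostream>
    #include <vector>

    struct Loop {
      bool is_unroll = false;
      bool is_unswitch = false;
      bool maps_to_tensor_axis = true;  // whether this loop matches tv->axis(pos)
    };

    struct AllocInfo {
      int alloc_pos = 0;    // how many leading axes are shared with the loops
      int alloc_loop = -1;  // index of the loop to allocate under (-1: outermost)
      int init_loop = -1;   // index of the loop to initialize under
    };

    AllocInfo getAllocInfo(const std::vector<Loop>& loops, int compute_at_pos,
                           bool is_shared_mem) {
      AllocInfo info;
      bool outer_alloc_found = false;
      for (int i = 0; i < static_cast<int>(loops.size()); ++i) {
        if (info.alloc_pos == compute_at_pos) break;
        const Loop& fl = loops[i];
        if (fl.is_unroll) break;
        // Shared memory cannot live inside an unswitched scope, so pin the
        // allocation to the scope recorded so far.
        if (fl.is_unswitch && is_shared_mem) outer_alloc_found = true;
        if (fl.maps_to_tensor_axis) info.alloc_pos++;
        info.init_loop = i;
        if (!outer_alloc_found) info.alloc_loop = i;
      }
      return info;
    }

    int main() {
      std::vector<Loop> loops = {
          {false, false, true}, {false, true, true}, {false, false, true}};
      auto info = getAllocInfo(loops, /*compute_at_pos=*/3, /*is_shared_mem=*/true);
      std::cout << "alloc_pos=" << info.alloc_pos
                << " alloc_loop=" << info.alloc_loop
                << " init_loop=" << info.init_loop << "\n";
      return 0;
    }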
+ if (fl_id->getParallelType() == ParallelType::Unswitch && + tv->getMemoryType() == MemoryType::Shared) { + outer_alloc_found = true; + } + + // Assume global memory is allocated at outer most scope. + if (tv->getMemoryType() == MemoryType::Global) { + outer_alloc_found = true; + } + + // Allocation of a double buffered tensor is placed outside its + // double buffer axis. + if (tv->isDoubleBuffered() && + tv->axis(info.alloc_pos) == + gpu_lower->doubleBufferInfo().getDoubleBufferAxis(tv)) { + outer_alloc_found = true; + } + + auto local_id = tv->axis(info.alloc_pos); - auto local_id = tv->axis(tv_i); if (use_id_map) { auto id_it = id_map.find(local_id); if (id_it != id_map.end()) { @@ -384,52 +316,33 @@ std::pair getAllocPoint( } } - if (gpu_lower->trivialReductionInfo().isDerivedFromRoot(local_id)) { - continue; + if (loop_map.areMapped(local_id, fl_id)) { + info.alloc_pos++; } - auto lowered_local_id = - gpu_lower->lowerValue(local_id)->as(); - loops_it = std::find_if( - loops_it, loops.end(), [&lowered_local_id](const auto& loop) { - return GpuLower::current()->caLoopMap().areMapped( - lowered_local_id, loop->iter_domain()) || - loop->iter_domain()->parallelType() == ParallelType::Unroll; - }); + info.init_for_loop = fl; - TORCH_INTERNAL_ASSERT( - loops_it != loops.end(), - "Could not find all required axes for indexing when trying to index into ", - tv); - if ((*loops_it)->iter_domain()->parallelType() == ParallelType::Unroll) { - return {alloc_loop, tv_i}; + if (!outer_alloc_found) { + info.alloc_for_loop = fl; } - - alloc_loop = *loops_it; - ++loops_it; } - return {alloc_loop, (int64_t)tv->getComputeAtPosition()}; -} - -std::pair getAllocPoint( - const TensorView* tv, - const std::vector& loops) { - return getAllocPoint(tv, loops, {}, false); + return info; } } // namespace loop_utils namespace { -class ReplaceExprInput : public kir::MutableIrVisitor { +class ReplaceExprInput : public OptOutDispatch { public: - static kir::Expr* replace( - kir::Expr* expr, - const std::unordered_map& replacement_map) { + using OptOutDispatch::handle; + static Expr* replace( + Expr* expr, + const std::unordered_map& replacement_map) { ReplaceExprInput replacer(expr, replacement_map); TORCH_INTERNAL_ASSERT(expr != nullptr); - expr->accept(&replacer); + replacer.handle(expr); TORCH_INTERNAL_ASSERT(replacer.replaced_expr_ != nullptr); auto ret_expr = replacer.replaced_expr_; @@ -441,10 +354,10 @@ class ReplaceExprInput : public kir::MutableIrVisitor { return ret_expr; } - static std::vector replace( - const std::vector& scope, - const std::unordered_map& replacement_map) { - std::vector ret_expr; + static std::vector replace( + const std::vector& scope, + const std::unordered_map& replacement_map) { + std::vector ret_expr; ret_expr.reserve(scope.size()); for (auto expr : scope) { @@ -455,20 +368,20 @@ class ReplaceExprInput : public kir::MutableIrVisitor { } private: + // TODO: Replace this with mutator, example of this is done in replace + // symbolic sizes ReplaceExprInput( - kir::Expr* expr, - const std::unordered_map& replacement_map) - : gpu_lower_(GpuLower::current()), - ir_builder_(gpu_lower_->kernel()), - replacement_map_(replacement_map) { + Expr* expr, + const std::unordered_map& replacement_map) + : replacement_map_(replacement_map) { replaced_expr_ = expr; } - c10::optional> - getMaybeInputReplacementMap(kir::Expr* expr) { + c10::optional> getMaybeInputReplacementMap( + Expr* expr) { bool need_replacement = false; - std::unordered_map replaced_val; + std::unordered_map replaced_val; for 
(auto in : expr->inputs()) { auto replace_it = replacement_map_.find(in); if (replace_it != replacement_map_.end()) { @@ -479,16 +392,15 @@ class ReplaceExprInput : public kir::MutableIrVisitor { } } if (need_replacement) { - return c10::optional>( - replaced_val); + return c10::optional>(replaced_val); } else { return c10::nullopt; } } // IR visitor interface - void visit(kir::ForLoop* for_loop) final { - auto new_for_loop = ir_builder_.create(for_loop); + void handle(kir::ForLoop* for_loop) final { + auto new_for_loop = IrBuilder::create(for_loop); auto replaced_loop_body = replace(for_loop->body().exprs(), replacement_map_); @@ -499,8 +411,8 @@ class ReplaceExprInput : public kir::MutableIrVisitor { replaced_expr_ = new_for_loop; } - void visit(kir::IfThenElse* ite) final { - auto new_ite = ir_builder_.create(ite->predicate()); + void handle(kir::IfThenElse* ite) final { + auto new_ite = IrBuilder::create(ite->predicate()); auto replaced_then_body = replace(ite->thenBody().exprs(), replacement_map_); for (auto new_expr : replaced_then_body) { @@ -516,31 +428,31 @@ class ReplaceExprInput : public kir::MutableIrVisitor { replaced_expr_ = new_ite; } - void visit(kir::UnaryOp* node) final { + void handle(UnaryOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + replaced_expr_ = IrBuilder::create( + node->getUnaryOpType(), node->out(), replaced_inputs.value().at(node->in())); } } - void visit(kir::BinaryOp* node) final { + void handle(BinaryOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + replaced_expr_ = IrBuilder::create( + node->getBinaryOpType(), node->out(), replaced_inputs.value().at(node->lhs()), replaced_inputs.value().at(node->rhs())); } } - void visit(kir::TernaryOp* node) final { + void handle(TernaryOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + replaced_expr_ = IrBuilder::create( + node->getTernaryOpType(), node->out(), replaced_inputs.value().at(node->in1()), replaced_inputs.value().at(node->in2()), @@ -548,29 +460,31 @@ class ReplaceExprInput : public kir::MutableIrVisitor { } } - void visit(kir::ReductionOp* node) final { + void handle(ReductionOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->operation(), + replaced_expr_ = IrBuilder::create( + node->getReductionOpType(), node->init(), node->out(), replaced_inputs.value().at(node->in())); } } - void visit(kir::BroadcastOp* node) final { + void handle(BroadcastOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( - node->out(), replaced_inputs.value().at(node->in())); + replaced_expr_ = IrBuilder::create( + node->out(), + replaced_inputs.value().at(node->in()), + node->getBroadcastDimFlags()); } } - void visit(kir::WelfordOp* node) final { + void handle(WelfordOp* node) final { auto replaced_inputs = getMaybeInputReplacementMap(node); if (replaced_inputs.has_value()) { - replaced_expr_ = ir_builder_.create( + replaced_expr_ = IrBuilder::create( node->outAvg(), node->outVar(), node->outN(), @@ -584,17 +498,15 @@ class ReplaceExprInput : public 
kir::MutableIrVisitor { } private: - GpuLower* gpu_lower_; - kir::IrBuilder ir_builder_; - kir::Expr* replaced_expr_ = nullptr; - const std::unordered_map& replacement_map_; + Expr* replaced_expr_ = nullptr; + const std::unordered_map& replacement_map_; }; } // namespace -std::vector replaceInputsInExpr( - const std::vector& exprs, - const std::unordered_map& replacement_map) { +std::vector replaceInputsInExpr( + const std::vector& exprs, + const std::unordered_map& replacement_map) { return ReplaceExprInput::replace(exprs, replacement_map); } diff --git a/torch/csrc/jit/codegen/cuda/lower_utils.h b/torch/csrc/jit/codegen/cuda/lower_utils.h index 1c8a0df5cd7..4ed6c25e731 100644 --- a/torch/csrc/jit/codegen/cuda/lower_utils.h +++ b/torch/csrc/jit/codegen/cuda/lower_utils.h @@ -1,7 +1,7 @@ #pragma once -#include +#include #include #include @@ -19,27 +19,15 @@ namespace cuda { class ThreadPredicateMap; -using IterDomainMap = std::unordered_map; +using IterDomainMap = std::unordered_map; namespace scope_utils { -//! Returns the list of nesting loops starting at `scope` -// Primarily used in indexing, maybe could be moved there -std::vector getLoops(kir::Expr* scope); - -//! Insert expr in scope before ref -//! -//! \warning for kir::IfThenElse we implicitly insert in the "then" branch! -//! -void insertBefore(kir::Expr* scope, kir::Expr* ref, kir::Expr* expr); - //! Create an **empty** Forloop and copy the metadata. -kir::ForLoop* cloneForLoop(kir::IrBuilder& ir_builder, kir::ForLoop* for_loop); +kir::ForLoop* cloneForLoop(kir::ForLoop* for_loop); //! Create an **empty** IfThenElse and copy the metadata. -kir::IfThenElse* cloneIfThenElse( - kir::IrBuilder& ir_builder, - kir::IfThenElse* ite); +kir::IfThenElse* cloneIfThenElse(kir::IfThenElse* ite); } // namespace scope_utils @@ -74,107 +62,80 @@ std::vector iterDomainInputsOfOrderedAs( const std::vector& of, const std::vector& order); +// Returns if Val is a TensorView or TensorIndex bool isTV(const Val* const); -TORCH_CUDA_CU_API bool isTVOp(const Expr*); +// Returns is Expr is a TensorView or TensorIndex Expr. +TORCH_CUDA_CU_API bool isTvOp(const Expr*); -bool isTVOp(const kir::Expr* expr); - -TensorView* getTVOutput(const Expr*); -kir::TensorView* getTVOutput(const kir::Expr*); - -bool isScalarOp(const Expr*); - -// TODO(kir): remove -Expr* asExpr(Statement*); - -// TODO(kir): Remove in favor of ->as() -TensorView* asTV(Val*); - -//! Get kir::TensorView potentially via kir::TensorIndex. Returns nullptr if -//! cast fails. -kir::TensorView* getTv(kir::Val*); - -//! Get only kir::TensorView potentially via kir::TensorIndex. -std::vector getTvs(const std::vector& vals); - -//! Get kir::TensorView potentially via kir::TensorIndex. Error if cast fails. -kir::TensorView* asTv(kir::Val*); - -//! Get kir::TensorView potentially via kir::TensorIndex. Error if cast fails. -std::vector asTvs(const std::vector& vals); +// Returns the first output of Expr that is a TensorView +TensorView* getTvOutput(const Expr*); bool hasBlockSync(const Expr* expr, const ThreadPredicateMap& pred_map); -bool hasBlockSync(const kir::Expr* expr, const ThreadPredicateMap& pred_map); - -// expr_replacement_map maps an expression to its replacement. -// -// The applyReplacement function serves two purposes. -// -// 1. If expr is found in expr_replacement_map, return the value for expr key. -// Otherwise, return the original expression. -// -// 2. 
If a replacement is not found and the expression is a ForLoop or an -// IfThenElse, it modifies the expressions in its scope by running the -// handle_scope function -// -// The handle_scope function iterates over the expressions in the scope. -// For each expression, it updates the expression the value returned by -// applyReplacement. -kir::Expr* applyReplacements( - const std::unordered_map& expr_replacement_map, - kir::Expr* expr); //! Returns the Fuser iterdomain that maps to the thread dimension grouped //! to warps. Returns nullopt if the reduction is not to be lowered to //! a warp reduction. -c10::optional getMaybeWarpReductionDim( - const kir::ReductionOp* node); - c10::optional getMaybeWarpReductionDim(const ReductionOp* node); +bool isScalarOp(const Expr*); + +//! Get TensorView potentially via kir::TensorIndex. Returns nullptr if +//! cast fails. +TensorView* getTv(Val*); + +//! Get only TensorView potentially via kir::TensorIndex. +std::vector getTvs(const std::vector& vals); + //! Return true if axis is derived from a root axis that is an input //! to a CA leaf axis. bool derivedFromRootCAAxes(const TensorView* tv, IterDomain* axis); -std::unordered_map getParallelDomains( - kir::Val* val); +std::unordered_map getParallelDomains( + Val* val); } // namespace ir_utils namespace loop_utils { -// I wanted to make the tv's in these util functions constant, but that started -// a long const-ness project going into TensorView (making functions const -// there) then into lower_loops where we sort exprs. -// TODO: We should fix this when we have some time. +struct BasicAllocInfo { + // The for loop that the initialization of this allocation must be + // placed in, nullptr if not within a loop + kir::ForLoop* init_for_loop = nullptr; -// Figure out which loop the allocation needs to be in. Returns nullptr if -// outside the first loop in loops. Also find out which index in tv the -// first dimension that needs to be allocated is. Meaning we need to allocate -// that local axis and above. -// TODO: Only remaining use of this is in index compute, remove use from there, -// or refactor and use in lower_allocation -std::pair getAllocPoint( + // Keep track of the actual allocation loop. This can be different + // from init_for_loop only with unswitched shared memory allocations, + // which are moved outer loops to avoid duplicated allocations. This means + // that the alloc position may be outside what's expected. Most applications + // outside lower_allocation is likely looking for init_for_loop which is + // more directly related to how large an allocation is and how it's used. + // (see issue #1133). + kir::ForLoop* alloc_for_loop = nullptr; + + // The allocation position relative to buffer IDs, it could be outside the + // compute at position if it's shared memory with a compute at inside an + // unswitch + size_t alloc_pos = 0; +}; + +// Fill the above allocation struct based on provided information. id_map is +// used if we're looking at a producer tensor but loops on a consumer tensor. +BasicAllocInfo getAllocInformation( const TensorView* tv, const std::vector& loops, - const std::unordered_map& id_map, - bool use_id_map); - -std::pair getAllocPoint( - const TensorView* tv, - const std::vector& loops); + const std::unordered_map& id_map = {}, + bool use_id_map = false); } // namespace loop_utils // Replace value pass on Kernel IR. 
-// Replace each use of any kir::Val* that apears in the given `replacement_map` +// Replace each use of any Val* that apears in the given `replacement_map` // Keeps the predicate carried by each expr // // Warning: Blindly replaces all use based on pointer // Warning: May invalidate indexing if replacing uses of allocated values -std::vector replaceInputsInExpr( - const std::vector& exprs, - const std::unordered_map& replacement_map); +std::vector replaceInputsInExpr( + const std::vector& exprs, + const std::unordered_map& replacement_map); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.cpp b/torch/csrc/jit/codegen/cuda/lower_validation.cpp index 0579e44dcd6..25ba76ee71b 100644 --- a/torch/csrc/jit/codegen/cuda/lower_validation.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_validation.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include @@ -319,7 +318,7 @@ class VectorizeValidator : public OptInDispatch { vector_size, " however, vector sizes only upto and including 16 bytes are supported."); - auto replay_exprs = ExprSort::getExprs(fusion, {v_id}); + auto replay_exprs = StmtSort::getExprs(fusion, {v_id}, false); VectorizeValidator validator(v_id); @@ -463,6 +462,14 @@ void validateParallelizationOfTensor(TensorView* tv) { continue; } + // It doesn't matter if this axis is a non-concretized broadcast + // TODO: merging broadcast and non-broadcast + if (axis->isBroadcast() && + !GpuLower::current()->concretizedBroadcastDomains().isConcretized( + axis)) { + continue; + } + TORCH_INTERNAL_ASSERT( !pt_map.get(ptype), "Multiple use of ", @@ -489,7 +496,7 @@ void validateParallelizationOfTensor(TensorView* tv) { ". The tensor is parallelized with ", predicated_parallel_types.toString(), ", but it's invalid to use the types as the tensor is also predicated with them.", - ", thread prd: ", + ", thread pred: ", thread_pred.limited_types.toString()); } @@ -503,10 +510,10 @@ void validateParallelize(Fusion* fusion) { const auto& loop_map = GpuLower::current()->caLoopMap(); const auto& pred_map = GpuLower::current()->threadPredMap(); - auto exprs = ExprSort::getExprs(fusion); + auto exprs = StmtSort::getExprs(fusion); for (auto expr : exprs) { - if (!ir_utils::isTVOp(expr)) { + if (!ir_utils::isTvOp(expr)) { continue; } // Validate parallelization of each consumer by itself @@ -630,7 +637,7 @@ namespace { // each tensor that needs to be computed. 
std::unordered_map> getLiveRangeOffsets( Fusion* fusion) { - auto exprs = ExprSort::getExprs(fusion); + auto exprs = StmtSort::getExprs(fusion); std::unordered_map> map; @@ -760,7 +767,9 @@ void validatePartialSplit(Fusion* fusion) { auto range_info = getLiveRangeOffsets(fusion); for (auto tv : ir_utils::allTvs(fusion)) { - auto exprs = ir_utils::historyOf(tv); + auto exprs = StmtSort::getExprs( + tv->fusion(), + {tv->domain()->domain().begin(), tv->domain()->domain().end()}); for (auto split : ir_utils::filterByType(exprs)) { // When the start and stop offsets are not zero, make sure the // range defined by the split includes the required range to diff --git a/torch/csrc/jit/codegen/cuda/lower_validation.h b/torch/csrc/jit/codegen/cuda/lower_validation.h index 89de85026ee..115df13c322 100644 --- a/torch/csrc/jit/codegen/cuda/lower_validation.h +++ b/torch/csrc/jit/codegen/cuda/lower_validation.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp index eaddf7faea3..630d3128e78 100644 --- a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -18,20 +18,19 @@ namespace { //! and their corresponding allocations class EliminateDeadBroadcastAndAllocate { public: - static std::vector run(const std::vector& exprs) { + static std::vector run(const std::vector& exprs) { EliminateDeadBroadcastAndAllocate dce(exprs); return dce.result_exprs_; } private: - EliminateDeadBroadcastAndAllocate(const std::vector& exprs) - : ir_builder_(GpuLower::current()->kernel()) { + EliminateDeadBroadcastAndAllocate(const std::vector& exprs) { findLiveTvs(exprs); findDeadTvs(); eliminateDeadCode(exprs); } - void findLiveTvs(const std::vector& exprs) { + void findLiveTvs(const std::vector& exprs) { for (auto expr : exprs) { if (auto for_loop = dynamic_cast(expr)) { findLiveTvs(for_loop->body().exprs()); @@ -44,11 +43,10 @@ class EliminateDeadBroadcastAndAllocate { if (auto allocate = dynamic_cast(expr)) { if (allocate->memoryType() == MemoryType::Local) { - if (auto kir_tv = - dynamic_cast(allocate->buffer())) { + if (auto tv = dynamic_cast(allocate->buffer())) { // We know only tvs that we'd want to consider are broadcast outputs - if (kir_tv->fuserTv()->definition()->isA()) { - candidate_tv_set_.insert(kir_tv); + if (tv->definition()->isA()) { + candidate_tv_set_.insert(tv); } } } @@ -72,18 +70,18 @@ class EliminateDeadBroadcastAndAllocate { } } - void eliminateDeadCode(const std::vector& exprs) { + void eliminateDeadCode(const std::vector& exprs) { result_exprs_ = eliminateDeadCodeInScope(exprs); } - bool shouldEliminate(kir::Expr* expr) { + bool shouldEliminate(Expr* expr) { if (auto allocate = dynamic_cast(expr)) { - if (auto buffer_tv = dynamic_cast(allocate->buffer())) { + if (auto buffer_tv = dynamic_cast(allocate->buffer())) { if (dead_tvs_.count(buffer_tv)) { return true; } } - } else if (auto broadcast = dynamic_cast(expr)) { + } else if (auto broadcast = dynamic_cast(expr)) { if (auto out_ti = dynamic_cast(broadcast->out())) { if (dead_tvs_.count(out_ti->view())) { return true; @@ -95,9 +93,8 @@ class EliminateDeadBroadcastAndAllocate { //! Returns a new vector of exprs with dead exprs //! eliminated. 
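  //! For-loops and if-then-else nodes are processed recursively: each is
  //! cloned with only its surviving body expressions, and a node whose body
  //! becomes empty is dropped (nullptr) instead of being emitted.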
- std::vector eliminateDeadCodeInScope( - const std::vector& exprs) { - std::vector result_exprs; + std::vector eliminateDeadCodeInScope(const std::vector& exprs) { + std::vector result_exprs; for (auto expr : exprs) { auto result_expr = expr; @@ -128,7 +125,7 @@ class EliminateDeadBroadcastAndAllocate { // TODO: we will need a kernel_ir cloner to make this // kind of logic re-usable. - auto new_loop = scope_utils::cloneForLoop(ir_builder_, for_loop); + auto new_loop = scope_utils::cloneForLoop(for_loop); for (auto expr : new_loop_body) { new_loop->body().push_back(expr); @@ -143,7 +140,7 @@ class EliminateDeadBroadcastAndAllocate { return nullptr; } - auto new_ite = scope_utils::cloneIfThenElse(ir_builder_, ite); + auto new_ite = scope_utils::cloneIfThenElse(ite); for (auto expr : new_then_body) { new_ite->thenBody().push_back(expr); @@ -155,12 +152,11 @@ class EliminateDeadBroadcastAndAllocate { } private: - std::unordered_set live_tvs_; - std::unordered_set dead_tvs_; - std::unordered_set candidate_tv_set_; + std::unordered_set live_tvs_; + std::unordered_set dead_tvs_; + std::unordered_set candidate_tv_set_; - std::vector result_exprs_; - kir::IrBuilder ir_builder_; + std::vector result_exprs_; }; //! A pass to eliminate redundant parallel broadcasts that are consumers @@ -189,9 +185,9 @@ class EliminateDeadBroadcastAndAllocate { //! //! 3. EliminateDeadBroadcastAndAllocate removes the broadcast ops //! and corresponding allocations if they're un-used after step 2. -class FuseBroadcastWithWarpReduce { +class FuseBroadcastWithWarpReduce : private kir::IrVisitor { public: - static std::vector fuse(const std::vector& exprs) { + static std::vector fuse(const std::vector& exprs) { FuseBroadcastWithWarpReduce fuse_broadcast_map(exprs); const auto replaced_inputs = replaceInputsInExpr(exprs, fuse_broadcast_map.val_replacement_map_); @@ -199,70 +195,50 @@ class FuseBroadcastWithWarpReduce { } private: - FuseBroadcastWithWarpReduce(const std::vector& exprs) { + FuseBroadcastWithWarpReduce(const std::vector& exprs) { // open stack space for global scope - // The scope stack for kir_tv_to_allocate wouldn't be needed + // The scope stack for tv_to_allocate wouldn't be needed // if the allocations are guaranteed to be once and unique, // which can currently be assumed but this pass tries not // to rely on this assumption. 
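    // Net effect of the pass, on an illustrative (not patch-provided) snippet:
    //   T1 (Local) = warp reduction of T0
    //   T2 (Local) = broadcast of T1
    //   ...        = op(T2)
    // reads of T2 are redirected to T1's TensorIndex, leaving T2's broadcast
    // and allocation dead for EliminateDeadBroadcastAndAllocate to remove.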
- running_kir_tv_to_allocate_map_.emplace_back( - std::make_unique< - std::unordered_map>()); + running_tv_to_allocate_map_.emplace_back( + std::make_unique>()); running_visible_allocation_stack_.emplace_back( std::make_unique>()); - - for (auto expr : exprs) { - handle(expr); - } + kir::IrVisitor::handle(exprs); } - void handle(kir::Expr* expr) { - if (auto for_loop = dynamic_cast(expr)) { - handle(for_loop); - return; - } else if (auto ite = dynamic_cast(expr)) { - handle(ite); - return; - } - - // Process expr inputs if needs replacement - for (auto inp : expr->inputs()) { - if (auto input_ti = dynamic_cast(inp)) { - auto replace = findMaybeReplacedTensorIndex(input_ti); - if (replace.has_value()) { - val_replacement_map_[input_ti] = replace.value(); + void handle(Expr* expr) final { + if (ir_utils::isTvOp(expr)) { + // Process expr inputs if needs replacement + for (auto inp : expr->inputs()) { + if (auto input_ti = dynamic_cast(inp)) { + auto replace = findMaybeReplacedTensorIndex(input_ti); + if (replace.has_value()) { + val_replacement_map_[input_ti] = replace.value(); + } } } } - - // Handle reduction definitions - if (auto reduction = dynamic_cast(expr)) { - handle(reduction); - } else if (auto broadcast = dynamic_cast(expr)) { - handle(broadcast); - } else if (auto allocate = dynamic_cast(expr)) { - handle(allocate); - } } - bool openLoopNestLevel(kir::IterDomain* id) { - if (id->isThread() || id->parallelType() == ParallelType::Unswitch) { + bool openLoopNestLevel(IterDomain* id) { + if (id->isThread() || id->getParallelType() == ParallelType::Unswitch) { return false; } - if (id->parallelType() == ParallelType::Serial || - id->parallelType() == ParallelType::Unroll) { + if (id->getParallelType() == ParallelType::Serial || + id->getParallelType() == ParallelType::Unroll) { return !id->isBroadcast(); } return true; } - void handle(kir::ForLoop* for_loop) { + void handle(kir::ForLoop* for_loop) final { // Keep track of visible reduction outputs bool open_nest_level = openLoopNestLevel(for_loop->iter_domain()); if (open_nest_level) { - running_kir_tv_to_allocate_map_.emplace_back( - std::make_unique< - std::unordered_map>()); + running_tv_to_allocate_map_.emplace_back( + std::make_unique>()); running_visible_allocation_stack_.emplace_back( std::make_unique>()); } @@ -270,12 +246,12 @@ class FuseBroadcastWithWarpReduce { handle(expr); } if (open_nest_level) { - running_kir_tv_to_allocate_map_.pop_back(); + running_tv_to_allocate_map_.pop_back(); running_visible_allocation_stack_.pop_back(); } } - void handle(kir::IfThenElse* ite) { + void handle(kir::IfThenElse* ite) final { running_visible_allocation_stack_.emplace_back( std::make_unique>()); for (auto expr : ite->thenBody().exprs()) { @@ -292,15 +268,14 @@ class FuseBroadcastWithWarpReduce { //! Place this allocate on the list of currently visible allocations, //! organized by loop nest level. - void handle(kir::Allocate* allocate) { + void handle(kir::Allocate* allocate) final { if (allocate->memoryType() != MemoryType::Local) { return; } - if (auto kir_tv = dynamic_cast(allocate->buffer())) { - auto fuser_tv = kir_tv->fuserTv(); - if (fuser_tv->definition()) { - if (fuser_tv->definition()->isA() || - fuser_tv->definition()->isA()) { + if (auto tv = dynamic_cast(allocate->buffer())) { + if (tv->definition()) { + if (tv->definition()->isA() || + tv->definition()->isA()) { running_visible_allocation_stack_.back()->push_back(allocate); } } @@ -311,18 +286,18 @@ class FuseBroadcastWithWarpReduce { //! 
returns the replaced TensorIndex if so. c10::optional findMaybeReplacedTensorIndex( kir::TensorIndex* tensor_index) { - auto kir_tv = tensor_index->view(); - auto tensor_index_it = running_tv_replacement_map_.find(kir_tv); + auto tv = tensor_index->view(); + auto tensor_index_it = running_tv_replacement_map_.find(tv); if (tensor_index_it != running_tv_replacement_map_.end()) { return tensor_index_it->second; } return c10::nullopt; } - //! Iteratve backwards on the currently visible loop scopes + //! Iterate backwards on the currently visible loop scopes //! and find the first allocation corresponding to the //! given tv. - kir::Allocate* getActiveAllocateFor(kir::TensorView* tv) { + kir::Allocate* getActiveAllocateFor(TensorView* tv) { for (auto frame_it = running_visible_allocation_stack_.rbegin(); frame_it != running_visible_allocation_stack_.rend(); frame_it++) { @@ -340,19 +315,10 @@ class FuseBroadcastWithWarpReduce { return nullptr; } - Expr* getFuserTVExpr(kir::Expr* expr) { - auto out = expr->outputs()[0]; - auto out_ti = dynamic_cast(out); - if (!out_ti) { - return nullptr; - } - return out_ti->view()->fuserTv()->definition(); - } - - bool isOpInputRegisterTV(kir::Expr* expr) { + bool isOpInputRegisterTV(Expr* expr) { for (auto inp : expr->inputs()) { if (auto inp_ti = dynamic_cast(inp)) { - if (inp_ti->view()->memoryType() != MemoryType::Local) { + if (inp_ti->view()->getMemoryType() != MemoryType::Local) { return false; } } @@ -361,10 +327,10 @@ class FuseBroadcastWithWarpReduce { return true; } - bool isOpOutputRegisterTV(kir::Expr* expr) { + bool isOpOutputRegisterTV(Expr* expr) { for (auto out : expr->outputs()) { if (auto out_ti = dynamic_cast(out)) { - if (out_ti->view()->memoryType() != MemoryType::Local) { + if (out_ti->view()->getMemoryType() != MemoryType::Local) { return false; } } @@ -374,8 +340,8 @@ class FuseBroadcastWithWarpReduce { } //! Updates map of serially visible reduction tvs, see comment on - //! running_kir_tv_to_allocate_map_. - void handle(kir::ReductionOp* reduction) { + //! running_tv_to_allocate_map_. + void handle(ReductionOp* reduction) final { if (!isOpOutputRegisterTV(reduction)) { return; } @@ -386,11 +352,11 @@ class FuseBroadcastWithWarpReduce { // keep track of which reduction buffer this expr writes into auto reduction_allocate = getActiveAllocateFor(reduction_ti_out->view()); - running_kir_tv_to_allocate_map_.back()->operator[]( - reduction_ti_out->view()) = reduction_allocate; + running_tv_to_allocate_map_.back()->operator[](reduction_ti_out->view()) = + reduction_allocate; } - void handle(kir::BroadcastOp* broadcast) { + void handle(BroadcastOp* broadcast) final { if (!isOpInputRegisterTV(broadcast) || !isOpOutputRegisterTV(broadcast)) { return; } @@ -400,9 +366,9 @@ class FuseBroadcastWithWarpReduce { //! Detects if this broadcast can be fused with the producer reduction. //! adds the output of broadcast to replacement map if all above mentioned //! conditions check. 
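  //! Concretely, the conditions checked below are: the broadcast input is
  //! defined by a ReductionOp, the reduction/broadcast parallel mapping allows
  //! fusion (canFuseBroadcastWithWarpReduction), the reduction's allocation is
  //! still serially visible in the current scope, and the register buffers
  //! involved are size-1; only then is the broadcast output added to the
  //! replacement map.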
- void tryAddOutputToReplaceMap(kir::BroadcastOp* broadcast) { + void tryAddOutputToReplaceMap(BroadcastOp* broadcast) { if (auto in_ti = dynamic_cast(broadcast->in())) { - if (!in_ti->view()->fuserTv()->definition()->isA()) { + if (!in_ti->view()->definition()->isA()) { return; } auto out_ti = broadcast->out()->as(); @@ -410,15 +376,14 @@ class FuseBroadcastWithWarpReduce { // check reduction-broadcast mapping: if (!canFuseBroadcastWithWarpReduction( - out_tv->fuserTv()->definition()->as())) { + out_tv->definition()->as())) { return; } // check buffers are size-1 auto reduction_allocate_it = - running_kir_tv_to_allocate_map_.back()->find(in_ti->view()); - if (reduction_allocate_it == - running_kir_tv_to_allocate_map_.back()->end()) { + running_tv_to_allocate_map_.back()->find(in_ti->view()); + if (reduction_allocate_it == running_tv_to_allocate_map_.back()->end()) { // The producer reduction is not in the serially visible scope, // as defined in openLoopNestLevel. There still could be some // cases that we could fuse but disabled for simplicity. @@ -444,7 +409,7 @@ class FuseBroadcastWithWarpReduce { return; } - // Write the kir_tv in to the replacement map + // Write the tv in to the replacement map // so the future uses of this tv will put // the tensorIndex's in the actual replacement map. running_tv_replacement_map_[out_tv] = in_ti; @@ -515,7 +480,7 @@ class FuseBroadcastWithWarpReduce { //! could need some extension for more precise scope based analysis in the //! future especially if we have more complex IfThenElse blocks than //! predicates and unroll. - std::unordered_map + std::unordered_map running_tv_replacement_map_; //! Keeps track of the allocated buffers that the exprs will write/read @@ -531,21 +496,20 @@ class FuseBroadcastWithWarpReduce { //! visibility on the generated kernel. The model of IfThenElse assumes the //! only ITE's we have are predicates and unrolls, which might need to be //! more precise. - std::vector< - std::unique_ptr>> - running_kir_tv_to_allocate_map_; + std::vector>> + running_tv_to_allocate_map_; //! This map is the final output of this pass and a val replacement map will //! be run using //! it. All keys and values are TensorIndex's, and before this pass each //! TensorIndex is uniquely generated by lower_index pass for each access of - //! a kir_tv. - std::unordered_map val_replacement_map_; + //! a tv. + std::unordered_map val_replacement_map_; }; } // namespace -std::vector fuseWarpReduce(const std::vector exprs) { +std::vector fuseWarpReduce(const std::vector exprs) { return FuseBroadcastWithWarpReduce::fuse(exprs); } diff --git a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h index 785c0b59122..7480809c7dc 100644 --- a/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h +++ b/torch/csrc/jit/codegen/cuda/lower_warp_reduce.h @@ -13,7 +13,7 @@ struct WarpPaddedParallelInfo { bool has_warp_reduction = false; }; -std::vector fuseWarpReduce(const std::vector exprs); +std::vector fuseWarpReduce(const std::vector exprs); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/manager.cpp b/torch/csrc/jit/codegen/cuda/manager.cpp index ee1bea81535..0f5967c004d 100644 --- a/torch/csrc/jit/codegen/cuda/manager.cpp +++ b/torch/csrc/jit/codegen/cuda/manager.cpp @@ -141,6 +141,25 @@ class CudaFusionManager { int32_t next_unique_id_ = 0; }; +// Mark string attribute in alias-copy nodes to enable its implementation +// in the fallback path. 
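// The fallback in runCudaFusionGroup() below copies the fusion subgraph,
// erases its shape information, and runs it through the interpreter; tagging
// view_copy / reshape_copy / squeeze_copy / unsqueeze_copy nodes here is what
// lets their implementations execute on that fallback path.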
+void enableAliasCopyNodes(const std::shared_ptr& graph, Block* block) { + static std::unordered_set alias_copy_op( + {prim::view_copy, + prim::reshape_copy, + prim::squeeze_copy, + prim::unsqueeze_copy}); + + for (Node* n : block->nodes()) { + for (Block* b : n->blocks()) { + enableAliasCopyNodes(graph, b); + } + if (alias_copy_op.find(n->kind()) != alias_copy_op.end()) { + n->s_(attr::name, "CudaFusionGroup"); + } + } +} + } // namespace void compileCudaFusionGroup(Node* fusion_node) { @@ -194,6 +213,7 @@ void runCudaFusionGroup(const Node* fusion_node, Stack& stack) { // copying graph here since we are eliminating shape information; auto copied_graph = fusion_node->g(attr::Subgraph)->copy(); EraseShapeInformation(copied_graph); + enableAliasCopyNodes(copied_graph, copied_graph->block()); InterpreterState{Code(copied_graph, "fallback_cuda_fuser")}.run(stack); }; diff --git a/torch/csrc/jit/codegen/cuda/manager.h b/torch/csrc/jit/codegen/cuda/manager.h index 39c97478eff..4b725cd80bc 100644 --- a/torch/csrc/jit/codegen/cuda/manager.h +++ b/torch/csrc/jit/codegen/cuda/manager.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include /* diff --git a/torch/csrc/jit/codegen/cuda/mutator.cpp b/torch/csrc/jit/codegen/cuda/mutator.cpp index 8d13f1e299e..c24e444eb56 100644 --- a/torch/csrc/jit/codegen/cuda/mutator.cpp +++ b/torch/csrc/jit/codegen/cuda/mutator.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -10,143 +11,177 @@ namespace jit { namespace fuser { namespace cuda { -// MUTATE FUNCTIONS FOR VALS +void OptOutMutator::mutate(Statement* s) { + Statement::mutatorDispatch(this, s); +} -Statement* OptOutMutator::mutate(IterDomain* id) { - Val* start = mutateAsVal(id->start())->asVal(); - Val* extent = mutateAsVal(id->extent())->asVal(); - Val* stop_offset = mutateAsVal(id->stopOffset())->asVal(); +void OptOutMutator::mutate(Expr* e) { + Expr::mutatorDispatch(this, e); +} + +void OptOutMutator::mutate(Val* v) { + Val::mutatorDispatch(this, v); +} + +void OptOutMutator::registerMutation(Val* val, Val* mutation) { + bool val_is_ns = val->vtype() == ValType::NamedScalar; + bool mutation_is_ns = mutation->vtype() == ValType::NamedScalar; + bool val_is_scalar = val->vtype() == ValType::Scalar; + bool mutation_is_scalar = mutation->vtype() == ValType::Scalar; + TORCH_INTERNAL_ASSERT( + mutation->dtype() == val->dtype() && + (mutation->vtype() == val->vtype() || + ((val_is_ns && mutation_is_scalar) || + (mutation_is_ns && val_is_scalar))), + "Mutations are not allowed to change types, tried to go from: (", + val->vtype(), + ", ", + val->dtype(), + ") to: (", + mutation->vtype(), + ", ", + mutation->dtype(), + ")"); + mutations[val] = mutation; +} + +void OptOutMutator::mutate(Bool* b) {} + +void OptOutMutator::mutate(Double* d) {} + +void OptOutMutator::mutate(Int* i) {} + +void OptOutMutator::mutate(NamedScalar* ns) {} + +void OptOutMutator::mutate(IterDomain* id) { + Val* start = maybeMutated(id->start()); + Val* extent = maybeMutated(id->extent()); + Val* stop_offset = maybeMutated(id->stopOffset()); if (start->sameAs(id->start()) && extent->sameAs(id->extent()) && stop_offset->sameAs(id->stopOffset())) { - return id; + return; } - Val* mutated_val = new IterDomain( + Val* mutated_val = IrBuilder::create( + id->container(), start, extent, stop_offset, id->getParallelType(), id->getIterType(), id->isRFactorProduct()); + if (id->hasPaddingToMultipleOfWarp()) { + mutated_val->as()->padToMultipleOfWarp( + id->getMaybeSizeAfterPadding()); + } registerMutation(id, 
mutated_val); - return mutated_val; } -Statement* OptOutMutator::mutate(TensorDomain* td) { - std::vector dom; +void OptOutMutator::mutate(TensorDomain* td) { bool mutated = false; - for (const auto i : c10::irange(td->nDims())) { - IterDomain* id = mutateAsVal(td->axis(i))->as(); - dom.push_back(id); - if (!id->sameAs(td->axis(i))) - mutated = true; + + auto updateIdVec = [&](const std::vector& ids) { + std::vector updated_ids; + for (auto id : ids) { + auto updated_id = maybeMutated(id)->as(); + updated_ids.push_back(updated_id); + if (!updated_id->sameAs(id)) { + mutated = true; + } + } + return updated_ids; + }; + + std::vector root_dom = updateIdVec(td->getRootDomain()); + std::vector rfactor_dom = td->hasRFactor() + ? updateIdVec(td->getMaybeRFactorDomain()) + : std::vector(); + std::vector domain = updateIdVec(td->domain()); + + if (!mutated) { + return; } - if (mutated) { - Val* mutated_val = new TensorDomain( - td->getRootDomain(), td->getRFactorDomain(), dom, td->contiguity()); - registerMutation(td, mutated_val); - return mutated_val; - } - return td; + Val* mutated_val = IrBuilder::create( + td->container(), root_dom, rfactor_dom, domain, td->contiguity()); + registerMutation(td, mutated_val); } -Statement* OptOutMutator::mutate(TensorView* tv) { - TensorDomain* td = mutateAsVal(tv->domain())->as(); - +void OptOutMutator::mutate(TensorView* tv) { + TensorDomain* td = maybeMutated(tv->domain())->as(); if (!tv->domain()->sameAs(td)) { - TensorView* mutated_tv = new TensorView(td, tv->getDataType().value()); - registerMutation(tv, mutated_tv); - return mutated_tv; + tv->setDomain(td); } - return tv; + // Don't register tv mutations as we just want to update the TD } -Statement* OptOutMutator::mutate(Bool* b) { - return b; +void OptOutMutator::mutate(kir::Predicate*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); } -Statement* OptOutMutator::mutate(Double* d) { - return d; -} - -Statement* OptOutMutator::mutate(Int* i) { - return i; -} - -Statement* OptOutMutator::mutate(NamedScalar* ns) { - return ns; +void OptOutMutator::mutate(kir::TensorIndex*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); } // MUTATE FUNCTIONS FOR EXPRESSIONS. 
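// Each expression handler below follows the same pattern: look up
// maybeMutated() for its inputs and outputs, return early if nothing changed,
// and otherwise removeExpr() the stale node from its container and rebuild it
// with IrBuilder::create<...>() from the mutated values. For example
// (illustrative), registering a new extent for an IterDomain re-creates that
// IterDomain in mutate(IterDomain) above, which in turn causes any Split or
// Merge consuming it to be rebuilt below.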
+void OptOutMutator::mutate(UnaryOp* uop) { + Val* out = maybeMutated(uop->out()); + Val* in = maybeMutated(uop->in()); -Statement* OptOutMutator::mutate(Split* s) { - IterDomain* ot = mutateAsVal(s->outer())->as(); - IterDomain* inr = mutateAsVal(s->inner())->as(); - IterDomain* in = mutateAsVal(s->in())->as(); - Val* fact = mutateAsVal(s->factor())->as(); - - if (ot->sameAs(s->outer()) && inr->sameAs(s->inner()) && - in->sameAs(s->in()) && areEqualScalars(fact, s->factor())) { - return s; + if (out->sameAs(uop->out()) && in->sameAs(uop->in())) { + return; } - FusionGuard::getCurFusion()->removeExpr(s); - return new Split(ot, inr, in, fact, s->innerSplit()); + auto container = uop->container(); + auto uop_type = uop->getUnaryOpType(); + container->removeExpr(uop); + IrBuilder::create(container, uop_type, out, in); } -Statement* OptOutMutator::mutate(Merge* m) { - IterDomain* ot = mutateAsVal(m->out())->as(); - IterDomain* otr = mutateAsVal(m->outer())->as(); - IterDomain* in = mutateAsVal(m->inner())->as(); +void OptOutMutator::mutate(BinaryOp* bop) { + Val* out = maybeMutated(bop->out()); + Val* lhs = maybeMutated(bop->lhs()); + Val* rhs = maybeMutated(bop->rhs()); - if (ot->sameAs(m->out()) && otr->sameAs(m->outer()) && in->sameAs(m->inner())) - return m; + if (out == bop->out() && lhs == bop->lhs() && rhs == bop->rhs()) { + return; + } - FusionGuard::getCurFusion()->removeExpr(m); - return new Merge(ot, otr, in); + auto container = bop->container(); + auto bop_type = bop->getBinaryOpType(); + container->removeExpr(bop); + IrBuilder::create(container, bop_type, out, lhs, rhs); } -Statement* OptOutMutator::mutate(UnaryOp* uop) { - Val* out = mutateAsVal(uop->out())->asVal(); - Val* in = mutateAsVal(uop->in())->asVal(); +void OptOutMutator::mutate(TernaryOp* top) { + Val* out = maybeMutated(top->out()); + Val* in1 = maybeMutated(top->in1()); + Val* in2 = maybeMutated(top->in2()); + Val* in3 = maybeMutated(top->in3()); - if (out->sameAs(uop->out()) && in->sameAs(uop->in())) - return uop; - FusionGuard::getCurFusion()->removeExpr(uop); - return new UnaryOp(uop->getUnaryOpType(), out, in); -} - -Statement* OptOutMutator::mutate(BinaryOp* bop) { - Val* out = mutateAsVal(bop->out())->asVal(); - Val* lhs = mutateAsVal(bop->lhs())->asVal(); - Val* rhs = mutateAsVal(bop->rhs())->asVal(); - if (out == bop->out() && lhs == bop->lhs() && rhs == bop->rhs()) - return bop; - FusionGuard::getCurFusion()->removeExpr(bop); - return new BinaryOp(bop->getBinaryOpType(), out, lhs, rhs); -} - -Statement* OptOutMutator::mutate(TernaryOp* top) { - Val* out = mutateAsVal(top->out())->asVal(); - Val* in1 = mutateAsVal(top->in1())->asVal(); - Val* in2 = mutateAsVal(top->in2())->asVal(); - Val* in3 = mutateAsVal(top->in3())->asVal(); if (out == top->out() && in1 == top->in1() && in2 == top->in2() && - in3 == top->in3()) - return top; - FusionGuard::getCurFusion()->removeExpr(top); - return new TernaryOp(top->getTernaryOpType(), out, in1, in2, in3); + in3 == top->in3()) { + return; + } + + auto container = top->container(); + auto top_type = top->getTernaryOpType(); + container->removeExpr(top); + IrBuilder::create(container, top_type, out, in1, in2, in3); } -Statement* OptOutMutator::mutate(ReductionOp* rop) { - Val* out = mutateAsVal(rop->out())->asVal(); - Val* in = mutateAsVal(rop->in())->asVal(); +void OptOutMutator::mutate(ReductionOp* rop) { + Val* out = maybeMutated(rop->out()); + Val* in = maybeMutated(rop->in()); Val* init = rop->init(); if (out->sameAs(rop->out()) && in->sameAs(rop->in()) && - 
init->sameAs(rop->init())) - return rop; + init->sameAs(rop->init())) { + return; + } - return new ReductionOp(rop->getReductionOpType(), init, out, in); + auto container = rop->container(); + auto rop_type = rop->getReductionOpType(); + container->removeExpr(rop); + IrBuilder::create(container, rop_type, init, out, in); } namespace { @@ -159,20 +194,18 @@ inline bool compareOptional(Val* a, Val* b) { } // namespace -Statement* OptOutMutator::mutate(WelfordOp* wop) { - Val* out_avg = mutateAsVal(wop->outAvg())->asVal(); - Val* out_var = mutateAsVal(wop->outVar())->asVal(); - Val* out_N = mutateAsVal(wop->outN())->asVal(); +void OptOutMutator::mutate(WelfordOp* wop) { + Val* out_avg = maybeMutated(wop->outAvg()); + Val* out_var = maybeMutated(wop->outVar()); + Val* out_N = maybeMutated(wop->outN()); - Val* in_avg = mutateAsVal(wop->inAvg())->asVal(); - Val* in_var = wop->inVar() ? mutateAsVal(wop->inVar())->asVal() : nullptr; - Val* in_N = mutateAsVal(wop->inN())->asVal(); + Val* in_avg = maybeMutated(wop->inAvg()); + Val* in_var = wop->inVar() ? maybeMutated(wop->inVar()) : nullptr; + Val* in_N = maybeMutated(wop->inN()); - Val* init_avg = - wop->initAvg() ? mutateAsVal(wop->initAvg())->asVal() : nullptr; - Val* init_var = - wop->initVar() ? mutateAsVal(wop->initVar())->asVal() : nullptr; - Val* init_N = mutateAsVal(wop->initN())->asVal(); + Val* init_avg = wop->initAvg() ? maybeMutated(wop->initAvg()) : nullptr; + Val* init_var = wop->initVar() ? maybeMutated(wop->initVar()) : nullptr; + Val* init_N = maybeMutated(wop->initN()); const bool out_compare = out_avg->sameAs(wop->outAvg()) && out_var->sameAs(wop->outVar()) && out_N->sameAs(wop->outN()); @@ -182,56 +215,163 @@ Statement* OptOutMutator::mutate(WelfordOp* wop) { compareOptional(init_var, wop->initVar()) && init_N->sameAs(wop->initN()); if (out_compare && init_compare && in_compare) { - return wop; - } else { - return new WelfordOp( - out_avg, - out_var, - out_N, - init_avg, - init_var, - init_N, - in_avg, - in_var, - in_N); + return; } + + auto container = wop->container(); + container->removeExpr(wop); + IrBuilder::create( + container, + out_avg, + out_var, + out_N, + init_avg, + init_var, + init_N, + in_avg, + in_var, + in_N); } -Statement* OptOutMutator::mutate(BroadcastOp* bop) { - return bop; +void OptOutMutator::mutate(BroadcastOp* bop) { + Val* out = maybeMutated(bop->out()); + Val* in = maybeMutated(bop->in()); + + if (out->sameAs(bop->out()) && in->sameAs(bop->in())) { + return; + } + + auto container = bop->container(); + auto flags = bop->getBroadcastDimFlags(); + container->removeExpr(bop); + IrBuilder::create(container, out, in, flags); } -Statement* OptOutMutator::mutate(TransposeOp* top) { - return top; +void OptOutMutator::mutate(TransposeOp* top) { + TensorView* out = maybeMutated(top->out())->as(); + TensorView* in = maybeMutated(top->in())->as(); + + if (out->sameAs(top->out()) && in->sameAs(top->in())) { + return; + } + + auto container = top->container(); + auto new2old = top->new2old(); + container->removeExpr(top); + IrBuilder::create(container, out, in, new2old); } -Statement* OptOutMutator::mutate(ShiftOp* sop) { - Val* out = mutateAsVal(sop->out())->asVal(); - Val* in = mutateAsVal(sop->in())->asVal(); +void OptOutMutator::mutate(ShiftOp* sop) { + Val* out = maybeMutated(sop->out())->asVal(); + Val* in = maybeMutated(sop->in())->asVal(); + + if (out->sameAs(sop->out()) && in->sameAs(sop->in())) { + return; + } - if (out->sameAs(sop->out()) && in->sameAs(sop->in())) - return sop; auto offsets = 
sop->offsets(); - FusionGuard::getCurFusion()->removeExpr(sop); - return new ShiftOp(out, in, offsets, sop->pad()); + auto pad_width = sop->padWidth(); + auto container = sop->container(); + container->removeExpr(sop); + IrBuilder::create(container, out, in, offsets, pad_width); } -Statement* OptOutMutator::mutate(GatherOp* op) { - Val* out = mutateAsVal(op->out())->asVal(); - Val* in = mutateAsVal(op->in())->asVal(); +void OptOutMutator::mutate(GatherOp* op) { + Val* out = maybeMutated(op->out())->asVal(); + Val* in = maybeMutated(op->in())->asVal(); + + if (out->sameAs(op->out()) && in->sameAs(op->in())) { + return; + } - if (out->sameAs(op->out()) && in->sameAs(op->in())) - return op; auto window_shape = op->windowShape(); auto pad_width = op->padWidth(); - FusionGuard::getCurFusion()->removeExpr(op); - return new GatherOp(out, in, window_shape, pad_width); + auto container = op->container(); + container->removeExpr(op); + IrBuilder::create(container, out, in, window_shape, pad_width); } -Statement* OptOutMutator::mutate(ViewOp* vop) { - return vop; +void OptOutMutator::mutate(ViewOp* vop) { + TensorView* out = maybeMutated(vop->out())->as(); + TensorView* in = maybeMutated(vop->in())->as(); + + if (out->sameAs(vop->out()) && in->sameAs(vop->in())) { + return; + } + + auto container = vop->container(); + container->removeExpr(vop); + IrBuilder::create(container, out, in); } +void OptOutMutator::mutate(Split* s) { + IterDomain* ot = maybeMutated(s->outer())->as(); + IterDomain* inr = maybeMutated(s->inner())->as(); + IterDomain* in = maybeMutated(s->in())->as(); + Val* fact = maybeMutated(s->factor())->as(); + Val* start_offset = maybeMutated(s->startOffset()); + Val* stop_offset = maybeMutated(s->stopOffset()); + + if (ot->sameAs(s->outer()) && inr->sameAs(s->inner()) && + in->sameAs(s->in()) && areEqualScalars(fact, s->factor()) && + start_offset->sameAs(s->startOffset()) && + stop_offset->sameAs(s->stopOffset())) { + return; + } + + auto container = s->container(); + auto inner_split = s->innerSplit(); + container->removeExpr(s); + auto new_node = IrBuilder::create( + container, ot, inr, in, fact, inner_split, start_offset, stop_offset); +} + +void OptOutMutator::mutate(Merge* m) { + IterDomain* ot = maybeMutated(m->out())->as(); + IterDomain* otr = maybeMutated(m->outer())->as(); + IterDomain* in = maybeMutated(m->inner())->as(); + + if (ot->sameAs(m->out()) && otr->sameAs(m->outer()) && + in->sameAs(m->inner())) { + return; + } + + auto container = m->container(); + container->removeExpr(m); + auto new_node = IrBuilder::create(container, ot, otr, in); +} + +void OptOutMutator::mutate(kir::Allocate*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::Sync*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::InitMagicZero*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::UpdateMagicZero*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::ForLoop*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::IfThenElse*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridReduction*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridBroadcast*) { + TORCH_INTERNAL_ASSERT(false, "Not implemented yet."); +} +void OptOutMutator::mutate(kir::GridWelford*) { + TORCH_INTERNAL_ASSERT(false, "Not 
implemented yet."); +} + +void OptOutMutator::removeExpr(IrContainer* container, Expr* expr) { + container->removeExpr(expr); +} } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/mutator.h b/torch/csrc/jit/codegen/cuda/mutator.h index f9ec40ca9f5..433de485cf1 100644 --- a/torch/csrc/jit/codegen/cuda/mutator.h +++ b/torch/csrc/jit/codegen/cuda/mutator.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/non_divisible_split.h b/torch/csrc/jit/codegen/cuda/non_divisible_split.h index f17bf2d6246..6706c9f072d 100644 --- a/torch/csrc/jit/codegen/cuda/non_divisible_split.h +++ b/torch/csrc/jit/codegen/cuda/non_divisible_split.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/ops/alias.cpp b/torch/csrc/jit/codegen/cuda/ops/alias.cpp new file mode 100644 index 00000000000..14aff510911 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ops/alias.cpp @@ -0,0 +1,115 @@ +#include +#include +#include +#include + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +namespace { + +//! Transform TensorView according to keep, merge, and split transformations. +//! Trivial reduction and broadcast transformations are handled separately. +//! It is recommend to use the composite ops view function, which will call +//! the analyzeView function to generate the appropriate transformations. +//! +//! For example: +//! original sizes = [2, 10, 40] +//! new_size = [2, 10, 2, 20] +//! auto analysis = analyzeView(TV0, original_sizes, new_sizes) +//! auto TV1 = TV0->view(analysis.transforms); +//! +//! Transforms = [(Keep I0), (Keep I1), (Split I2 by 2)] +//! Before: TV0[I0, I1, I2] +//! After: TV0[I0, I1, 2, ceilDiv(I2, 2)] +//! +TensorView* applyViewTransforms( + TensorView* tv, + const std::vector>& transforms) { + TORCH_INTERNAL_ASSERT( + !tv->hasComputeAt(), + "Cannot modify rfactor domain after compute at has been set."); + + TORCH_INTERNAL_ASSERT(tv->nDims() > 0, "Tried to view a 0-dim TensorView"); + + TORCH_CHECK( + !tv->domain()->hasRFactor(), + "Cannot call view on the same TensorView twice."); + + TORCH_INTERNAL_ASSERT(!transforms.empty()); + + TensorView* consumer = IrBuilder::create( + tv->container(), + tv->domain()->view(transforms), + tv->getDataType().value()); + + IrBuilder::create(tv->container(), consumer, tv); + + return consumer; +} + +} // namespace + +TensorView* view( + TensorView* x, + const std::vector& original_sizes, + const std::vector& new_sizes) { + TORCH_INTERNAL_ASSERT(x->nDims() == original_sizes.size()); + + auto analyze_view = analyzeView(x, original_sizes, new_sizes); + + auto reduction = (!analyze_view.trivial_reduction_axes.empty()) + ? sum(x, analyze_view.trivial_reduction_axes) + : x; + + auto view = (!analyze_view.transforms.empty()) + ? applyViewTransforms(reduction, analyze_view.transforms) + : reduction; + + return (analyze_view.has_broadcast) + ? broadcast(view, analyze_view.broadcast_axes) + : view; +} + +TensorView* squeeze(TensorView* x, const std::vector& sizes) { + TORCH_INTERNAL_ASSERT(x->nDims() == sizes.size()); + + std::vector trivial_reduction_axes; + for (const auto idx : c10::irange(sizes.size())) { + if (sizes[idx] == 1) { + trivial_reduction_axes.push_back(idx); + } + } + return (trivial_reduction_axes.empty()) ? 
x : sum(x, trivial_reduction_axes); +} + +TensorView* squeeze(TensorView* x, const std::vector& sizes, int dim) { + TORCH_INTERNAL_ASSERT(x->nDims() == sizes.size()); + if (dim < 0) { + dim = (int)(x->nDims()) + dim; + } + TORCH_INTERNAL_ASSERT(dim >= 0 && dim < x->nDims()); + if (sizes[dim] == 1) { + return sum(x, {dim}); + } else { + return set(x); + } +} + +TensorView* unsqueeze(TensorView* x, int dim) { + if (dim < 0) { + dim = (int)(x->nDims()) + dim + 1; + } + TORCH_INTERNAL_ASSERT(dim >= 0 && dim <= x->nDims()); + + std::vector broadcast_axes(x->nDims() + 1, false); + broadcast_axes[dim] = true; + return broadcast(x, broadcast_axes); +} + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ops/alias.h b/torch/csrc/jit/codegen/cuda/ops/alias.h new file mode 100644 index 00000000000..8003e3268b3 --- /dev/null +++ b/torch/csrc/jit/codegen/cuda/ops/alias.h @@ -0,0 +1,38 @@ +#pragma once + +#include + +#include +#include + +// +// The operations defined in this header is intended as user facing functions. +// The user will provide the necessary input TensorViews and the function will +// create the correct intermediate nodes and return the output TensorViews. +// + +namespace torch { +namespace jit { +namespace fuser { +namespace cuda { + +TORCH_CUDA_CU_API TensorView* view( + TensorView* x, + const std::vector& original_sizes, + const std::vector& new_sizes); + +TORCH_CUDA_CU_API TensorView* squeeze( + TensorView* x, + const std::vector& sizes); + +TORCH_CUDA_CU_API TensorView* squeeze( + TensorView* x, + const std::vector& sizes, + int dim); + +TORCH_CUDA_CU_API TensorView* unsqueeze(TensorView* x, int dim); + +} // namespace cuda +} // namespace fuser +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/codegen/cuda/ops/all_ops.h b/torch/csrc/jit/codegen/cuda/ops/all_ops.h index 1ebd2bb87f1..07d3eb944e8 100644 --- a/torch/csrc/jit/codegen/cuda/ops/all_ops.h +++ b/torch/csrc/jit/codegen/cuda/ops/all_ops.h @@ -1,4 +1,5 @@ #pragma once #include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/ops/composite.cpp b/torch/csrc/jit/codegen/cuda/ops/composite.cpp index 06bcf2d0494..c01b7230625 100644 --- a/torch/csrc/jit/codegen/cuda/ops/composite.cpp +++ b/torch/csrc/jit/codegen/cuda/ops/composite.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -8,9 +9,10 @@ namespace fuser { namespace cuda { ForwardDropoutResult dropout(TensorView* x, Val* prob) { - auto p1m = sub(new Double(1.), prob); - auto zero_check = add(eq(p1m, new Double(0.)), p1m); - auto scale = div(new Double(1.), zero_check); + auto p1m = sub(IrBuilder::create(x->container(), 1.), prob); + auto zero_check = + add(eq(p1m, IrBuilder::create(x->container(), 0.)), p1m); + auto scale = div(IrBuilder::create(x->container(), 1.), zero_check); return dropout(x, p1m, scale); } @@ -91,13 +93,14 @@ Val* fast_gelu(Val* x) { auto x_cube = mul(x, mul(x, x)); - auto inner_1 = mul(new Double(kKappa), x_cube); + auto inner_1 = mul(IrBuilder::create(x->container(), kKappa), x_cube); auto inner_2 = add(x, inner_1); - auto inner_3 = mul(new Double(kBeta), inner_2); + auto inner_3 = mul(IrBuilder::create(x->container(), kBeta), inner_2); auto tanh_inner = tanh(inner_3); - auto out = mul(x, add(new Double(1.), tanh_inner)); - auto y = mul(new Double(0.5), out); + auto out = + mul(x, add(IrBuilder::create(x->container(), 1.), tanh_inner)); + auto y = mul(IrBuilder::create(x->container(), 0.5), out); return y; } @@ -111,21 +114,25 
@@ Val* fast_gelu_backward(Val* dy, Val* x) { auto x_sq = mul(x, x); auto x_cube = mul(x, x_sq); - auto inner_1 = mul(new Double(kKappa), x_cube); + auto inner_1 = mul(IrBuilder::create(x->container(), kKappa), x_cube); auto inner_2 = add(x, inner_1); - auto inner_3 = mul(new Double(kBeta), inner_2); + auto inner_3 = mul(IrBuilder::create(x->container(), kBeta), inner_2); auto tanh_inner = tanh(inner_3); - auto left = mul(new Double(0.5), x); - auto right = add(new Double(1.), tanh_inner); + auto left = mul(IrBuilder::create(x->container(), 0.5), x); + auto right = add(IrBuilder::create(x->container(), 1.), tanh_inner); - auto left_derivative = mul(new Double(0.5), right); + auto left_derivative = + mul(IrBuilder::create(x->container(), 0.5), right); auto tanh_inner_sq = mul(tanh_inner, tanh_inner); - auto tanh_derivative = sub(new Double(1), tanh_inner_sq); + auto tanh_derivative = + sub(IrBuilder::create(x->container(), 1), tanh_inner_sq); - auto constant_mul_x_sq = mul(new Double(kBeta * 3 * kKappa), x_sq); - auto inner_derivative = add(new Double(kBeta), constant_mul_x_sq); + auto constant_mul_x_sq = + mul(IrBuilder::create(x->container(), kBeta * 3 * kKappa), x_sq); + auto inner_derivative = + add(IrBuilder::create(x->container(), kBeta), constant_mul_x_sq); auto right_derivative = mul(left, mul(tanh_derivative, inner_derivative)); auto dx = mul(dy, add(left_derivative, right_derivative)); @@ -139,79 +146,30 @@ Val* gelu_backward(Val* dy, Val* x) { constexpr double kAlpha = M_2_SQRTPI * M_SQRT1_2 * 0.5; const double kHalf = 0.5; - auto cdf_1 = mul(x, new Double(M_SQRT1_2)); + auto cdf_1 = mul(x, IrBuilder::create(x->container(), M_SQRT1_2)); auto cdf_2 = erf(cdf_1); - auto cdf_3 = add(cdf_2, new Double(1.)); - auto cdf_4 = mul(cdf_3, new Double(kHalf)); + auto cdf_3 = add(cdf_2, IrBuilder::create(x->container(), 1.)); + auto cdf_4 = mul(cdf_3, IrBuilder::create(x->container(), kHalf)); auto pdf_1 = mul(x, x); - auto pdf_2 = mul(pdf_1, new Double(-kHalf)); + auto pdf_2 = mul(pdf_1, IrBuilder::create(x->container(), -kHalf)); auto pdf_3 = exp(pdf_2); - auto out = addcmul(cdf_4, x, pdf_3, new Double(kAlpha)); + auto out = addcmul( + cdf_4, x, pdf_3, IrBuilder::create(x->container(), kAlpha)); auto dx = mul(out, dy); return dx; } -namespace { +Val* tanh_backward(Val* dy, Val* tanh_x) { + TORCH_INTERNAL_ASSERT(dy != nullptr, "Grad Output is invalid."); + TORCH_INTERNAL_ASSERT(tanh_x != nullptr, "Input is invalid"); -//! Transform TensorView according to keep, merge, and split transformations. -//! Trivial reduction and broadcast transformations are handled separately. -//! It is recommend to use the composite ops view function, which will call -//! the analyzeView function to generate the appropriate transformations. -//! -//! For example: -//! original sizes = [2, 10, 40] -//! new_size = [2, 10, 2, 20] -//! auto analysis = analyzeView(TV0, original_sizes, new_sizes) -//! auto TV1 = TV0->view(analysis.transforms); -//! -//! Transforms = [(Keep I0), (Keep I1), (Split I2 by 2)] -//! Before: TV0[I0, I1, I2] -//! After: TV0[I0, I1, 2, ceilDiv(I2, 2)] -//! 
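// For reference, a standalone scalar sketch of the math used by fast_gelu()
// and gelu_backward() above. This is illustrative only (not part of the
// fusion IR code); kAlpha mirrors the constant in gelu_backward(), while the
// kBeta and kKappa values are the usual tanh-approximation constants and are
// assumed to match the definitions above this hunk.
//
//   gelu(x)  = x * Phi(x),           Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))
//   gelu'(x) = Phi(x) + x * phi(x),  phi(x) = exp(-x^2 / 2) / sqrt(2 * pi)
//
#include <cmath>

inline double fast_gelu_ref(double x) {
  const double kBeta = std::sqrt(2.0 / M_PI); // assumed value of kBeta
  const double kKappa = 0.044715;             // assumed value of kKappa
  return 0.5 * x * (1.0 + std::tanh(kBeta * (x + kKappa * x * x * x)));
}

inline double gelu_backward_ref(double dy, double x) {
  const double kAlpha = M_2_SQRTPI * M_SQRT1_2 * 0.5; // 1 / sqrt(2 * pi)
  const double cdf = 0.5 * (1.0 + std::erf(x * M_SQRT1_2));
  const double pdf = std::exp(-0.5 * x * x);
  return dy * (cdf + x * pdf * kAlpha);
}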
-TensorView* applyViewTransforms( - TensorView* tv, - const std::vector>& transforms) { - TORCH_INTERNAL_ASSERT( - !tv->hasComputeAt(), - "Cannot modify rfactor domain after compute at has been set."); - - TORCH_INTERNAL_ASSERT(tv->nDims() > 0, "Tried to view a 0-dim TensorView"); - - TORCH_CHECK( - !tv->domain()->hasRFactor(), - "Cannot call view on the same TensorView twice."); - - TORCH_INTERNAL_ASSERT(!transforms.empty()); - - TensorView* consumer = - new TensorView(tv->domain()->view(transforms), tv->getDataType().value()); - - new ViewOp(consumer, tv); - - return consumer; -} - -} // namespace - -TensorView* view( - TensorView* x, - const std::vector& original_sizes, - const std::vector& new_sizes) { - auto analyze_view = analyzeView(x, original_sizes, new_sizes); - - auto reduction = (!analyze_view.trivial_reduction_axes.empty()) - ? sum(x, analyze_view.trivial_reduction_axes) - : x; - - auto view = (!analyze_view.transforms.empty()) - ? applyViewTransforms(reduction, analyze_view.transforms) - : reduction; - - return (analyze_view.has_broadcast) - ? broadcast(view, analyze_view.broadcast_axes) - : view; + auto one = IrBuilder::create(tanh_x->container(), 1.); + auto tanh_sq = mul(tanh_x, tanh_x); + auto sub_tanh_sq = sub(one, tanh_sq); + auto dx = mul(dy, sub_tanh_sq); + return dx; } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/ops/composite.h b/torch/csrc/jit/codegen/cuda/ops/composite.h index 4470f0cc6f0..63e17629f40 100644 --- a/torch/csrc/jit/codegen/cuda/ops/composite.h +++ b/torch/csrc/jit/codegen/cuda/ops/composite.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -48,11 +48,7 @@ TORCH_CUDA_CU_API LstmResult lstm( TORCH_CUDA_CU_API Val* fast_gelu(Val* x); TORCH_CUDA_CU_API Val* fast_gelu_backward(Val* dy, Val* x); TORCH_CUDA_CU_API Val* gelu_backward(Val* dy, Val* x); - -TORCH_CUDA_CU_API TensorView* view( - TensorView* x, - const std::vector& x_sizes, - const std::vector& new_sizes); +TORCH_CUDA_CU_API Val* tanh_backward(Val* dy, Val* tanh_x); } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp b/torch/csrc/jit/codegen/cuda/ops/normalization.cpp index 19201687553..4a473f66203 100644 --- a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp +++ b/torch/csrc/jit/codegen/cuda/ops/normalization.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace torch { @@ -23,7 +24,7 @@ TensorView* softmax(TensorView* x, int dim) { auto exp_val = exp(x_max_sub); auto sum_exp = sum(exp_val, {kReductionAxis}); auto bcast_sum = broadcast(sum_exp, broadcast_mask); - auto y = div(exp_val, bcast_sum); + auto y = mul(exp_val, reciprocal(bcast_sum)); return y; } @@ -88,7 +89,7 @@ ForwardNormResult layer_norm( std::vector inner_reduction_axes(kNormShapeNumDims); std::vector inner_broadcast_mask(kNumberOfDims, false); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(x->container(), 1); for (const auto idx : c10::irange(kNormShapeNumDims)) { const size_t axis = kNumberOfDims - 1 - idx; inner_reduction_axes[idx] = axis; @@ -102,7 +103,7 @@ ForwardNormResult layer_norm( auto x_sub_mean = sub(x, mean_bcast); auto var_sum_bcast = broadcast(welford_out.var_sum, inner_broadcast_mask); - auto var = div(var_sum_bcast, num_features); + auto var = mul(var_sum_bcast, reciprocal(num_features)); auto var_eps = add(var, eps); auto invstd = rsqrt(var_eps); @@ -156,7 +157,7 @@ BackwardNormResult layer_norm_backward( std::vector inner_reduction_axes(kNormShapeNumDims); std::vector 
inner_broadcast_mask(kNumberOfDims, false); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(x->container(), 1); for (const auto idx : c10::irange(kNormShapeNumDims)) { const size_t axis = kNumberOfDims - 1 - idx; inner_reduction_axes[idx] = axis; @@ -243,7 +244,7 @@ ForwardNormResult batch_norm( std::vector reduction_axes; std::vector broadcast_mask(kNumberOfDims, false); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(x->container(), 1); for (const auto axis : c10::irange(kNumberOfDims)) { if (axis != c_axis) { @@ -267,13 +268,15 @@ ForwardNormResult batch_norm( kTraining, "When running stats are provided, batch stats should only be computed during training"); - auto rev_momentum = sub(new Double(1.0), momentum); + auto rev_momentum = + sub(IrBuilder::create(x->container(), 1.0), momentum); auto current_mean_hat = mul(welford_out.avg, momentum); auto mean_hat = mul(running_mean, rev_momentum); auto new_mean_hat = add(mean_hat, current_mean_hat); - auto num_feature_decrement = sub(num_features, new Int(1)); - auto unbiased_var = div(welford_out.var_sum, num_feature_decrement); + auto num_feature_decrement = sub(num_features, x->container()->oneVal()); + auto unbiased_var = + mul(welford_out.var_sum, reciprocal(num_feature_decrement)); auto current_var_hat = mul(unbiased_var, momentum); auto var_hat = mul(running_var, rev_momentum); auto new_var_hat = add(var_hat, current_var_hat); @@ -301,14 +304,14 @@ ForwardNormResult batch_norm( fusion->aliasOutputToInput(casted_output, input_to_cast); }; - if (fusion->hasInput(running_mean)) { + if (running_mean->isFusionInput()) { fusion->addOutput(new_mean_hat); fusion->aliasOutputToInput(new_mean_hat, running_mean); } else { cast_to_input_dtype(running_mean, new_mean_hat); } - if (fusion->hasInput(running_var)) { + if (running_var->isFusionInput()) { fusion->addOutput(new_var_hat); fusion->aliasOutputToInput(new_var_hat, running_var); } else { @@ -320,7 +323,7 @@ ForwardNormResult batch_norm( auto mean_bcast = broadcast(mean, broadcast_mask); auto x_sub_mean = sub(x, mean_bcast); - auto var = div(welford_out.var_sum, num_features); + auto var = mul(welford_out.var_sum, reciprocal(num_features)); auto var_eps = add(var, eps); invstd = rsqrt(var_eps); auto invstd_bcast = broadcast(invstd, broadcast_mask); @@ -414,19 +417,6 @@ BackwardNormResult batch_norm_backward( mean = broadcast(mean, broadcast_mask); - TensorView* weight_val = nullptr; - if (weight == nullptr) { - weight_val = TensorViewBuilder() - .ndims(kNumberOfDims) - .dtype(input->getDataType().value()) - .shape(std::vector(kNumberOfDims, 1)) - .build(); - new UnaryOp( - UnaryOpType::Set, weight_val->as(), (new Double(1.0))->as()); - } else { - weight_val = broadcast(weight, broadcast_mask); - } - auto norm = reciprocal(num_features); auto grad_output_sum = sum(grad_output, reduction_axes); @@ -435,7 +425,16 @@ BackwardNormResult batch_norm_backward( auto grad_mean = broadcast(mul(grad_output_sum, norm), broadcast_mask); auto proj_scale = broadcast(mul(mul(dot_p, norm), mul(invstd, invstd)), broadcast_mask); - auto grad_scale = mul(broadcast(invstd, broadcast_mask), weight_val); + TensorView* grad_scale = nullptr; + + if (weight == nullptr) { + grad_scale = + mul(broadcast(invstd, broadcast_mask), + IrBuilder::create(input->container(), 1)); + } else { + grad_scale = mul( + broadcast(invstd, broadcast_mask), broadcast(weight, broadcast_mask)); + } TensorView* grad_input = nullptr; if (kTraining) { @@ -496,7 +495,7 @@ ForwardNormResult 
instance_norm( std::vector x_reduction_axes; std::vector x_broadcast_mask(kNumberOfDims, false); - Val* N = new Double(1); + Val* N = IrBuilder::create(x->container(), 1); for (const auto axis : c10::irange(kNumberOfDims)) { if (axis != kBatchDim && axis != kChannelsDim) { x_reduction_axes.push_back(axis); @@ -504,7 +503,7 @@ ForwardNormResult instance_norm( N = mul(N, x->domain()->domain()[axis]->extent()); } } - Val* B = new Double(1); + Val* B = IrBuilder::create(x->container(), 1); B = mul(B, x->domain()->domain()[kBatchDim]->extent()); std::vector channels_only_broadcast_mask(kNumberOfDims, false); @@ -523,7 +522,8 @@ ForwardNormResult instance_norm( // updating running mean and running var if (running_mean != nullptr && running_var != nullptr) { - auto rev_momentum = sub(new Double(1.0), momentum); + auto rev_momentum = + sub(IrBuilder::create(x->container(), 1.0), momentum); auto current_mean_hat = mul(welford_out.avg, momentum); auto mean_hat = mul(running_mean, rev_momentum); auto new_mean_hat = add(mean_hat, current_mean_hat); @@ -531,12 +531,13 @@ ForwardNormResult instance_norm( // NS: static_cast to workaround VC++ error, see // https://godbolt.org/z/6Prd77xYs auto new_mean_sum = sum(new_mean_hat, {static_cast(kBatchDim)}); - auto new_mean_channels_only = div(new_mean_sum, B); + auto new_mean_channels_only = mul(new_mean_sum, reciprocal(B)); fusion->addOutput(new_mean_channels_only); fusion->aliasOutputToInput(new_mean_channels_only, running_mean); - auto num_feature_decrement = sub(N, new Int(1)); - auto unbiased_var = div(welford_out.var_sum, num_feature_decrement); + auto num_feature_decrement = sub(N, x->container()->oneVal()); + auto unbiased_var = + mul(welford_out.var_sum, reciprocal(num_feature_decrement)); auto current_var_hat = mul(unbiased_var, momentum); auto var_hat = mul(running_var, rev_momentum); auto new_var_hat = add(var_hat, current_var_hat); @@ -544,7 +545,7 @@ ForwardNormResult instance_norm( // NS: static_cast to workaround VC++ error, see // https://godbolt.org/z/6Prd77xYs auto new_var_sum = sum(new_var_hat, {static_cast(kBatchDim)}); - auto new_var_channels_only = div(new_var_sum, B); + auto new_var_channels_only = mul(new_var_sum, reciprocal(B)); fusion->addOutput(new_var_channels_only); fusion->aliasOutputToInput(new_var_channels_only, running_var); } @@ -553,7 +554,7 @@ ForwardNormResult instance_norm( auto mean_bcast = broadcast(mean, x_broadcast_mask); auto x_sub_mean = sub(x, mean_bcast); - auto var = div(welford_out.var_sum, N); + auto var = mul(welford_out.var_sum, reciprocal(N)); auto var_eps = add(var, eps); invstd = rsqrt(var_eps); auto invstd_bcast = broadcast(invstd, x_broadcast_mask); diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.h b/torch/csrc/jit/codegen/cuda/ops/normalization.h index dae58462b92..b28cdf6b33c 100644 --- a/torch/csrc/jit/codegen/cuda/ops/normalization.h +++ b/torch/csrc/jit/codegen/cuda/ops/normalization.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp index 3dcb58335a4..d966fc21a97 100644 --- a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp +++ b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.cpp @@ -5,8 +5,6 @@ #include #include #include -#include -#include #include #include @@ -102,7 +100,6 @@ void ParallelDimensionMap::populateDimensionMapWithSingleCASet( TORCH_INTERNAL_ASSERT(dom_set.size() == 1); const auto gpu_lower = GpuLower::current(); - 
kir::IrBuilder ir_builder(gpu_lower->kernel()); // pt is used by only one concrete domain auto id = *dom_set.begin(); @@ -110,16 +107,16 @@ void ParallelDimensionMap::populateDimensionMapWithSingleCASet( if (it != constant_extent_map_.end()) { if (it->second.size() == 1) { - dim_map_.insert({pt, ir_builder.create(*(it->second.begin()))}); + dim_map_.insert({pt, IrBuilder::create(*(it->second.begin()))}); exact_types_.insert(pt); } else { // Multiple constant dimensions found; Use the corresponding // symbolic parallel dim - dim_map_.insert({pt, kir::NamedScalar::getParallelDim(pt)}); + dim_map_.insert({pt, NamedScalar::getParallelDim(pt)}); } } else { // Prefer to use blockDim/gridDim if not constant - dim_map_.insert({pt, kir::NamedScalar::getParallelDim(pt)}); + dim_map_.insert({pt, NamedScalar::getParallelDim(pt)}); exact_types_.insert(pt); } } @@ -130,11 +127,10 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet( TORCH_INTERNAL_ASSERT(dom_set.size() > 1); const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); bool all_equal = true; // Use nullptr to signal it's not initialied yet - kir::Val* known_dimension = nullptr; + Val* known_dimension = nullptr; // Use -1 to signal it's not initialied yet int64_t known_const = -1; @@ -172,7 +168,7 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet( // At this point, it still remains undetermined whether this id // matches with those previously looked at. Constant check failed, // but symbolic matching may succeed. - auto this_dimension = gpu_lower->lowerValue(concrete_id->extent()); + auto this_dimension = concrete_id->extent(); if (known_dimension == nullptr) { // No previous dimension found yet known_dimension = this_dimension; @@ -191,15 +187,14 @@ void ParallelDimensionMap::populateDimensionMapWithMultipleCASet( } // Use the const value, if found, as its dimension if (all_equal && known_const != -1) { - dim_map_.insert({pt, ir_builder.create(known_const)}); + dim_map_.insert({pt, IrBuilder::create(known_const)}); } else { - dim_map_.insert({pt, kir::NamedScalar::getParallelDim(pt)}); + dim_map_.insert({pt, NamedScalar::getParallelDim(pt)}); } } void ParallelDimensionMap::adjustMappingsForWarpPadding() { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); // If TIDx is padded to a multiple of the warp size, mark it as // non-exact. @@ -215,7 +210,7 @@ void ParallelDimensionMap::adjustMappingsForWarpPadding() { // If the dimension of TIDx is actually a multple of the warp size // before padding, it can be left as exact if (isExact(tidx_pt)) { - auto tidx_dim = dynamic_cast(get(tidx_pt)); + auto tidx_dim = dynamic_cast(get(tidx_pt)); if (tidx_dim && tidx_dim->isConst()) { auto tidx_dim_val = tidx_dim->value().value(); if (tidx_dim_val % warp_size == 0) { @@ -229,17 +224,17 @@ void ParallelDimensionMap::adjustMappingsForWarpPadding() { // single warp, use the constant warp size as the dimension of // TIDx. Otherwise, jsut use blockDim.x. 
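// Standalone sketch of the warp-padding adjustment above: once TIDx is padded
// to a multiple of the warp size, it stays "exact" only if its constant extent
// was already a warp multiple; a single padded warp can use the warp size
// itself as its dimension, while the general case falls back to blockDim.x
// (modelled here as rounding up, since this sketch has no symbolic values).
// A 32-thread warp is assumed purely for illustration.
#include <cstdint>

struct PaddedTidx {
  int64_t dimension;  // extent used for threadIdx.x after padding
  bool exact;         // whether TIDx predicates can still be omitted
};

PaddedTidx padTidxToWarp(int64_t tidx_extent, bool is_single_warp,
                         int64_t warp_size = 32) {
  PaddedTidx out;
  // Exactness survives only when the unpadded extent is already a warp multiple.
  out.exact = (tidx_extent % warp_size == 0);
  out.dimension = is_single_warp
      ? warp_size
      : ((tidx_extent + warp_size - 1) / warp_size) * warp_size;
  return out;
}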
if (warp_info.is_tidx_single_warp) { - dim_map_.at(ParallelType::TIDx) = ir_builder.create(warp_size); + dim_map_.at(ParallelType::TIDx) = IrBuilder::create(warp_size); } else { dim_map_.at(ParallelType::TIDx) = - kir::NamedScalar::getParallelDim(ParallelType::TIDx); + NamedScalar::getParallelDim(ParallelType::TIDx); } // TIDx is no longer exact exact_types_.erase(ParallelType::TIDx); } -kir::Val* ParallelDimensionMap::get(ParallelType pt) const { +Val* ParallelDimensionMap::get(ParallelType pt) const { TORCH_INTERNAL_ASSERT(isParallelTypeThread(pt), "Invalid ParallelType: ", pt); auto it = dim_map_.find(pt); if (it == dim_map_.end()) { @@ -261,7 +256,7 @@ IterDomain* ParallelDimensionMap::getCAMappedConcreteDomain(IterDomain* id) { // Symbolically compares equality of two KIR vals. Comparison is done // conservatively, so returning false does not guarantee non-equality. -bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { +bool ParallelDimensionMap::equalDim(Val* dim1, Val* dim2) { TORCH_INTERNAL_ASSERT(dim1 != nullptr && dim2 != nullptr); if (dim1 == dim2) { @@ -269,8 +264,8 @@ bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { } // When Both are Int, they are same if both have the same constant - auto dim1_int = dynamic_cast(dim1); - auto dim2_int = dynamic_cast(dim2); + auto dim1_int = dynamic_cast(dim1); + auto dim2_int = dynamic_cast(dim2); if (dim1_int && dim2_int) { if (dim1_int->isConst() && dim2_int->isConst()) { return dim1_int->value() == dim2_int->value(); @@ -279,8 +274,8 @@ bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { // When both are NamedScalar, they are same if Both have the same // name - auto dim1_ns = dynamic_cast(dim1); - auto dim2_ns = dynamic_cast(dim2); + auto dim1_ns = dynamic_cast(dim1); + auto dim2_ns = dynamic_cast(dim2); if (dim1_ns && dim2_ns) { return dim1_ns->name() == dim2_ns->name(); } @@ -297,12 +292,12 @@ bool ParallelDimensionMap::equalDim(kir::Val* dim1, kir::Val* dim2) { // If both are BinaryOp or UnaryOp, check their inputs. Since these // Vals are IterDomain extents, UnaryOp should not occur, but // checking shouldn't be harmful. - if ((dim1_def->isA() && dim2_def->isA() && - (dim1_def->as()->operation() == - dim2_def->as()->operation())) || - (dim1_def->isA() && dim2_def->isA() && - (dim1_def->as()->operation() == - dim2_def->as()->operation()))) { + if ((dim1_def->isA() && dim2_def->isA() && + (dim1_def->as()->getBinaryOpType() == + dim2_def->as()->getBinaryOpType())) || + (dim1_def->isA() && dim2_def->isA() && + (dim1_def->as()->getUnaryOpType() == + dim2_def->as()->getUnaryOpType()))) { for (const auto i : c10::irange(dim1_def->inputs().size())) { (void)i; // Suppress unused variable warning if (!equalDim(dim1_def->inputs()[0], dim2_def->inputs()[0])) { @@ -321,7 +316,7 @@ std::string ParallelDimensionMap::toString() const { ss << pt << ": "; auto dim = get(pt); if (dim != nullptr) { - ss << kir::toString(dim); + ss << dim->toString(); if (isExact(pt)) { ss << ", exact"; } else { diff --git a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h index d05c17adea2..03bd513396f 100644 --- a/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h +++ b/torch/csrc/jit/codegen/cuda/parallel_dimension_map.h @@ -21,7 +21,7 @@ class TORCH_CUDA_CU_API ParallelDimensionMap { //! Returns the dimension of a ParallelType. nullptr is returned if //! a ParallelType is unused. 
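// Standalone sketch of the conservative extent comparison that equalDim above
// performs on the fusion IR: identical pointers, equal integer constants, or
// identically named scalars compare equal, and defined expressions only
// compare equal when they apply the same operation to (conservatively) equal
// inputs. Anything else is reported as "unknown", i.e. false. The Extent
// struct is a stand-in for IR Vals, purely for illustration.
#include <memory>
#include <string>
#include <vector>

struct Extent {
  bool has_const = false;
  long const_value = 0;                         // meaningful when has_const
  std::string name;                             // e.g. "blockDim.x" if a named scalar
  std::string op;                               // e.g. "mul"; empty for leaves
  std::vector<std::shared_ptr<Extent>> inputs;  // operands when op is set
};

bool equalDimSketch(const Extent* a, const Extent* b) {
  if (a == b) return true;
  if (a->has_const && b->has_const) return a->const_value == b->const_value;
  if (!a->name.empty() && !b->name.empty()) return a->name == b->name;
  if (!a->op.empty() && a->op == b->op && a->inputs.size() == b->inputs.size()) {
    for (size_t i = 0; i < a->inputs.size(); ++i) {
      if (!equalDimSketch(a->inputs[i].get(), b->inputs[i].get())) return false;
    }
    return true;
  }
  return false;  // cannot prove equality; callers treat this as "not equal"
}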
- kir::Val* get(ParallelType pt) const; + Val* get(ParallelType pt) const; //! True if the dimension of a ParallelType is known to be exact bool isExact(ParallelType pt) const; @@ -29,7 +29,7 @@ class TORCH_CUDA_CU_API ParallelDimensionMap { std::string toString() const; //! Symbolically analyze if two extent vals are equal - static bool equalDim(kir::Val* dim1, kir::Val* dim2); + static bool equalDim(Val* dim1, Val* dim2); private: //! Register the extent of an IterDomain if its constant @@ -54,7 +54,7 @@ class TORCH_CUDA_CU_API ParallelDimensionMap { private: //! Maps from parallel types to dimensions, which are constant if //! a unique value is found. - std::unordered_map dim_map_; + std::unordered_map dim_map_; //! Set of parallel types whose dimensions are identified to be //! exactly the same as extents of mapped domains. std::unordered_set exact_types_; diff --git a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h b/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h index 0bf8ae39277..3bfb32d38bc 100644 --- a/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h +++ b/torch/csrc/jit/codegen/cuda/parallel_type_bitmap.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/parser.cpp b/torch/csrc/jit/codegen/cuda/parser.cpp index 11c27cffec2..94dad076db8 100644 --- a/torch/csrc/jit/codegen/cuda/parser.cpp +++ b/torch/csrc/jit/codegen/cuda/parser.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -33,25 +34,18 @@ constexpr auto kNumBinaryFloatOps = 3; constexpr auto kNumBinaryComparisonOps = 12; constexpr auto kNumBinaryCastOps = 14; -constexpr auto kNumBinaryOpsWithAlpha = 4; +constexpr auto kNumBinaryOpsWithAlpha = 6; constexpr auto kNumLerpOps = 2; constexpr auto kNumLayernormFwd = 2; constexpr auto kNumBatchnormFwd = 3; constexpr auto kNumInstancenormFwd = 1; constexpr auto kNumSumToSize = 2; constexpr auto kNumAutocastOps = 2; -// constexpr auto kNumViewSize = 2; +constexpr auto kNumAliasDimOps = 2; +constexpr auto kNumViewOps = 2; namespace { -std::vector getTensorSizes(TensorTypePtr const& tensor_type) { - TORCH_INTERNAL_ASSERT(tensor_type != nullptr, "Input must be a Tensor."); - auto optional_sizes = tensor_type->sizes().concrete_sizes(); - TORCH_INTERNAL_ASSERT( - optional_sizes.has_value(), "Missing size information for the tensor."); - return optional_sizes.value(); -} - #define REGISTER_PARSE_RULE(op, func_body, ...) \ registerParseRule( \ op, \ @@ -59,7 +53,8 @@ std::vector getTensorSizes(TensorTypePtr const& tensor_type) { -> void func_body, \ __VA_ARGS__) -const auto& sizeAttr = Symbol::attr("profiled_size"); +const auto& reductionSizeAttr = Symbol::attr("profiled_reduction_size"); +const auto& viewSizeAttr = Symbol::attr("profiled_view_size"); const auto& intListAttr = Symbol::attr("profiled_int_list"); const auto& intAttr = Symbol::attr("profiled_int"); const auto& boolListAttr = Symbol::attr("profiled_bool_list"); @@ -283,8 +278,9 @@ class ValueHolder { if (iter_val != vals_.end()) { return iter_val->second; } - // patching scalar value, because memory format doesn't carry real meaning. - if (!is_tensor_view_) { + // patching scalar (tensor), memory format doesn't carry meaning and should + // just return the value as-is. 
+ if (!is_tensor_view_ || rank() == 0) { return std::get<1>(getEntry()); } MemoryFormat format_s; @@ -505,7 +501,7 @@ class IrParser { "Failure when register value: ", *(val->node()), " with type: ", - val->type()); + val->type()->repr_str()); MemoryFormat format; Val* operand = nullptr; std::tie(format, operand) = value_map_[val->unique()].getEntry(); @@ -523,7 +519,6 @@ class IrParser { (opt_dtype.value() == DataType::Half || opt_dtype.value() == DataType::BFloat16)) { Val* promoted_val = castOp(DataType::Float, operand); - // value_map_.emplace(val->unique(), ValueHolder(promoted_val, format)); value_map_[val->unique()] = ValueHolder(promoted_val, format); } } @@ -688,7 +683,9 @@ class IrParser { "aten::add(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", "aten::add(Tensor self, Scalar other, Scalar alpha) -> Tensor", "aten::sub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", - "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor"}; + "aten::sub(Tensor self, Scalar other, Scalar alpha) -> Tensor", + "aten::rsub(Tensor self, Tensor other, *, Scalar alpha) -> Tensor", + "aten::rsub(Tensor self, Scalar other, Scalar alpha) -> Tensor"}; for (auto signature : BinaryOpWithAlpha) { auto ptr_op = getOperatorForLiteral(signature); REGISTER_PARSE_RULE( @@ -704,6 +701,10 @@ class IrParser { BinaryOpType::Add, static_cast(&add_alpha))}, {aten::sub, + std::make_pair( + BinaryOpType::Sub, + static_cast(&sub_alpha))}, + {aten::rsub, std::make_pair( BinaryOpType::Sub, static_cast(&sub_alpha))}}); @@ -723,10 +724,12 @@ class IrParser { auto out = alpha->isOneInt() ? binaryOp( op_mapping[node->kind()].first, - lhs, - rhs, + node->kind() == aten::rsub ? rhs : lhs, + node->kind() == aten::rsub ? lhs : rhs, TypePromotion::default_op_config) - : op_mapping[node->kind()].second(lhs, rhs, alpha); + : (node->kind() == aten::rsub + ? op_mapping[node->kind()].second(rhs, lhs, alpha) + : op_mapping[node->kind()].second(lhs, rhs, alpha)); value_map.emplace( node->output()->unique(), ValueHolder(out, format)); }, @@ -1101,10 +1104,10 @@ class IrParser { list_val.pop_front(); Val* low = value_map.count(node->inputs()[1]->unique()) != 0 ? *value_map[node->inputs()[1]->unique()] - : new Double(std::numeric_limits::min()); + : IrBuilder::create(std::numeric_limits::min()); Val* high = value_map.count(node->inputs()[2]->unique()) != 0 ? 
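// Standalone sketch of the rsub lowering registered above: aten::rsub(self,
// other, alpha) computes other - alpha * self, so the parse rule reuses the
// subtraction path with the two operands swapped. Plain doubles stand in for
// IR values, and sub_alpha is assumed to compute lhs - alpha * rhs, the same
// convention aten::sub uses.
double sub_alpha_ref(double lhs, double rhs, double alpha) {
  return lhs - alpha * rhs;  // mirrors sub_alpha(lhs, rhs, alpha)
}

double rsub_ref(double self, double other, double alpha) {
  // The rule feeds (rhs, lhs) instead of (lhs, rhs) for aten::rsub,
  // which yields other - alpha * self.
  return sub_alpha_ref(other, self, alpha);
}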
*value_map[node->inputs()[2]->unique()] - : new Double(std::numeric_limits::max()); + : IrBuilder::create(std::numeric_limits::max()); auto out = clamp(operand, low, high); value_map.emplace(node->output()->unique(), out); @@ -1340,7 +1343,7 @@ class IrParser { running_mean = value_map[node->input(3)->unique()]->as(); TORCH_INTERNAL_ASSERT( - fusion->hasInput(running_mean), + running_mean->isFusionInput(), "IO_tensor `instance_norm::running_mean` can only be input tensor to fusion"); } @@ -1350,7 +1353,7 @@ class IrParser { running_var = value_map[node->input(4)->unique()]->as(); TORCH_INTERNAL_ASSERT( - fusion->hasInput(running_var), + running_var->isFusionInput(), "IO_tensor `instance_norm::running_var` can only be input tensor to fusion"); } @@ -1364,7 +1367,7 @@ class IrParser { Val* momentum_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto momentum = constant_as(node->input(6))) { - momentum_ptr = new Double(momentum.value()); + momentum_ptr = IrBuilder::create(momentum.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) momentum_ptr = value_map[node->input(6)->unique()]; @@ -1373,7 +1376,7 @@ class IrParser { Val* eps_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto eps = constant_as(node->input(7))) { - eps_ptr = new Double(eps.value()); + eps_ptr = IrBuilder::create(eps.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) eps_ptr = value_map[node->input(7)->unique()]; @@ -1458,7 +1461,7 @@ class IrParser { Val* momentum_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto momentum = constant_as(node->input(6))) { - momentum_ptr = new Double(momentum.value()); + momentum_ptr = IrBuilder::create(momentum.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) momentum_ptr = value_map[node->input(6)->unique()]; @@ -1467,7 +1470,7 @@ class IrParser { Val* eps_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto eps = constant_as(node->input(7))) { - eps_ptr = new Double(eps.value()); + eps_ptr = IrBuilder::create(eps.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) eps_ptr = value_map[node->input(7)->unique()]; @@ -1586,7 +1589,7 @@ class IrParser { Val* eps_ptr = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) if (auto eps = constant_as(node->input(9))) { - eps_ptr = new Double(eps.value()); + eps_ptr = IrBuilder::create(eps.value()); } else { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) eps_ptr = value_map[node->input(7)->unique()]; @@ -1704,7 +1707,7 @@ class IrParser { Val* eps_ptr = nullptr; if (auto eps = constant_as(node->input(4))) { - eps_ptr = new Double(eps.value()); + eps_ptr = IrBuilder::create(eps.value()); } else { eps_ptr = value_map[node->input(4)->unique()]; } @@ -2032,7 +2035,7 @@ class IrParser { keepdim.has_value(), "aten::mean cannot be fused with dynamic keepdim"); auto o_sum = sum(self, dims, keepdim.value()); - Val* num_features = new Double(1); + Val* num_features = IrBuilder::create(1); for (auto axis : dims) { if (axis < 0) { axis += int(self->nDims()); @@ -2347,6 +2350,31 @@ class IrParser { nullptr); } + { + auto ptr_op = getOperatorForLiteral( + "aten::tanh_backward(Tensor grad_output, Tensor output) -> Tensor"); + REGISTER_PARSE_RULE( + ptr_op, + { + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + c10::nullopt, + value_map[node->inputs()[0]->unique()], + 
value_map[node->inputs()[1]->unique()]); + auto grad_out = list_val.front(); + list_val.pop_front(); + auto self = list_val.front(); + list_val.pop_front(); + + auto grad_in = tanh_backward(grad_out, self); + value_map.emplace( + node->output()->unique(), ValueHolder(grad_in, format)); + }, + nullptr, + nullptr); + } + { auto ptr_op = getOperatorForLiteral( "aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor"); @@ -2392,37 +2420,111 @@ class IrParser { }); } - /* - // TODO: Enable view in parser by detecting non-alias view operation { - std::array View = { - "aten::view(Tensor(a) self, int[] size) -> Tensor(a)", - "aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)"}; - for (auto signature : View) { + std::array ViewOps = { + "prim::reshape_copy(Tensor self, int[] shape) -> Tensor", + "prim::view_copy(Tensor self, int[] size) -> Tensor"}; + for (auto signature : ViewOps) { auto ptr_op = getOperatorForLiteral(signature); REGISTER_PARSE_RULE( ptr_op, { auto self_value = node->inputs()[0]; - auto self = value_map[self_value->unique()]->as(); + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), value_map[self_value->unique()]); + auto self = list_val.front()->as(); + list_val.pop_front(); auto self_type = self_value->type()->cast(); TORCH_INTERNAL_ASSERT(self_type != nullptr); auto self_sizes = getTensorSizes(self_type); - auto size_optional = - constant_as>(node->input(1)); + auto view_sizes = constant_as>(node->input(1)); TORCH_INTERNAL_ASSERT( - size_optional.has_value(), "The size parameter is required."); + view_sizes.has_value(), "The size parameter is required."); - auto output = view(self, self_sizes, size_optional->vec()); + auto output = view(self, self_sizes, view_sizes->vec()); + value_map.emplace(node->output()->unique(), output); + }, + [](const Node* node) -> bool { + // Reject fusing node if view_sizes contains an inferred dimension + auto view_sizes = constant_as>(node->input(1)); + TORCH_INTERNAL_ASSERT( + view_sizes.has_value(), "The size parameter is required."); + for (auto axis_size : view_sizes->vec()) { + if (axis_size == -1) { + return false; + } + } + return true; + }, + nullptr); + } + } + + { + auto ptr_op = + getOperatorForLiteral("prim::squeeze_copy(Tensor self) -> Tensor"); + REGISTER_PARSE_RULE( + ptr_op, + { + auto self_value = node->inputs()[0]; + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), value_map[self_value->unique()]); + auto self = list_val.front()->as(); + list_val.pop_front(); + + auto self_type = self_value->type()->cast(); + TORCH_INTERNAL_ASSERT(self_type != nullptr); + auto self_sizes = getTensorSizes(self_type); + + auto output = squeeze(self, self_sizes); + value_map.emplace(node->output()->unique(), output); + }, + nullptr, + nullptr); + } + + { + std::array AliasOpWithDim = { + "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor", + "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor"}; + for (auto signature : AliasOpWithDim) { + auto ptr_op = getOperatorForLiteral(signature); + REGISTER_PARSE_RULE( + ptr_op, + { + auto self_value = node->inputs()[0]; + MemoryFormat format; + std::list list_val; + std::tie(format, list_val) = getConsistentValues( + MemoryFormat::Contiguous(), + value_map[node->inputs()[0]->unique()]); + auto self = list_val.front()->as(); + list_val.pop_front(); + + auto dim_value = constant_as(node->input(1)); + 
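// Standalone sketch of the fusion guard used by the view/reshape rules above:
// a reshape is only taken by the fuser when every requested size is explicit,
// i.e. no -1 "infer this dimension" placeholder. The element-count comparison
// here is an extra sanity check for the sketch, not something the guard above
// re-verifies.
#include <cstdint>
#include <vector>

bool canFuseViewSizes(const std::vector<int64_t>& input_sizes,
                      const std::vector<int64_t>& view_sizes) {
  int64_t in_numel = 1;
  for (int64_t s : input_sizes) in_numel *= s;
  int64_t out_numel = 1;
  for (int64_t s : view_sizes) {
    if (s == -1) {
      return false;  // inferred dimension: reject, matching the guard above
    }
    out_numel *= s;
  }
  return in_numel == out_numel;
}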
TORCH_INTERNAL_ASSERT(dim_value.has_value(), "dim is not valid"); + + TensorView* output = nullptr; + if (node->kind() == prim::unsqueeze_copy) { + output = unsqueeze(self, dim_value.value()); + } else { + auto self_type = self_value->type()->cast(); + TORCH_INTERNAL_ASSERT(self_type != nullptr); + auto self_sizes = getTensorSizes(self_type); + output = squeeze(self, self_sizes, dim_value.value()); + } value_map.emplace(node->output()->unique(), output); }, nullptr, nullptr); } } - */ } void processJitNode(const JitOp* node) { @@ -2456,9 +2558,9 @@ class IrParser { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) CgValue cg_val; if (auto ival = constant_as(val)) { - cg_val = new Double(ival.value()); + cg_val = IrBuilder::create(ival.value()); } else { - cg_val = new Double(); + cg_val = IrBuilder::create(); } value_map_.emplace(val->unique(), cg_val); return true; @@ -2467,9 +2569,9 @@ class IrParser { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) CgValue cg_val; if (auto ival = constant_as(val)) { - cg_val = new Int(ival.value()); + cg_val = IrBuilder::create(ival.value()); } else { - cg_val = new Int(); + cg_val = IrBuilder::create(); } value_map_.emplace(val->unique(), cg_val); return true; @@ -2478,9 +2580,9 @@ class IrParser { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) CgValue cg_val; if (auto ival = constant_as(val)) { - cg_val = new Bool(ival.value()); + cg_val = IrBuilder::create(ival.value()); } else { - cg_val = new Bool(); + cg_val = IrBuilder::create(); } value_map_.emplace(val->unique(), cg_val); return true; @@ -2496,7 +2598,11 @@ class IrParser { // TODO: we don't support list type in codegen yet; // This is a WAR to allow axes of reduction to be passed as constant list; // We simply ignore conversion if the scalar value is a constant; - return toIValue(val).has_value(); + auto ivalue = toIValue(val); + TORCH_INTERNAL_ASSERT( + ivalue.has_value(), + "List[T] is not supported as an argument by NvFuser. 
Use a Constant List."); + return true; } return false; } @@ -2566,7 +2672,10 @@ class IrParser { tensor_type->undefined()); } - cg_val = new TensorView(tensor_type); + cg_val = IrBuilder::create(tensor_type); + if (is_cpu_scalar(*tensor_type)) { + cg_val->as()->setCpuScalar(true); + } value_map_.emplace(val->unique(), ValueHolder(cg_val, format)); return true; } @@ -2611,7 +2720,7 @@ ProfileIValueOp* insertProfileIValueOp( return pn; } -void profileSize(ProfilingRecord* pr, Node* node, size_t offset) { +void profileReductionSize(ProfilingRecord* pr, Node* node, size_t offset) { auto pn = insertProfileIValueOp(node, offset, pr); const auto ivalue_profiler = [pr, pn](Stack& stack) { @@ -2631,12 +2740,14 @@ void profileSize(ProfilingRecord* pr, Node* node, size_t offset) { size_vec.clear(); } else { TORCH_INTERNAL_ASSERT( - false, "profileSize does not support data type: ", value.tagKind()); + false, + "profileReductionSize does not support data type: ", + value.tagKind()); } - if (!pn->hasAttribute(sizeAttr)) { - pn->is_(sizeAttr, size_vec); + if (!pn->hasAttribute(reductionSizeAttr)) { + pn->is_(reductionSizeAttr, size_vec); } else { - auto profiled_ints = pn->is(sizeAttr); + auto profiled_ints = pn->is(reductionSizeAttr); TORCH_INTERNAL_ASSERT( profiled_ints.size() == size_vec.size() && std::equal( @@ -2648,6 +2759,39 @@ void profileSize(ProfilingRecord* pr, Node* node, size_t offset) { pn->setCallback(ivalue_profiler); } +void profileViewSize(ProfilingRecord* pr, Node* node, size_t offset) { + auto pn = insertProfileIValueOp(node, offset, pr); + + const auto ivalue_profiler = [pr, pn](Stack& stack) { + std::lock_guard lock(pr->mutex_); + + // TODO: we don't care about merging multiple profiling runs as we don't + // support it at all; + int64_t frame_id = 0; + pop(stack, frame_id); + IValue value; + pop(stack, value); + TORCH_INTERNAL_ASSERT( + value.isIntList(), "profiling seeing the wrong data type"); + if (!pn->hasAttribute(viewSizeAttr)) { + pn->is_(viewSizeAttr, value.toIntVector()); + } else { + auto profiled_ints = pn->is(viewSizeAttr); + auto input_ints = value.toIntList(); + TORCH_INTERNAL_ASSERT( + profiled_ints.size() == input_ints.size() && + std::equal( + profiled_ints.begin(), + profiled_ints.end(), + input_ints.begin()), + "profiling ivalue doesn't support merge"); + } + push(stack, value); + }; + + pn->setCallback(ivalue_profiler); +} + void profileIntList(ProfilingRecord* pr, Node* node, size_t offset) { auto pn = insertProfileIValueOp(node, offset, pr); @@ -2943,7 +3087,7 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { // argument 1: reduction sizes; case 1: // TODO(profile_size): double check optional[size]? 
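// Standalone sketch of how the size-profiling callbacks above treat repeated
// runs: the first observed list is recorded on the profile node, and every
// later observation must match it exactly, since merging differing profiles
// is not supported. The class is an illustration only, not the profiler API.
#include <optional>
#include <stdexcept>
#include <vector>

class ProfiledIntList {
 public:
  void observe(const std::vector<long>& sizes) {
    if (!recorded_) {
      recorded_ = sizes;                 // first run: remember the list
    } else if (*recorded_ != sizes) {    // later runs: must be identical
      throw std::runtime_error("profiling ivalue doesn't support merge");
    }
  }
  const std::optional<std::vector<long>>& value() const { return recorded_; }

 private:
  std::optional<std::vector<long>> recorded_;
};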
- profileSize(pr, node, offset); + profileReductionSize(pr, node, offset); break; default: return false; @@ -2951,28 +3095,52 @@ bool insertProfileIValue(ProfilingRecord* pr, Node* node, size_t offset) { return true; } - /* - // TODO: Enable view in parser by detecting non-alias view operation - static auto view_schema = - getOperatorForLiteral( - "aten::view(Tensor(a) self, int[] size) -> Tensor(a)") - ->schema(); static auto reshape_schema = - getOperatorForLiteral( - "aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)") + getOperatorForLiteral("aten::reshape(Tensor self, int[] shape) -> Tensor") ->schema(); - if (node->matches(view_schema) || node->matches(reshape_schema)) { + static auto reshape_copy_schema = + getOperatorForLiteral( + "prim::reshape_copy(Tensor self, int[] shape) -> Tensor") + ->schema(); + static auto view_schema = + getOperatorForLiteral("aten::view(Tensor self, int[] size) -> Tensor") + ->schema(); + static auto view_copy_schema = + getOperatorForLiteral( + "prim::view_copy(Tensor self, int[] size) -> Tensor") + ->schema(); + if (node->matches(reshape_schema) || node->matches(reshape_copy_schema) || + node->matches(view_schema) || node->matches(view_copy_schema)) { switch (offset) { // argument 1: new tensor size; case 1: - profileSize(pr, node, offset); + profileViewSize(pr, node, offset); + break; + default: + return false; + } + return true; + } + + static auto squeeze_dim_schema = + getOperatorForLiteral( + "prim::squeeze_copy.dim(Tensor self, int dim) -> Tensor") + ->schema(); + static auto unsqueeze_schema = + getOperatorForLiteral( + "prim::unsqueeze_copy(Tensor self, int dim) -> Tensor") + ->schema(); + if (node->matches(squeeze_dim_schema) || node->matches(unsqueeze_schema)) { + switch (offset) { + // argument 1: unsqueeze dim; + case 1: + profileInt(pr, node, offset); break; default: return false; } return true; } - */ static auto batch_norm_impl_index_schema = getOperatorForLiteral( diff --git a/torch/csrc/jit/codegen/cuda/parser.h b/torch/csrc/jit/codegen/cuda/parser.h index 4b2fcf50f99..6d52b325042 100644 --- a/torch/csrc/jit/codegen/cuda/parser.h +++ b/torch/csrc/jit/codegen/cuda/parser.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/partial_split_map.cpp b/torch/csrc/jit/codegen/cuda/partial_split_map.cpp index e7b6db4d165..e320e8ee373 100644 --- a/torch/csrc/jit/codegen/cuda/partial_split_map.cpp +++ b/torch/csrc/jit/codegen/cuda/partial_split_map.cpp @@ -12,7 +12,7 @@ void PartialSplitMap::build(Fusion* fusion) { auto used_vals = ir_utils::allTvs(fusion); for (auto tv : ir_utils::filterByType(used_vals)) { - auto exprs = ExprSort::getExprs( + auto exprs = StmtSort::getExprs( fusion, {tv->domain()->domain().begin(), tv->domain()->domain().end()}); for (auto split : ir_utils::filterByType(exprs)) { // Only needs to check root domains as partial split is only @@ -24,18 +24,10 @@ void PartialSplitMap::build(Fusion* fusion) { continue; } auto root_domain = split->in(); - auto kir_root_domain = - gpu_lower->lowerValue(split->in())->as(); auto start_offset = split->startOffset(); start_offset_map_.insert({root_domain, start_offset}); - kir_start_offset_map_.insert( - {kir_root_domain, - gpu_lower->lowerValue(start_offset)->as()}); auto stop_offset = split->stopOffset(); stop_offset_map_.insert({root_domain, stop_offset}); - kir_stop_offset_map_.insert( - {kir_root_domain, - gpu_lower->lowerValue(stop_offset)->as()}); } } } @@ -49,15 +41,6 @@ Val* PartialSplitMap::getStartOffset(IterDomain* 
root_domain) const { } } -kir::Val* PartialSplitMap::getStartOffset(kir::IterDomain* root_domain) const { - auto it = kir_start_offset_map_.find(root_domain); - if (it == kir_start_offset_map_.end()) { - return nullptr; - } else { - return it->second; - } -} - Val* PartialSplitMap::getStopOffset(IterDomain* root_domain) const { auto it = stop_offset_map_.find(root_domain); if (it == stop_offset_map_.end()) { @@ -67,15 +50,6 @@ Val* PartialSplitMap::getStopOffset(IterDomain* root_domain) const { } } -kir::Val* PartialSplitMap::getStopOffset(kir::IterDomain* root_domain) const { - auto it = kir_stop_offset_map_.find(root_domain); - if (it == kir_stop_offset_map_.end()) { - return nullptr; - } else { - return it->second; - } -} - } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/partial_split_map.h b/torch/csrc/jit/codegen/cuda/partial_split_map.h index be432bd5a16..8ec489915b7 100644 --- a/torch/csrc/jit/codegen/cuda/partial_split_map.h +++ b/torch/csrc/jit/codegen/cuda/partial_split_map.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -20,15 +20,11 @@ class TORCH_CUDA_CU_API PartialSplitMap { void build(Fusion* fusion); Val* getStartOffset(IterDomain* root_domain) const; - kir::Val* getStartOffset(kir::IterDomain* root_domain) const; Val* getStopOffset(IterDomain* root_domain) const; - kir::Val* getStopOffset(kir::IterDomain* root_domain) const; private: std::unordered_map start_offset_map_; - std::unordered_map kir_start_offset_map_; std::unordered_map stop_offset_map_; - std::unordered_map kir_stop_offset_map_; }; } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/partition.cpp b/torch/csrc/jit/codegen/cuda/partition.cpp index 004c836ec4e..91d68494fd4 100644 --- a/torch/csrc/jit/codegen/cuda/partition.cpp +++ b/torch/csrc/jit/codegen/cuda/partition.cpp @@ -5,12 +5,15 @@ #include #include #include +#include namespace torch { namespace jit { namespace fuser { namespace cuda { +const c10::DeviceIndex INVALID_INDEX = -2; + namespace { bool hasNonElementWiseOperation(const Node* node) { @@ -38,26 +41,61 @@ static c10::optional getDevice(const Value* value) { // not tensor type, return false as the op is not outputing scalar. return c10::nullopt; } - return value->type()->expectRef().device(); + auto tensor_type = value->type()->expectRef(); + // special case for scalar tensor: return c10::nullopt instead of cpu device. + // this allows us to fuse scalar cpu tensor with cuda tensor, while avoid + // merging ops with pure scalar cpu tensors. 
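// Standalone sketch of the per-value device lookup described by the comment
// above: values that are not tensors, and zero-dimensional CPU tensors (CPU
// scalars), report no device at all, so a CPU scalar can ride along in a CUDA
// fusion without pulling the whole group onto the CPU. TensorDesc is a
// hypothetical stand-in for the profiled TensorType, not a real API.
#include <optional>
#include <string>

struct TensorDesc {
  bool is_tensor = false;
  int ndims = 0;
  std::string device;  // e.g. "cpu" or "cuda:0"
};

std::optional<std::string> deviceOf(const TensorDesc& value) {
  if (!value.is_tensor) {
    return std::nullopt;  // scalars never pin the fusion to a device
  }
  if (value.device == "cpu" && value.ndims == 0) {
    return std::nullopt;  // CPU scalar tensor: treated as device-free
  }
  return value.device;
}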
+ if (is_cpu_scalar(tensor_type)) { + return c10::nullopt; + } + return tensor_type.device(); } static c10::optional getDevice(const Node* node) { - auto outputs = node->outputs(); - for (auto output : outputs) { - auto device = getDevice(output); + c10::optional ret = c10::nullopt; + auto merge_devices = [&ret](const c10::optional& device) { if (device.has_value()) { - return device; + if (ret.has_value()) { + if (ret.value() != device.value()) { + // invalidate device to reflect conflicts + ret->set_index(INVALID_INDEX); + // return false to indicate early termination + return false; + } else { + // same device, do nothing + return true; + } + } else { + // initialize return device + ret = device.value(); + return true; + } + } + // no device information, do nothing + return true; + }; + for (auto val : node->inputs()) { + if (!merge_devices(getDevice(val))) { + return ret; } } - return c10::nullopt; + for (auto val : node->outputs()) { + if (!merge_devices(getDevice(val))) { + return ret; + } + } + return ret; } static bool isFusibleDevice(const Node* node, const c10::Device device) { - for (auto value : node->outputs()) { - auto output_device = getDevice(value); - if (output_device.has_value() && output_device.value() != device) { - return false; - } + TORCH_INTERNAL_ASSERT( + device.index() != INVALID_INDEX, "fusible device needs to be validate"); + auto opt_device = getDevice(node); + // we can be more relaxed here as we known that this function tries to merge + // node into an existing `device` + if (opt_device.has_value() && + (opt_device->index() == INVALID_INDEX || opt_device != device)) { + return false; } return true; } @@ -65,10 +103,12 @@ static bool isFusibleDevice(const Node* node, const c10::Device device) { // TODO: we need to check input type when we handle `to()` static bool isFusibleDevice(const Node* node) { auto device = getDevice(node); + // be conservative and only fuse cuda operations, this avoids us initializing + // operations that produces cpu scalar outputs if (!device.has_value()) { - return true; + return false; } - return device->is_cuda() && + return device->index() != INVALID_INDEX && device->is_cuda() && (at::cuda::getDeviceProperties(device->index())->major >= 7 || !hasNonElementWiseOperation(node)); } @@ -400,7 +440,7 @@ bool isFusibleCudaFusionGroup(const Node* fusion, const Node* node) { bool fused = false; // TODO: lift the restriction of not fusing producer containing reduction when // we have proper scheduling. - if (isFusibleCudaFusionGroup(node)) { + if (isFusibleNode(node)) { // ensure if the node has a designated device, it's on the same device with // fusion. 
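// Standalone sketch of the two device rules introduced above: (1) a node's
// device is the merge of all input/output devices, with a sentinel index
// marking a conflict, and (2) fusion is only attempted for a known,
// conflict-free CUDA device, and only element-wise work is accepted on
// pre-Volta GPUs. Devices are reduced to optional integer indices, with -2
// standing in for INVALID_INDEX, purely for illustration.
#include <optional>
#include <vector>

constexpr int kInvalidIndex = -2;

std::optional<int> mergeDevices(const std::vector<std::optional<int>>& devices) {
  std::optional<int> merged;
  for (const auto& d : devices) {
    if (!d.has_value()) {
      continue;               // values without device info are ignored
    }
    if (!merged.has_value()) {
      merged = d;             // first device seen initializes the result
    } else if (*merged != *d) {
      merged = kInvalidIndex; // disagreement: mark the node as conflicted
      break;
    }
  }
  return merged;
}

bool isFusibleOnDevice(const std::optional<int>& device, bool is_cuda,
                       int sm_major, bool has_non_elementwise_op) {
  if (!device.has_value() || *device == kInvalidIndex || !is_cuda) {
    return false;             // unknown, conflicting, or non-CUDA device
  }
  return sm_major >= 7 || !has_non_elementwise_op;
}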
// TODO: is there a danger of us fusing operations that's supposed to be on @@ -408,7 +448,6 @@ bool isFusibleCudaFusionGroup(const Node* fusion, const Node* node) { auto device = getDevice(fusion); fused = (!device.has_value() || isFusibleDevice(node, device.value())); } - return fused; } diff --git a/torch/csrc/jit/codegen/cuda/partition.h b/torch/csrc/jit/codegen/cuda/partition.h index 0d8baca4700..b295cb582e5 100644 --- a/torch/csrc/jit/codegen/cuda/partition.h +++ b/torch/csrc/jit/codegen/cuda/partition.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include /* diff --git a/torch/csrc/jit/codegen/cuda/predicate_compute.cpp b/torch/csrc/jit/codegen/cuda/predicate_compute.cpp index b501a6133f6..6575b374423 100644 --- a/torch/csrc/jit/codegen/cuda/predicate_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/predicate_compute.cpp @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include @@ -20,27 +18,23 @@ namespace cuda { namespace { -bool isTensorIndexOp(kir::Expr* expr) { +bool isTensorIndexOp(Expr* expr) { const auto& outputs = expr->outputs(); return outputs.size() >= 1 && outputs[0]->isA(); } -bool isOutputLocal(const kir::Expr* expr) { +bool isOutputLocal(const Expr* expr) { return std::all_of( - expr->outputs().begin(), - expr->outputs().end(), - [](const kir::Val* output) { - return !output->isA() || - output->as()->memoryType() == MemoryType::Local; + expr->outputs().begin(), expr->outputs().end(), [](const Val* output) { + return !output->isA() || + output->as()->getMemoryType() == MemoryType::Local; }); } } // namespace -bool ParallelizedDomainPredicate::PredicateInfo::addDomain( - kir::IterDomain* id) { - const auto gpu_lower = GpuLower::current(); - auto concrete_id = gpu_lower->caIndexMap().getConcreteMappedID(id); +bool ParallelizedDomainPredicate::PredicateInfo::addDomain(IterDomain* id) { + auto concrete_id = GpuLower::current()->caIndexMap().getConcreteMappedID(id); if (std::find(ids_.begin(), ids_.end(), concrete_id) == ids_.end()) { ids_.push_back(concrete_id); return true; @@ -49,21 +43,19 @@ bool ParallelizedDomainPredicate::PredicateInfo::addDomain( } } -kir::Bool* ParallelizedDomainPredicate::PredicateInfo::getPredicate() const { - const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); +Bool* ParallelizedDomainPredicate::PredicateInfo::getPredicate() const { + Bool* pred = nullptr; - kir::Bool* pred = nullptr; - - auto index = - ir_builder.create(stringifyThread(pt_), DataType::Int); + auto index = SimplifyingIrBuilder::create( + stringifyThread(pt_), DataType::Int); for (const auto& pred_id : ids()) { // Just sanity check that pred_id is concrete TORCH_INTERNAL_ASSERT( - pred_id == gpu_lower->caIndexMap().getConcreteMappedID(pred_id)); - auto new_pred = ir_builder.ltExpr(index, pred_id->extent()); - pred = ir_builder.andExpr(pred, new_pred)->as(); + pred_id == + GpuLower::current()->caIndexMap().getConcreteMappedID(pred_id)); + auto new_pred = SimplifyingIrBuilder::ltExpr(index, pred_id->extent()); + pred = SimplifyingIrBuilder::andExpr(pred, new_pred)->as(); } return pred; @@ -74,16 +66,12 @@ namespace { std::unordered_set getNonUnswitchedRootDomains( const std::vector& loops, size_t unswitched_loop_index) { - const auto gpu_lower = GpuLower::current(); - std::vector non_unswited_leaf_domains; std::transform( loops.begin(), loops.begin() + unswitched_loop_index, std::back_inserter(non_unswited_leaf_domains), - [&](kir::ForLoop* loop) { - return 
gpu_lower->caIndexMap().toFusion(loop->iter_domain()); - }); + [&](kir::ForLoop* loop) { return loop->iter_domain(); }); auto non_unswitched_inputs = IterVisitor::getInputsTo(non_unswited_leaf_domains); @@ -100,26 +88,23 @@ std::unordered_set getNonUnswitchedRootDomains( non_unswitched_concrete_root_domains, non_unswitched_concrete_root_domains.end()), [&](auto root_dom) { - return gpu_lower->caIndexMap().getConcreteMappedID(root_dom); + return GpuLower::current()->caIndexMap().getConcreteMappedID(root_dom); }); return non_unswitched_concrete_root_domains; } bool isFullyUnswitched( - kir::IterDomain* loop_id, + IterDomain* loop_id, const std::unordered_set& non_unswitched_root_domains) { - const auto gpu_lower = GpuLower::current(); - - auto root_vals = - IterVisitor::getInputsTo({gpu_lower->caIndexMap().toFusion(loop_id)}); + auto root_vals = IterVisitor::getInputsTo({loop_id}); auto root_domains = ir_utils::filterByType(root_vals); return std::none_of( root_domains.begin(), root_domains.end(), [&](auto root_dom) { auto concrete_root_dom = - gpu_lower->caIndexMap().getConcreteMappedID(root_dom); + GpuLower::current()->caIndexMap().getConcreteMappedID(root_dom); return non_unswitched_root_domains.count(concrete_root_dom) > 0; }); } @@ -131,12 +116,10 @@ std::unordered_map< ParallelizedDomainPredicate::PredicateInfo, TypeHash> ParallelizedDomainPredicate::getPredicateMap( - const kir::Expr* expr, + const Expr* expr, const std::vector& loops, kir::ForLoop* unswitched_loop) { const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - auto output_tvs = ir_utils::getTvs(expr->outputs()); if (output_tvs.empty()) { @@ -167,7 +150,7 @@ ParallelizedDomainPredicate::getPredicateMap( } auto loop_id = loop->iter_domain(); - auto loop_ptype = loop_id->parallelType(); + auto loop_ptype = loop_id->getParallelType(); // Not necessary to add a predicate if the paralle type is exact if (!isParallelTypeThread(loop_ptype) || @@ -193,7 +176,7 @@ ParallelizedDomainPredicate::getPredicateMap( continue; } - kir::IterDomain* tv_id = *it; + IterDomain* tv_id = *it; // If the corresponding domain is a broadcast, it's not really used. if (tv_id->isBroadcast()) { @@ -203,9 +186,9 @@ ParallelizedDomainPredicate::getPredicateMap( // If it's a root domain, it should be covered by the root // predicates, so no extra predicate is required. 
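// Standalone sketch of the guard the collected PredicateInfo entries
// ultimately produce: for a given thread index, each tracked non-exact
// concrete extent contributes an "index < extent" term and the terms are
// AND-ed together, starting from true. Extents are plain integers instead of
// IR Vals, purely for illustration; exact parallel types, broadcast domains,
// and root domains never reach this list, as the checks above show.
#include <vector>

bool parallelizedDomainGuard(long thread_index,
                             const std::vector<long>& non_exact_extents) {
  bool guard = true;  // mirrors starting from the kernel's trueVal()
  for (long extent : non_exact_extents) {
    guard = guard && (thread_index < extent);
  }
  return guard;
}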
if (std::find( - tv->domain()->rootDomain().begin(), - tv->domain()->rootDomain().end(), - tv_id) != tv->domain()->rootDomain().end()) { + tv->domain()->getRootDomain().begin(), + tv->domain()->getRootDomain().end(), + tv_id) != tv->domain()->getRootDomain().end()) { continue; } @@ -218,26 +201,24 @@ ParallelizedDomainPredicate::getPredicateMap( return map; } -kir::Bool* ParallelizedDomainPredicate::getPredicate( - const kir::Expr* expr, +Bool* ParallelizedDomainPredicate::getPredicate( + const Expr* expr, const std::vector& loops) { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - auto pred_map = getPredicateMap(expr, loops); - kir::Val* pred = ir_builder.trueVal(); + Val* pred = GpuLower::current()->kernel()->trueVal(); for (auto pt : kParallelTypeThreads) { auto pred_info_it = pred_map.find(pt); if (pred_info_it != pred_map.end()) { const auto& pred_info = pred_info_it->second; auto tid_pred = pred_info.getPredicate(); - pred = ir_builder.andExpr(pred, tid_pred); + pred = SimplifyingIrBuilder::andExpr(pred, tid_pred); } } if (pred) { - return pred->as(); + return pred->as(); } else { return nullptr; } @@ -256,61 +237,55 @@ UnswitchPredicateKey::UnswitchPredicateKey() // concrete domains are used to uniquely collect all necessary // unswitch predicates. UnswitchPredicateKey::UnswitchPredicateKey( - IterDomain* predicated_concrete_id, - const ReferenceTensor& reference) + IterDomain* predicated_consumer_id, + TensorView* consumer_tv, + IterDomain* predicated_concrete_id) : predicated_concrete_id_(predicated_concrete_id) { // Initialize the parallelized domain map for (auto pt : kParallelTypeThreads) { parallel_concrete_ids_.insert({pt, nullptr}); } - // The id parameter is a concrete domain. Needs to find the - // corresponding reference domain to find leaf domains that are - // parallelized. 
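// Standalone sketch of what an unswitch predicate key distinguishes after the
// change above: the same concrete root domain can need separate predicates
// when, for different expressions, different concrete leaf domains (one per
// thread parallel type) are derived from it. Ids are reduced to integers and
// parallel types to a small enum purely for illustration.
#include <array>
#include <cstddef>
#include <functional>

enum class PType { TIDx, TIDy, TIDz, BIDx, BIDy, BIDz, Count };

struct UnswitchKeySketch {
  int predicated_concrete_id = -1;
  // -1 means "no parallelized leaf of this type derives from the id".
  std::array<int, static_cast<size_t>(PType::Count)> parallel_concrete_ids{
      -1, -1, -1, -1, -1, -1};

  bool operator==(const UnswitchKeySketch& other) const {
    return predicated_concrete_id == other.predicated_concrete_id &&
        parallel_concrete_ids == other.parallel_concrete_ids;
  }
};

struct UnswitchKeySketchHash {
  size_t operator()(const UnswitchKeySketch& key) const {
    size_t h = std::hash<int>()(key.predicated_concrete_id);
    for (int id : key.parallel_concrete_ids) {
      h = h * 31 + std::hash<int>()(id);  // simple combine over all entries
    }
    return h;
  }
};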
- IterDomain* predicated_ref_id = - reference.concrete_to_id.at(predicated_concrete_id_); - TensorDomain* ref_td = reference.domain; - - std::vector all_parallelized_ref_leaf_ids; + std::vector all_parallelized_consumer_leaf_ids; std::copy_if( - ref_td->domain().begin(), - ref_td->domain().end(), - std::back_inserter(all_parallelized_ref_leaf_ids), + consumer_tv->domain()->domain().begin(), + consumer_tv->domain()->domain().end(), + std::back_inserter(all_parallelized_consumer_leaf_ids), [](IterDomain* x) { return isParallelTypeThread(x->getParallelType()); }); - // If the reference is not parallelized at all, no need to + // If the consumer domais are not parallelized at all, no need to // differentiate keys based on how the predicated id is parallelized - if (all_parallelized_ref_leaf_ids.empty()) { + if (all_parallelized_consumer_leaf_ids.empty()) { return; } - // All domains that are parallelized descendants of predicated_ref_id - auto all_parallelized_ref_ids = DependencyCheck::getAllValsBetween( - {predicated_ref_id}, all_parallelized_ref_leaf_ids); + // All domains that are parallelized descendants of predicated_consumer_id + auto all_parallelized_consumer_ids = DependencyCheck::getAllValsBetween( + {predicated_consumer_id}, all_parallelized_consumer_leaf_ids); // Just pick leaf domains - std::vector parallelized_ref_leaf_ids; + std::vector parallelized_consumer_leaf_ids; std::copy_if( - ref_td->domain().begin(), - ref_td->domain().end(), - std::back_inserter(parallelized_ref_leaf_ids), + consumer_tv->domain()->domain().begin(), + consumer_tv->domain()->domain().end(), + std::back_inserter(parallelized_consumer_leaf_ids), [&](IterDomain* x) { return std::find( - all_parallelized_ref_ids.begin(), - all_parallelized_ref_ids.end(), - x) != all_parallelized_ref_ids.end(); + all_parallelized_consumer_ids.begin(), + all_parallelized_consumer_ids.end(), + x) != all_parallelized_consumer_ids.end(); }); - if (parallelized_ref_leaf_ids.empty()) { - // None of the parallelized leaf domains are derived from predicated_ref_id + if (parallelized_consumer_leaf_ids.empty()) { + // None of the parallelized leaf domains are derived from + // predicated_consumer_id return; } // Find the corresponding concrete id for each parallel type - for (auto ref_leaf : parallelized_ref_leaf_ids) { - auto pt = ref_leaf->getParallelType(); - auto it = reference.id_to_concrete.find(ref_leaf); - TORCH_INTERNAL_ASSERT(it != reference.id_to_concrete.end()); - auto concrete_leaf = it->second; + for (auto consumer_leaf : parallelized_consumer_leaf_ids) { + auto pt = consumer_leaf->getParallelType(); + auto concrete_leaf = + GpuLower::current()->caIndexMap().getConcreteMappedID(consumer_leaf); parallel_concrete_ids_.at(pt) = concrete_leaf; } } @@ -344,19 +319,18 @@ std::size_t UnswitchPredicateKeyHash::operator()( return h; }; -kir::Bool* PredicateCompute::getInlinePredicate( - const kir::Expr* expr, +Bool* PredicateCompute::getInlinePredicate( + const Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, PredicateType pred_type) { FUSER_PERF_SCOPE("GpuLower::Lower::getInlinePredicate"); const auto gpu_lower = GpuLower::current(); - kir::SimplifyingIrBuilder ir_builder(gpu_lower->kernel()); // If outputs are registers, no need to predicate for threads if (isOutputLocal(expr)) { - thread_pred = ir_builder.trueVal(); + thread_pred = gpu_lower->kernel()->trueVal(); } if (loops.empty()) { @@ -364,8 +338,8 @@ kir::Bool* PredicateCompute::getInlinePredicate( return thread_pred; } - auto out_tv = 
ir_utils::getTVOutput(expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing kir::TensorView output"); + auto out_tv = ir_utils::getTvOutput(expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output"); if (gpu_lower->predicateElimination().canOmitPredicate(expr)) { return thread_pred; @@ -376,7 +350,7 @@ kir::Bool* PredicateCompute::getInlinePredicate( out_tv, loops, nullptr, pred_type == PredicateType::Padding) .first; - std::vector preds; + std::vector preds; // When pred_type is ReductionWrite, filter out predicates for // reduction axes. For blockReduce, this is necessary when reduction @@ -388,7 +362,7 @@ kir::Bool* PredicateCompute::getInlinePredicate( bool non_zero_start_found = false; for (const auto& pred_info : pred_info_vec) { if (pred_type == PredicateType::ReductionWrite) { - const auto& consumer_ids = pred_info.consumerIds(); + const auto& consumer_ids = pred_info.rootIds(); bool pred_for_reduction_axis = false; for (auto consumer_id : consumer_ids) { if (consumer_id->isReduction()) { @@ -404,21 +378,15 @@ kir::Bool* PredicateCompute::getInlinePredicate( continue; } } - for (auto pred : pred_info.startPredicates()) { - TORCH_INTERNAL_ASSERT(pred != nullptr); - preds.push_back(pred); - } - for (auto pred : pred_info.stopPredicates()) { - TORCH_INTERNAL_ASSERT(pred != nullptr); - preds.push_back(pred); - } + preds.push_back(pred_info.startPredicate()); + preds.push_back(pred_info.stopPredicate()); } // When generating a predicate for blockReduce writes and not for // gridReduce, if all reduction axes start with zero, we can just // use the same predicate for reads. nullptr is returned then. if (pred_type == PredicateType::ReductionWrite && !non_zero_start_found && - !out_tv->fuserTv()->domain()->hasGridReduction()) { + !out_tv->domain()->hasGridReduction()) { return nullptr; } @@ -433,35 +401,33 @@ kir::Bool* PredicateCompute::getInlinePredicate( } if (preds.empty()) { - return ir_builder.trueVal(); + return GpuLower::current()->kernel()->trueVal(); } - kir::Val* cond = preds[0]; + Val* cond = preds[0]; for (const auto i : c10::irange(1, preds.size())) { - cond = ir_builder.andExpr(cond, preds[i]); + cond = SimplifyingIrBuilder::andExpr(cond, preds[i]); } - return cond->as(); + return cond->as(); } -kir::Bool* UnswitchPredicate::get( +Bool* UnswitchPredicate::get( const std::vector& outer_loops, kir::ForLoop* unrolled_loop) { FUSER_PERF_SCOPE("GpuLower::Lower::UnswitchPredicate::get"); - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); - UnswitchPredicate up(outer_loops, unrolled_loop); - kir::Val* unswitch_pred = ir_builder.trueVal(); + Val* unswitch_pred = GpuLower::current()->kernel()->trueVal(); for (auto pred : up.predicates_) { - unswitch_pred = ir_builder.andExpr(unswitch_pred, pred); + unswitch_pred = SimplifyingIrBuilder::andExpr(unswitch_pred, pred); } - return unswitch_pred->as(); + return unswitch_pred->as(); } -void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { +void UnswitchPredicate::predicateOn(Expr* tv_expr) { FUSER_PERF_SCOPE("GpuLower::Lower::UnswitchPredicate::predicateOn"); if (for_loops_.empty()) { @@ -469,14 +435,12 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { } const auto gpu_lower = GpuLower::current(); - kir::IrBuilder ir_builder(gpu_lower->kernel()); - if (gpu_lower->predicateElimination().canOmitPredicate(tv_expr)) { return; } - auto out_tv = ir_utils::getTVOutput(tv_expr); - TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing kir::TensorView output"); + auto out_tv = 
ir_utils::getTvOutput(tv_expr); + TORCH_INTERNAL_ASSERT(out_tv != nullptr, "Missing TensorView output"); auto ref_pred_info = Index::getReferenceRootPredicates( out_tv, for_loops_, unrolled_loop_, false); @@ -491,10 +455,8 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { // predicates are generated in the finalize function. for (const auto& pred_info : ref_pred_info.first) { - if (pred_info.startPredicates().empty() && - pred_info.stopPredicates().empty()) { - continue; - } + TORCH_INTERNAL_ASSERT(pred_info.startPredicate() != nullptr); + TORCH_INTERNAL_ASSERT(pred_info.stopPredicate() != nullptr); const auto& root_ids = pred_info.rootIds(); @@ -505,13 +467,14 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { bool first_key_set = false; for (auto root_id : root_ids) { - auto kir_root_id = gpu_lower->lowerValue(root_id)->as(); + auto concrete_root_id = + gpu_lower->caIndexMap().getConcreteMappedID(root_id); - if (kir_root_id->isBroadcast()) { + if (root_id->isBroadcast()) { continue; } - UnswitchPredicateKey key(root_id, reference); + UnswitchPredicateKey key(root_id, out_tv, concrete_root_id); auto inserted = predicated_keys_.insert(key).second; add_pred = add_pred || inserted; @@ -573,14 +536,14 @@ void UnswitchPredicate::predicateOn(kir::Expr* tv_expr) { // start and stop offsets. if (merged_pred_it != pending_predicates_.end()) { mergeUnswitchPredicateOffsets( - pred_info.startPredicates(), - pred_info.startOffsets(), + pred_info.startPredicate(), + pred_info.startOffset(), merged_pred_it->start, true); mergeUnswitchPredicateOffsets( - pred_info.stopPredicates(), - pred_info.stopOffsets(), + pred_info.stopPredicate(), + pred_info.stopOffset(), merged_pred_it->stop, false); } @@ -613,7 +576,7 @@ void UnswitchPredicate::openLoop(kir::ForLoop* fl) { for_loops_.push_back(fl); for (auto expr : fl->body().exprs()) { - if (ir_utils::isTVOp(expr) || isTensorIndexOp(expr)) { + if (ir_utils::isTvOp(expr) || isTensorIndexOp(expr)) { predicateOn(expr); } else if (auto ite = dynamic_cast(expr)) { openIte(ite); @@ -630,7 +593,7 @@ void UnswitchPredicate::openIte(kir::IfThenElse* ite) { // only expand the ite thenBody for (auto expr : ite->thenBody().exprs()) { - if (ir_utils::isTVOp(expr) || isTensorIndexOp(expr)) { + if (ir_utils::isTvOp(expr) || isTensorIndexOp(expr)) { predicateOn(expr); } else if (auto ite = dynamic_cast(expr)) { openIte(ite); @@ -641,7 +604,6 @@ void UnswitchPredicate::openIte(kir::IfThenElse* ite) { } void UnswitchPredicate::finalize() { - kir::SimplifyingIrBuilder ir_builder(GpuLower::current()->kernel()); for (const auto& merged_pred : pending_predicates_) { const auto& start_info = merged_pred.start; if (start_info.static_pred) { @@ -661,12 +623,10 @@ void UnswitchPredicate::finalize() { } void UnswitchPredicate::mergeUnswitchPredicateOffsets( - const std::vector& predicates, - const std::vector& offsets, + Bool* predicate, + Val* offset, MergedPredicates::Info& merged_predicate_info, bool is_start) { - TORCH_INTERNAL_ASSERT(predicates.size() == offsets.size()); - auto is_more_restrictive = [&is_start](int64_t new_val, int64_t current_val) { if (is_start) { return new_val < current_val; @@ -675,25 +635,21 @@ void UnswitchPredicate::mergeUnswitchPredicateOffsets( } }; - for (const auto i : c10::irange(predicates.size())) { - auto pred = predicates.at(i); - auto offset = offsets.at(i); - auto offset_int = dynamic_cast(offset); - // If it's a static predicate, replace the current one if it's - // more restrictive. 
If it's dynamic, just adds it to the dynamic - // predicate list. - if (offset_int && offset_int->isConst()) { - auto offset_const = offset_int->value().value(); - auto& static_pred = merged_predicate_info.static_pred; - auto& static_offset = merged_predicate_info.static_offset; - if (static_pred == nullptr || - is_more_restrictive(offset_const, static_offset)) { - static_pred = pred; - static_offset = offset_const; - } - } else { - merged_predicate_info.dynamic_preds.push_back(pred); + auto offset_int = dynamic_cast(offset); + // If it's a static predicate, replace the current one if it's + // more restrictive. If it's dynamic, just adds it to the dynamic + // predicate list. + if (offset_int && offset_int->isConst()) { + auto offset_const = offset_int->value().value(); + auto& static_pred = merged_predicate_info.static_pred; + auto& static_offset = merged_predicate_info.static_offset; + if (static_pred == nullptr || + is_more_restrictive(offset_const, static_offset)) { + static_pred = predicate; + static_offset = offset_const; } + } else { + merged_predicate_info.dynamic_preds.push_back(predicate); } } diff --git a/torch/csrc/jit/codegen/cuda/predicate_compute.h b/torch/csrc/jit/codegen/cuda/predicate_compute.h index 989bffb3bd1..c6412671e43 100644 --- a/torch/csrc/jit/codegen/cuda/predicate_compute.h +++ b/torch/csrc/jit/codegen/cuda/predicate_compute.h @@ -16,10 +16,10 @@ class PredicateCompute { // ignore_internal_syncthread_ops will prevent creation of predicates on // block/grid broadcast/reduce as these have syncthread calls within them // so all threads need to execute the function. - static kir::Bool* getInlinePredicate( - const kir::Expr* expr, + static Bool* getInlinePredicate( + const Expr* expr, const std::vector& loops, - kir::Bool* thread_pred, + Bool* thread_pred, PredicateType pred_type); }; @@ -40,31 +40,31 @@ class ParallelizedDomainPredicate { explicit PredicateInfo(ParallelType pt) : pt_(pt) {} //! Adds a domain that is parallized by the same paralell type - bool addDomain(kir::IterDomain* id); + bool addDomain(IterDomain* id); - const std::vector& ids() const { + const std::vector& ids() const { return ids_; } //! Generates a predicate Val from predicate information - kir::Bool* getPredicate() const; + Bool* getPredicate() const; private: ParallelType pt_; //! Domains parallelized by the same parallel type - std::vector ids_; + std::vector ids_; }; //! Returns a predicate Val for parallelied domains of an expression. - static kir::Bool* getPredicate( - const kir::Expr* expr, + static Bool* getPredicate( + const Expr* expr, const std::vector& loops); //! Returns predicate information for parallelied domains of an //! expression. static std::unordered_map getPredicateMap( - const kir::Expr* expr, + const Expr* expr, const std::vector& loops, kir::ForLoop* unswitched_loop = nullptr); }; @@ -80,8 +80,9 @@ class UnswitchPredicateKey { UnswitchPredicateKey(); UnswitchPredicateKey( - IterDomain* predicated_concrete_id, - const ReferenceTensor& reference); + IterDomain* predicated_consumer_id, + TensorView* consumer_tv, + IterDomain* predicated_concrete_id); bool operator==(const UnswitchPredicateKey& other) const { return predicated_concrete_id_ == other.predicated_concrete_id_ && @@ -121,7 +122,7 @@ struct UnswitchPredicateKeyHash { class TORCH_CUDA_CU_API UnswitchPredicate { public: - static kir::Bool* get( + static Bool* get( const std::vector& outer_loops, kir::ForLoop* unrolled_loop); @@ -132,11 +133,11 @@ class TORCH_CUDA_CU_API UnswitchPredicate { struct Info { //! 
Most restrictive static predicate. Nullptr if no static //! predicate found. - kir::Bool* static_pred = nullptr; + Bool* static_pred = nullptr; //! The offset value of static_pred int64_t static_offset = 0; //! List of dynamic predicates. - std::vector dynamic_preds; + std::vector dynamic_preds; }; UnswitchPredicateKey predicate_key; Info start; @@ -147,7 +148,7 @@ class TORCH_CUDA_CU_API UnswitchPredicate { std::vector outer_loops, kir::ForLoop* unrolled_loop); - void predicateOn(kir::Expr*); + void predicateOn(Expr*); void openLoop(kir::ForLoop*); @@ -160,8 +161,8 @@ class TORCH_CUDA_CU_API UnswitchPredicate { //! static, only pick the most restrictive one, e.g., the one with the //! minimum offset for the start predication. void mergeUnswitchPredicateOffsets( - const std::vector& predicates, - const std::vector& offsets, + Bool* predicate, + Val* offset, MergedPredicates::Info& merged_predicate_info, bool is_start); @@ -181,7 +182,7 @@ class TORCH_CUDA_CU_API UnswitchPredicate { parallelized_dom_predicates_; //! The predicates that have been generated. - std::vector predicates_; + std::vector predicates_; std::vector for_loops_; diff --git a/torch/csrc/jit/codegen/cuda/reference_tensor.h b/torch/csrc/jit/codegen/cuda/reference_tensor.h index 2220831dc09..07c83bb6ed7 100644 --- a/torch/csrc/jit/codegen/cuda/reference_tensor.h +++ b/torch/csrc/jit/codegen/cuda/reference_tensor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include diff --git a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp index ddb92371baa..b48c6b00b3a 100644 --- a/torch/csrc/jit/codegen/cuda/root_domain_map.cpp +++ b/torch/csrc/jit/codegen/cuda/root_domain_map.cpp @@ -196,7 +196,7 @@ UnmappableReductionDomains::UnmappableReductionDomains() { namespace { -//! Find all domains that a given domain is depeendent on +//! Find all domains that a given domain is dependent on class FindInputDomains : BackwardVisitor { private: FindInputDomains(TensorView* tv, const IterDomain* id) @@ -661,6 +661,58 @@ void ComputeAtRootDomainMapBuilder::setMapped( root_map_.eq_set_.join(producer, consumer); } +void ComputeAtRootDomainMapBuilder::setInvalid( + const DomainKey& key1, + const DomainKey& key2) { + invalid_mappings_.emplace_back(key1, key2); +} + +bool ComputeAtRootDomainMapBuilder::isInvalid( + const std::vector& domains) const { + // First, collect all invalid mappings for each of the keys in domains + DomainKeyMap invalid_key_map; + for (const auto& key : domains) { + DomainKeySet invalid_keys; + for (const auto& invalid_pair : invalid_mappings_) { + if (root_map_.canMap(key, invalid_pair.first)) { + invalid_keys.insert(invalid_pair.second); + } else if (root_map_.canMap(key, invalid_pair.second)) { + invalid_keys.insert(invalid_pair.first); + } + } + invalid_key_map.emplace(key, invalid_keys); + } + + // Next, check if any pair is invalid to map. + const auto num_keys = domains.size(); + for (const auto i : c10::irange(num_keys)) { + const auto& key_i = domains[i]; + // If no invalid keys found for key_i, it can be skipped. + const auto invalid_key_map_it = invalid_key_map.find(key_i); + if (invalid_key_map_it == invalid_key_map.end()) { + continue; + } + + // Set of keys that are invalid to be mapped with key_i. + const DomainKeySet& invalid_keys_for_i = invalid_key_map_it->second; + + // If any other key in domains is identified mappable with any of + // the keys in this set, the mapping with key_i is invalid. 
+ for (const auto j : c10::irange(i + 1, num_keys)) { + const auto& key_j = domains[j]; + if (std::any_of( + invalid_keys_for_i.begin(), + invalid_keys_for_i.end(), + [&](const auto& invalid_key_for_i) { + return root_map_.canMap(key_j, invalid_key_for_i); + })) { + return true; + } + } + } + return false; +} + void ComputeAtRootDomainMapBuilder::setMaybeMapped( const TensorDomain* producer_td, const IterDomain* producer_id, @@ -853,9 +905,11 @@ bool ComputeAtRootDomainMapBuilder::mapAllConsumers( // All entries in key_set must be equivalent with each other. TORCH_INTERNAL_ASSERT(consumer_set.size() > 0); bool consistent = safeToMap(consumer_set); - if (consistent) { - for (const auto pending_consumer : consumer_set) { + for (const auto pending_consumer : consumer_set) { + if (consistent) { setMapped(producer_key, pending_consumer); + } else { + setInvalid(producer_key, pending_consumer); } } // This entry should never be used again, so remove it. @@ -931,6 +985,10 @@ bool ComputeAtRootDomainMapBuilder::safeToMap(const DomainKeySet& domains) { !map_through_reduction_) { return false; } + // Make sure mapping these domains won't cause any invalid mapping + if (isInvalid(unique_domains)) { + return false; + } return true; } diff --git a/torch/csrc/jit/codegen/cuda/root_domain_map.h b/torch/csrc/jit/codegen/cuda/root_domain_map.h index 23ada0fb120..5156dc604f1 100644 --- a/torch/csrc/jit/codegen/cuda/root_domain_map.h +++ b/torch/csrc/jit/codegen/cuda/root_domain_map.h @@ -5,7 +5,7 @@ #include #include -#include +#include namespace torch { namespace jit { @@ -110,7 +110,7 @@ class TORCH_CUDA_CU_API PairwiseRootDomainMap : public RootDomainMap { const TensorView* consumer_tv_ = nullptr; }; -std::string toString(const PairwiseRootDomainMap& root_map); +TORCH_CUDA_CU_API std::string toString(const PairwiseRootDomainMap& root_map); //! Represents an iteration domain of a TensorDomain. Only used for //! root domain mapping. @@ -206,7 +206,7 @@ class TORCH_CUDA_CU_API UnmappableReductionDomains : private IterVisitor { //! This will create mappings between i0, i2 and i4. class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { friend class ComputeAtRootDomainMapBuilder; - friend std::string toString(const ComputeAtRootDomainMap&); + friend TORCH_CUDA_CU_API std::string toString(const ComputeAtRootDomainMap&); public: //! Builds a mapping table by analyzing the current @@ -327,7 +327,7 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMap : public RootDomainMap { std::unordered_set window_axes_; }; -std::string toString(const ComputeAtRootDomainMap& root_map); +TORCH_CUDA_CU_API std::string toString(const ComputeAtRootDomainMap& root_map); //! Create a DisjointSet of root IterDomains by traversing the //! current fusion entirely. IterDomains that can be mapped each @@ -347,6 +347,12 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder //! Set a pair of producer-consumer domain keys as mappable void setMapped(const DomainKey& producer, const DomainKey& consumer); + //! Records two domains are invalid to map + void setInvalid(const DomainKey& key1, const DomainKey& key2); + + //! Check if no pair of domains is invalid to map + bool isInvalid(const std::vector& domains) const; + //! Track a pair of producer-consumer domains as potentially mappable. Inserts //! entries into pending_map_, but does not add anything into the root_map_ //! (added when handle is called on a TensorView). 
Maybe mapped will, however, @@ -415,10 +421,13 @@ class TORCH_CUDA_CU_API ComputeAtRootDomainMapBuilder private: ComputeAtRootDomainMap& root_map_; - //! Keep track of what we want to try and map. Set in attemptToProveId. + //! Keep track of what we want to try and map DomainKeyMap pending_map_; std::unordered_set visited_; + //! Helper class to find invalid mappings due to reductions UnmappableReductionDomains incompatible_domains_; + //! Running vector of domain pairs that are invalid to map + std::vector> invalid_mappings_; //! Disable UnmappableReductions check, should //! always be false for compute_at use cases diff --git a/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu b/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu index ed366132689..fcbc98e7818 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/block_sync_atomic.cu @@ -41,10 +41,8 @@ __device__ void sync() { // threads have incremented the counter. while (local_sync_counter < next && old < local_sync_counter) { #if __CUDA_ARCH__ >= 700 - __nanosleep(backoff); -#else - // __nanosleep is not available for sm < 70 - assert(false); + // __nanosleep only available on compute capability 7.0 or higher + __nanosleep(backoff); // avoids busy waiting #endif if (backoff < backoff_max) { backoff *= 2; diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu b/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu index a75d0d5904a..83382f4704c 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/grid_reduction.cu @@ -69,7 +69,7 @@ template < typename Func> __device__ void gridReduceLastBlock( T& out, - const T* in, + const volatile T* in, const nvfuser_index_t grid_reduction_segment_size, // Number of reductions across // grid reduce dimensions diff --git a/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu b/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu index 0ccb07142aa..a134bd81c2d 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/grid_sync.cu @@ -54,10 +54,8 @@ __device__ void sync(int64_t& semaphore, const uint64_t& segment_size) { // Put a sleep here so we have some breaks in probing the global // semaphore, giving a better chance for other warps/blocks to catch up. 
#if __CUDA_ARCH__ >= 700 - __nanosleep(200); -#else - // __nanosleep is not available for sm < 70 - assert(false); + // __nanosleep only available on compute capability 7.0 or higher + __nanosleep(200); // avoids busy waiting #endif } } diff --git a/torch/csrc/jit/codegen/cuda/runtime/helpers.cu b/torch/csrc/jit/codegen/cuda/runtime/helpers.cu index 61dccb4dff2..02fd8bf8777 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/helpers.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/helpers.cu @@ -279,3 +279,19 @@ template <> double pow(double a, double b) { return ::pow(a, b); } + +float pow(float a, int b) { + return pow(a, (float)b); +} + +double pow(double a, int b) { + return pow(a, (double)b); +} + +float pow(float a, int64_t b) { + return pow(a, (float)b); +} + +double pow(double a, int64_t b) { + return pow(a, (double)b); +} diff --git a/torch/csrc/jit/codegen/cuda/runtime/tensor.cu b/torch/csrc/jit/codegen/cuda/runtime/tensor.cu index aab51a8f158..ac4f2069b3b 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/tensor.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/tensor.cu @@ -19,3 +19,13 @@ struct Tensor { T* data; }; + +// Specialization for 0-dim case that's easy to pass in a CPU based tensor. +template +struct CpuScalarTensor { + __device__ T& operator[](int) { + return data; + }; + + T data; +}; diff --git a/torch/csrc/jit/codegen/cuda/runtime/welford.cu b/torch/csrc/jit/codegen/cuda/runtime/welford.cu index 07d848c55f2..c3b09d82b74 100644 --- a/torch/csrc/jit/codegen/cuda/runtime/welford.cu +++ b/torch/csrc/jit/codegen/cuda/runtime/welford.cu @@ -8,8 +8,8 @@ __inline__ __device__ void welfordCombine( T& a_avg, T& a_M2, TN& a_N, - const T& b_avg, - const T& b_M2, + const T b_avg, + const T b_M2, TN b_N) { if (b_N == 0) { return; @@ -183,9 +183,9 @@ __device__ void gridWelfordLastBlock( T& out_avg, T& out_M2, TN& out_N, - const T* in_avg, - const T* in_M2, - const TN* in_N, + const volatile T* in_avg, + const volatile T* in_M2, + const volatile TN* in_N, const nvfuser_index_t grid_reduction_segment_size, // Number of reductions across // grid reduce dimensions @@ -345,9 +345,9 @@ __device__ void gridWelford( out_avg, out_M2, out_N, - (T*)work_buf_avg, - (T*)work_buf_M2, - (TN*)work_buf_N, + work_buf_avg, + work_buf_M2, + work_buf_N, grid_reduction_segment_size, block_reduction_segment_size, shared_buf_avg, diff --git a/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp b/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp index b856d83ac92..8aa3081fcc6 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp @@ -43,6 +43,9 @@ ReductionParams innerPersistentHeuristic( // Set some targets for parallelization const int64_t n_elems = total_reduction_numel * total_iteration_numel; + const int64_t outer_reduction_numel = + total_reduction_numel / inner_most_dimension_numel; + // WARNING: At some point we may want to generate heuristics for another // device that is not the current device. 
const int64_t device_max_threads_per_multiprocessor = @@ -228,7 +231,7 @@ ReductionParams innerPersistentHeuristic( bdimz = std::min( std::min( std::max(max_threads_in_block / (bdimx * bdimy), (int64_t)1), - ceilDiv(total_reduction_numel, inner_most_dimension_numel)), + outer_reduction_numel), scheduler_utils::z_block_limit); // If 3D doesn't fill out the threads, adjust to add to bdimy @@ -251,15 +254,13 @@ ReductionParams innerPersistentHeuristic( bdimz = std::min( std::max(max_threads_in_block / (bdimx * bdimy), (int64_t)1), - ceilDiv(total_reduction_numel, inner_most_dimension_numel)); + outer_reduction_numel); bdimy = std::min( std::max(max_threads_in_block / (bdimx * bdimz), (int64_t)1), max_multi_reduction_factor); } - godim = ceilDiv(total_iteration_numel, bdimy); - bool vectorize = false; // Move unrolling factor into vectorization upto vectorization limit. @@ -275,8 +276,7 @@ ReductionParams innerPersistentHeuristic( if (inner_reduction_unroll_factor < max_unroll) { outer_reduction_unroll_factor = std::min( ceilDiv(max_unroll, inner_reduction_unroll_factor), - ceilDiv( - ceilDiv(total_reduction_numel, inner_most_dimension_numel), bdimz)); + ceilDiv(outer_reduction_numel, bdimz)); } godim = ceilDiv(total_iteration_numel, bdimy); @@ -304,9 +304,8 @@ ReductionParams innerPersistentHeuristic( while (outer_reduction_unroll_factor < max_unroll && batches_per_block_outer_reduction >= 2) { outer_reduction_unroll_factor *= 2; - batches_per_block_outer_reduction = roundUpPow2Or8(ceilDiv( - ceilDiv(total_reduction_numel, inner_most_dimension_numel), - bdimz * outer_reduction_unroll_factor)); + batches_per_block_outer_reduction = roundUpPow2Or8( + ceilDiv(outer_reduction_numel, bdimz * outer_reduction_unroll_factor)); } // If we haven't gotten to the max_unroll case, try to take it out of the @@ -334,7 +333,7 @@ ReductionParams innerPersistentHeuristic( inner_most_dimension_numel, inner_reduction_unroll_factor * batches_per_block_inner_reduction); bdimz = ceilDiv( - ceilDiv(total_reduction_numel, inner_most_dimension_numel), + outer_reduction_numel, outer_reduction_unroll_factor * batches_per_block_outer_reduction); // Try moving persistent buffer factors into threads until we have too many @@ -368,9 +367,8 @@ ReductionParams innerPersistentHeuristic( batches_per_block_outer_reduction = roundUpPow2Or8(batches_per_block_outer_reduction / 2); bdimz = ceilDiv( - ceilDiv(total_reduction_numel, inner_most_dimension_numel), + outer_reduction_numel, batches_per_block_outer_reduction * outer_reduction_unroll_factor); - continue; } break; @@ -410,13 +408,18 @@ ReductionParams innerPersistentHeuristic( pad_bdimx = pad_bdimx && bdimx * inner_reduction_unroll_factor != inner_most_dimension_numel; + // Will be used once supporting inter-block persistence + int64_t gdimx = LaunchParams::UNINITIALIZED_VAL; + int64_t gdimy = LaunchParams::UNINITIALIZED_VAL; + int64_t gdimz = LaunchParams::UNINITIALIZED_VAL; + ReductionParams rparams; rparams.persistent_kernel = true; rparams.fastest_dim = true; // Inner reduction domain - rparams.cross_block_inner_reduce = true; + rparams.cross_block_inner_reduction = true; rparams.block_dim_inner_reduction = ParallelType::TIDx; rparams.pad_inner_reduction_to_warp = pad_bdimx; rparams.batches_per_block_inner_reduction = batches_per_block_inner_reduction; @@ -432,8 +435,15 @@ ReductionParams innerPersistentHeuristic( if (rparams.multiple_reds_per_blk) { rparams.block_dim_iter_dom = ParallelType::TIDy; } - rparams.grid_dim_iter_dom = ParallelType::BIDx; - 
rparams.split_grid_dim_iter_dom = godim > scheduler_utils::x_grid_limit; + + if (godim > 1) { + rparams.grid_dim_iter_dom = ParallelType::BIDx; + if (godim > scheduler_utils::x_grid_limit) { + rparams.split_grid_dim_iter_dom = true; + gdimx = scheduler_utils::x_grid_limit; + } + } + if (iter_unroll_factor > 1) { rparams.unroll_iter_dom = true; rparams.unroll_factor_iter_dom = iter_unroll_factor; @@ -445,15 +455,15 @@ ReductionParams innerPersistentHeuristic( rparams.batches_per_block_outer_reduction = batches_per_block_outer_reduction; rparams.block_dim_outer_reduction = ParallelType::TIDz; - rparams.cross_block_outer_reduce = true; + rparams.cross_block_outer_reduction = true; rparams.unroll_outer_reduction = outer_reduction_unroll_factor > 1; rparams.unroll_factor_outer_reduction = outer_reduction_unroll_factor; } rparams.lparams = LaunchParams( - LaunchParams::UNINITIALIZED_VAL, - LaunchParams::UNINITIALIZED_VAL, - LaunchParams::UNINITIALIZED_VAL, + gdimx, + gdimy, + gdimz, LaunchParams::UNINITIALIZED_VAL, bdimy, LaunchParams::UNINITIALIZED_VAL); @@ -697,8 +707,8 @@ ReductionParams OuterPersistentHeuristic( rparams.persistent_kernel = true; rparams.fastest_dim = false; - rparams.cross_block_inner_reduce = true; - rparams.cross_grid_inner_reduce = false; + rparams.cross_block_inner_reduction = true; + rparams.cross_grid_inner_reduction = false; rparams.multiple_reds_per_blk = bdimx > 1; if (rparams.multiple_reds_per_blk) { diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp index fb478f1110f..fb465b287e6 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp @@ -391,6 +391,12 @@ class DomainMap { return nullptr; } + static bool hasReferenceTensorView(Fusion* fusion) { + FusionGuard fg(fusion); + DomainMap domain_map(fusion); + return domain_map.findReferenceTensorView() != nullptr; + } + private: // Determine if output TensorView is a valid reference tensor for this fusion. // The reference tensor must map to all the iterDomains in each input. @@ -417,7 +423,8 @@ class DomainMap { // Get concrete IDs for input root or rfactor domain std::unordered_set in_concrete_ids; for (auto in_id : input_tv->getMaybeRFactorDomain()) { - if (!in_id->isBroadcast() && !in_id->isReduction()) { + if (!ca_index_map_.getConcreteMappedID(in_id)->isBroadcast() && + !in_id->isReduction()) { in_concrete_ids.insert(ca_index_map_.getConcreteMappedID(in_id)); } } @@ -491,6 +498,10 @@ class DomainMap { } // namespace +bool hasReferenceTensorView(Fusion* fusion) { + return DomainMap::hasReferenceTensorView(fusion); +} + // TODO: Inline intermediate operations (avoid inlining unrolled/vectorized // input/output caches) void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { @@ -503,7 +514,8 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { // maybe has_reduction for scheduling should be done on a per output tensor // basis. 
TORCH_INTERNAL_ASSERT( - !fusion->hasReduction(), "This scheduler only handles pointwise ops."); + ir_utils::getReductionOps(fusion).empty(), + "This scheduler only handles pointwise ops."); // For intermediate outputs, apply cache_fork auto outs = fusion->outputs(); diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h index cb626556579..57b77bb20cc 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h @@ -31,6 +31,11 @@ TORCH_CUDA_CU_API LaunchParams schedulePointwise( Fusion* fusion, const at::ArrayRef& runtime_inputs); +//! Utility for canSchedule interface to check if this fusion has +//! a fully broadcasted reference tensor, which is necessary for +//! the pointwise scheduler. +bool hasReferenceTensorView(Fusion* fusion); + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp b/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp index b0d4f12b921..088968b0890 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp @@ -334,9 +334,9 @@ ReductionParams innerReductionHeuristic( ReductionParams rparams; rparams.fastest_dim = true; - rparams.cross_block_inner_reduce = true; + rparams.cross_block_inner_reduction = true; rparams.block_dim_inner_reduction = ParallelType::TIDx; - rparams.cross_grid_inner_reduce = gridim > 1; + rparams.cross_grid_inner_reduction = gridim > 1; rparams.multiple_reds_per_blk = bdimy > 1; bool pad_bdimx = bdimx > 16 && bdimx * bdimy < @@ -359,7 +359,9 @@ ReductionParams innerReductionHeuristic( rparams.vectorize_inner_reduction = vectorize; } - rparams.block_dim_iter_dom = ParallelType::TIDy; + if (rparams.multiple_reds_per_blk) { + rparams.block_dim_iter_dom = ParallelType::TIDy; + } if (iter_unroll_factor > 1) { rparams.unroll_iter_dom = true; rparams.unroll_factor_iter_dom = iter_unroll_factor; @@ -368,10 +370,10 @@ ReductionParams innerReductionHeuristic( rparams.schedule_3D = total_reduction_numel != inner_most_dimension_numel; // Outer reduction domain if (rparams.schedule_3D) { - rparams.cross_grid_outer_reduce = grodim > 1; + rparams.cross_grid_outer_reduction = grodim > 1; if (bdimz > 1) { rparams.block_dim_outer_reduction = ParallelType::TIDz; - rparams.cross_block_outer_reduce = true; + rparams.cross_block_outer_reduction = true; } rparams.unroll_outer_reduction = outer_reduction_unroll_factor > 1; rparams.unroll_factor_outer_reduction = outer_reduction_unroll_factor; @@ -385,39 +387,40 @@ ReductionParams innerReductionHeuristic( // gdimx assigned to grdim. Otherwise it's helpful to pull godim into gdimx in // case it's larger than gdimy can hold, as not doing so can thrash the cache. 
- if (rparams.cross_grid_inner_reduce) { + if (rparams.cross_grid_inner_reduction) { rparams.grid_dim_inner_reduction = ParallelType::BIDx; - gdimx = gridim; - rparams.split_grid_dim_inner_reduction = - gdimx > scheduler_utils::x_grid_limit; + rparams.split_grid_dim_inner_reduction = true; + gdimx = std::min(gridim, scheduler_utils::x_grid_limit); rparams.grid_dim_iter_dom = ParallelType::BIDy; - gdimy = godim; - rparams.split_grid_dim_iter_dom = gdimy > scheduler_utils::y_grid_limit; + if (godim > scheduler_utils::y_grid_limit) { + rparams.split_grid_dim_iter_dom = true; + gdimy = std::min(godim, scheduler_utils::y_grid_limit); + } } else { - gdimx = godim; rparams.grid_dim_iter_dom = ParallelType::BIDx; - rparams.split_grid_dim_iter_dom = gdimx > scheduler_utils::x_grid_limit; + if (gdimx > scheduler_utils::x_grid_limit) { + rparams.split_grid_dim_iter_dom = true; + gdimx = godim; + } } - if (rparams.cross_grid_outer_reduce) { - if (rparams.cross_block_inner_reduce) { - gdimz = grodim; + if (rparams.cross_grid_outer_reduction) { + if (rparams.cross_block_inner_reduction) { rparams.grid_dim_outer_reduction = ParallelType::BIDz; + gdimz = std::min(grodim, scheduler_utils::z_grid_limit); + rparams.split_grid_dim_outer_reduction = true; } else { - gdimy = grodim; rparams.grid_dim_outer_reduction = ParallelType::BIDy; + gdimy = std::min(grodim, scheduler_utils::y_grid_limit); + rparams.split_grid_dim_outer_reduction = true; } } rparams.lparams = LaunchParams( - rparams.grid_dim_iter_dom == ParallelType::BIDx - ? LaunchParams::UNINITIALIZED_VAL - : gdimx, - rparams.grid_dim_iter_dom == ParallelType::BIDy - ? LaunchParams::UNINITIALIZED_VAL - : gdimy, + gdimx, + gdimy, gdimz, bdimx, bdimy > 1 ? bdimy : LaunchParams::UNINITIALIZED_VAL, @@ -441,12 +444,13 @@ ReductionParams innerReductionHeuristic( // schedule if (rparams.schedule_3D) { if (rparams.multiple_reds_per_blk && - (rparams.cross_grid_inner_reduce || rparams.cross_grid_outer_reduce)) { + (rparams.cross_grid_inner_reduction || + rparams.cross_grid_outer_reduction)) { if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) { std::cerr << "\n===== UNSUPPORTED REDUCTION HEURISTIC ========\n"; std::cerr << rparams.multiple_reds_per_blk << ", " << rparams.unroll_inner_reduction << ", " - << rparams.cross_grid_inner_reduce << std::endl; + << rparams.cross_grid_inner_reduction << std::endl; } return innerReductionHeuristic( total_reduction_numel, @@ -534,9 +538,9 @@ ReductionParams OuterReductionHeuristic( // domain for this // Blocks for reductions - int64_t gdimy = 1; + int64_t grdim = 1; // Blocks for outputs - int64_t gdimx = 1; + int64_t gidim = 1; // Threads for reduction int64_t bdimy = 1; @@ -597,11 +601,11 @@ ReductionParams OuterReductionHeuristic( std::min(max_unroll, ceilDiv(total_reduction_numel, bdimy)); // Go cross grid - gdimy = ceilDiv( + grdim = ceilDiv( ceilDiv(total_reduction_numel, bdimy * inner_reduction_unroll_factor), (int64_t)4); - gdimx = ceilDiv(total_iteration_numel, bdimx * iter_unroll_factor); + gidim = ceilDiv(total_iteration_numel, bdimx * iter_unroll_factor); // Clang tidy constexpr int64_t kEight = 8; @@ -611,13 +615,13 @@ ReductionParams OuterReductionHeuristic( if (ceilDiv(total_reduction_numel, bdimy * inner_reduction_unroll_factor) >= kThirtyTwo) { // Many reduction elements, go cross grid - int64_t min_gdimy = 1; - if (gdimy > 1) { + int64_t min_grdim = 1; + if (grdim > 1) { // already cross grid, don't go below target or what was already set - min_gdimy = std::min(gdimy, ceilDiv(target_blocks, gdimx)); + 
min_grdim = std::min(grdim, ceilDiv(target_blocks, gidim)); } - gdimy = std::max( - min_gdimy, + grdim = std::max( + min_grdim, ceilDiv( ceilDiv( total_reduction_numel, bdimy * inner_reduction_unroll_factor), @@ -625,33 +629,33 @@ ReductionParams OuterReductionHeuristic( // Don't go too far above number of threads in a block since that's how many // threads are available to do final reduction iteration // This is good! - gdimy = std::min(gdimy, bdimx * bdimy * kEight); + grdim = std::min(grdim, bdimx * bdimy * kEight); } // Try to do some cleanup of ragged waves on device if ( // If we have less than 8 waves of blocks - gdimy * gdimx < device_multiprocessor_count * kEight && + grdim * gidim < device_multiprocessor_count * kEight && // And we don't have an even divisible number of blocks - (gdimy * gdimx) % device_multiprocessor_count != 0 && + (grdim * gidim) % device_multiprocessor_count != 0 && // And we have more than one wave - gdimy * gdimx > device_multiprocessor_count) { + grdim * gidim > device_multiprocessor_count) { // round waves down auto waves = - std::max((gdimx * gdimy) / device_multiprocessor_count, (int64_t)1); - auto new_gdimy = - std::max((waves * device_multiprocessor_count) / gdimx, (int64_t)1); + std::max((gidim * grdim) / device_multiprocessor_count, (int64_t)1); + auto new_grdim = + std::max((waves * device_multiprocessor_count) / gidim, (int64_t)1); if ( - // If difference is less than 25% of the original gdimy - (new_gdimy - gdimy) * 4 < gdimy && + // If difference is less than 25% of the original grdim + (new_grdim - grdim) * 4 < grdim && // and difference is less than 25% of the original number of blocks - ((new_gdimy * gdimx) - (gdimy * gdimx)) * 4 < gdimy * gdimx) { - gdimy = new_gdimy; + ((new_grdim * gidim) - (grdim * gidim)) * 4 < grdim * gidim) { + grdim = new_grdim; } } // Cannot unroll with cross grid reductions - if (gdimy > 1 && iter_unroll_factor > 1) { + if (grdim > 1 && iter_unroll_factor > 1) { // Readjust the thread bindings, ideally we would repeat the block setup // without considering iter domain unrolling, but for now will simplify bdimx = std::min(max_threads_in_block, bdimx * iter_unroll_factor); @@ -664,10 +668,18 @@ ReductionParams OuterReductionHeuristic( iter_unroll_factor = 1; } + int64_t gdimx = LaunchParams::UNINITIALIZED_VAL; + int64_t gdimy = LaunchParams::UNINITIALIZED_VAL; + ReductionParams rparams; // cross grid implies cross block - rparams.cross_block_inner_reduce = bdimy > 1 || gdimy > 1; - rparams.cross_grid_inner_reduce = gdimy > 1; + rparams.cross_block_inner_reduction = bdimy > 1 || grdim > 1; + rparams.cross_grid_inner_reduction = grdim > 1; + if (rparams.cross_grid_inner_reduction) { + rparams.split_grid_dim_inner_reduction = true; + rparams.grid_dim_inner_reduction = ParallelType::BIDy; + gdimy = std::min(grdim, scheduler_utils::y_grid_limit); + } rparams.multiple_reds_per_blk = bdimx > 1 || iter_unroll_factor > 1; if (rparams.multiple_reds_per_blk) { @@ -675,15 +687,12 @@ ReductionParams OuterReductionHeuristic( } rparams.grid_dim_iter_dom = ParallelType::BIDx; - rparams.split_grid_dim_iter_dom = gdimx > scheduler_utils::x_grid_limit; - - if (rparams.cross_grid_inner_reduce) { - rparams.grid_dim_inner_reduction = ParallelType::BIDy; - rparams.split_grid_dim_inner_reduction = - gdimy > scheduler_utils::y_grid_limit; + if (gidim > scheduler_utils::x_grid_limit) { + rparams.split_grid_dim_iter_dom = true; + gdimx = scheduler_utils::x_grid_limit; } - if (rparams.cross_block_inner_reduce) { + if 
(rparams.cross_block_inner_reduction) { if (rparams.block_dim_iter_dom == ParallelType::TIDx) { rparams.block_dim_inner_reduction = ParallelType::TIDy; } else { @@ -702,7 +711,7 @@ ReductionParams OuterReductionHeuristic( } rparams.lparams = LaunchParams( - LaunchParams::UNINITIALIZED_VAL, + gdimx, gdimy, LaunchParams::UNINITIALIZED_VAL, rparams.multiple_reds_per_blk ? bdimx : bdimy, diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h index aafae3f09ff..a710e0c0ed8 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/reduction_heuristic.h @@ -31,9 +31,9 @@ class ReductionParams { // Inner Reduction Domain: // Reduce across the block? - bool cross_block_inner_reduce = false; + bool cross_block_inner_reduction = false; // Reduce across the grid? - bool cross_grid_inner_reduce = false; + bool cross_grid_inner_reduction = false; // Inner reduction unroll/vectorize bool unroll_inner_reduction = false; // Unrolling factor @@ -81,9 +81,9 @@ class ReductionParams { // Outer Reduction Domain if 3D Scheduled: // Reduce across the block? - bool cross_block_outer_reduce = false; + bool cross_block_outer_reduction = false; // Reduce across the grid? - bool cross_grid_outer_reduce = false; + bool cross_grid_outer_reduction = false; // Split grid dim for iteration axis in case it's too large for cuda bool split_grid_dim_outer_reduction = false; // Register persistent buffer size in outer dimension @@ -113,8 +113,8 @@ class ReductionParams { other.persistent_kernel == persistent_kernel && other.project_persistent_buffers == project_persistent_buffers && other.schedule_3D == schedule_3D && - other.cross_block_inner_reduce == cross_block_inner_reduce && - other.cross_grid_inner_reduce == cross_grid_inner_reduce && + other.cross_block_inner_reduction == cross_block_inner_reduction && + other.cross_grid_inner_reduction == cross_grid_inner_reduction && other.unroll_inner_reduction == unroll_inner_reduction && other.unroll_factor_inner_reduction == unroll_factor_inner_reduction && other.vectorize_inner_reduction == vectorize_inner_reduction && @@ -128,8 +128,8 @@ class ReductionParams { other.unroll_factor_iter_dom == unroll_factor_iter_dom && other.vectorize_iter_dom == vectorize_iter_dom && other.split_grid_dim_iter_dom == split_grid_dim_iter_dom && - other.cross_block_outer_reduce == cross_block_outer_reduce && - other.cross_grid_outer_reduce == cross_grid_outer_reduce && + other.cross_block_outer_reduction == cross_block_outer_reduction && + other.cross_grid_outer_reduction == cross_grid_outer_reduction && other.unroll_outer_reduction == unroll_outer_reduction && other.unroll_factor_outer_reduction == unroll_factor_outer_reduction && other.split_grid_dim_outer_reduction == @@ -153,10 +153,10 @@ class ReductionParams { if (schedule_3D) { ss << "3D Schedule\n" << "Outer Reduction: "; - if (cross_block_outer_reduce) { + if (cross_block_outer_reduction) { ss << "cross block - " << block_dim_outer_reduction << " / "; } - if (cross_grid_outer_reduce) { + if (cross_grid_outer_reduction) { ss << "cross grid - " << grid_dim_outer_reduction << " / "; ss << (split_grid_dim_outer_reduction ? "split grid dim / " : ""); } @@ -189,18 +189,18 @@ class ReductionParams { ss << "\nInner Reduction Domain: "; - if (cross_block_inner_reduce) { + if (cross_block_inner_reduction) { ss << "cross block - " << block_dim_inner_reduction << " / "; ss << (pad_inner_reduction_to_warp ? 
" pad to warp / " : ""); } - if (cross_grid_inner_reduce) { + if (cross_grid_inner_reduction) { ss << "cross grid - " << grid_dim_inner_reduction << " / "; ss << (split_grid_dim_inner_reduction ? "split grid dim / " : ""); } if (batches_per_block_inner_reduction > 1 || persistent_kernel) { ss << "persistent batch - " << batches_per_block_inner_reduction << " / "; } - ss << (cross_grid_inner_reduce && split_grid_dim_inner_reduction + ss << (cross_grid_inner_reduction && split_grid_dim_inner_reduction ? "split grid dimension / " : "") << (vectorize_inner_reduction ? "vectorize / " : "") @@ -225,8 +225,8 @@ class ReductionParamsHash { static_cast(rp.persistent_kernel) << (bits - 2) ^ static_cast(rp.project_persistent_buffers) << (bits - 3) ^ static_cast(rp.schedule_3D) << (bits - 4) ^ - static_cast(rp.cross_block_inner_reduce) << (bits - 5) ^ - static_cast(rp.cross_grid_inner_reduce) << (bits - 6) ^ + static_cast(rp.cross_block_inner_reduction) << (bits - 5) ^ + static_cast(rp.cross_grid_inner_reduction) << (bits - 6) ^ static_cast(rp.unroll_inner_reduction) << (bits - 7) ^ static_cast(rp.unroll_factor_inner_reduction) ^ static_cast(rp.vectorize_inner_reduction) << (bits - 8) ^ @@ -239,8 +239,8 @@ class ReductionParamsHash { static_cast(rp.unroll_factor_iter_dom) ^ static_cast(rp.vectorize_iter_dom) << (bits - 14) ^ static_cast(rp.split_grid_dim_iter_dom) << (bits - 15) ^ - static_cast(rp.cross_block_outer_reduce) << (bits - 16) ^ - static_cast(rp.cross_grid_outer_reduce) << (bits - 17) ^ + static_cast(rp.cross_block_outer_reduction) << (bits - 16) ^ + static_cast(rp.cross_grid_outer_reduction) << (bits - 17) ^ static_cast(rp.split_grid_dim_outer_reduction) << (bits - 18) ^ static_cast(rp.batches_per_block_outer_reduction) << (bits - 19); diff --git a/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp b/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp index 3850fa9638b..57988d8d994 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.cpp @@ -43,257 +43,170 @@ TensorView* scheduleReductionTV( !(!rparams.fastest_dim && rparams.vectorize_inner_reduction), "Cannot vectorize reduction domain on outer reductions."); - TORCH_INTERNAL_ASSERT( - !(rparams.cross_grid_inner_reduce && rparams.persistent_kernel), - "Grid reductions not implemented yet for persistent kernels."); - TORCH_INTERNAL_ASSERT( !(rparams.multiple_reds_per_blk && !has_iter_axis), "Multiple reductions requires an iter domain, but one wasn't found."); TORCH_INTERNAL_ASSERT( - !(rparams.cross_grid_inner_reduce && rparams.unroll_iter_dom), + !(rparams.cross_grid_inner_reduction && rparams.unroll_iter_dom), "Unrolling on iter domain not supported with cross grid reductions."); TORCH_INTERNAL_ASSERT( !(rparams.unroll_iter_dom && !has_iter_axis), "Unrolling on iter domain requires an iter domain."); - // Inner reduction axis: - if (rparams.unroll_inner_reduction) { - if (rparams.persistent_kernel) { - if (rparams.vectorize_inner_reduction) { - reduction_tv->split( - inner_reduce_axis, - rparams.batches_per_block_inner_reduction, - false); - reduction_tv->split( - inner_reduce_axis + 1, rparams.unroll_factor_inner_reduction); + auto vectorize = [&reduction_tv](int axis, int factor) { + reduction_tv->split(axis, factor); + reduction_tv->axis(axis + 1)->parallelize(ParallelType::Vectorize); + }; - reduction_tv->axis(inner_reduce_axis + 1) - ->parallelize(rparams.block_dim_inner_reduction); - if (rparams.pad_inner_reduction_to_warp) { - 
reduction_tv->axis(inner_reduce_axis + 1)->padToMultipleOfWarp(); - } - reduction_tv->axis(inner_reduce_axis + 2) - ->parallelize(ParallelType::Vectorize); - } else { - reduction_tv->split( - inner_reduce_axis, - rparams.batches_per_block_inner_reduction * - rparams.unroll_factor_inner_reduction, - false); - reduction_tv->split( - inner_reduce_axis, rparams.unroll_factor_inner_reduction); + auto inner_parallel = [&reduction_tv](int axis, ParallelType ptype) { + reduction_tv->split(axis, NamedScalar::getParallelDim(ptype)); + reduction_tv->axis(axis + 1)->parallelize(ptype); + }; - reduction_tv->axis(inner_reduce_axis + 1) - ->parallelize(ParallelType::Unroll); - reduction_tv->axis(inner_reduce_axis + 2) - ->parallelize(rparams.block_dim_inner_reduction); - if (rparams.pad_inner_reduction_to_warp) { - reduction_tv->axis(inner_reduce_axis + 2)->padToMultipleOfWarp(); - } - } - } else { - if (isParallelTypeThread(rparams.block_dim_inner_reduction)) { - if (rparams.vectorize_inner_reduction) { - reduction_tv->split( - inner_reduce_axis, rparams.unroll_factor_inner_reduction); - reduction_tv->split( - inner_reduce_axis, - NamedScalar::getParallelDim(rparams.block_dim_inner_reduction)); - reduction_tv->axis(inner_reduce_axis + 2) - ->parallelize(ParallelType::Vectorize); - reduction_tv->axis(inner_reduce_axis + 1) - ->parallelize(rparams.block_dim_inner_reduction); - if (rparams.pad_inner_reduction_to_warp) { - reduction_tv->axis(inner_reduce_axis + 1)->padToMultipleOfWarp(); - } - } else { - reduction_tv->split( - inner_reduce_axis, - NamedScalar::getParallelDim(rparams.block_dim_inner_reduction)); - reduction_tv->split( - inner_reduce_axis, rparams.unroll_factor_inner_reduction); + auto inner_unswitch = [&reduction_tv](int axis) { + reduction_tv->split(axis, 1); + reduction_tv->axis(axis + 1)->parallelize(ParallelType::Unswitch); + }; - reduction_tv->axis(inner_reduce_axis + 1) - ->parallelize(ParallelType::Unroll); - reduction_tv->axis(inner_reduce_axis + 2) - ->parallelize(rparams.block_dim_inner_reduction); + auto inner_unroll = [&reduction_tv](int axis, int factor) { + reduction_tv->split(axis, factor); + reduction_tv->axis(axis + 1)->parallelize(ParallelType::Unroll); + }; - if (rparams.pad_inner_reduction_to_warp) { - reduction_tv->axis(inner_reduce_axis + 2)->padToMultipleOfWarp(); - } - } - } else { - // Inner reduction is not parallelized, but is unrolled or vectorized: - reduction_tv->split( - inner_reduce_axis, rparams.unroll_factor_inner_reduction); - reduction_tv->axis(inner_reduce_axis + 1) - ->parallelize( - rparams.vectorize_inner_reduction ? 
ParallelType::Vectorize - : ParallelType::Unroll); - } + auto outer_parallel = [&reduction_tv](int axis, ParallelType ptype) { + reduction_tv->split(axis, NamedScalar::getParallelDim(ptype), false); + reduction_tv->axis(axis)->parallelize(ptype); + }; + + auto outer_unswitch = [&reduction_tv](int axis) { + reduction_tv->split(axis, 1, false); + reduction_tv->axis(axis)->parallelize(ParallelType::Unswitch); + }; + + auto outer_unroll = [&reduction_tv](int axis, int factor) { + reduction_tv->split(axis, factor, false); + reduction_tv->axis(axis)->parallelize(ParallelType::Unroll); + }; + + if (rparams.persistent_kernel) { + // Persistent Format: + // [Grid Split, persistent buffer, unswitch, unroll, thread dim, vectorize] + if (rparams.vectorize_inner_reduction) { + vectorize(inner_reduce_axis, rparams.unroll_factor_inner_reduction); + } + auto outer_i = inner_reduce_axis; + if (rparams.cross_grid_inner_reduction) { + outer_parallel(outer_i++, rparams.grid_dim_inner_reduction); } - // Unswitch axis which gives us finer control on allocations with - // unrolling - reduction_tv->split(inner_reduce_axis, 1); - reduction_tv->axis(inner_reduce_axis + 1) - ->parallelize(ParallelType::Unswitch); - } else { - // Parallelize reduction axis, don't unroll it0 - if (rparams.cross_block_inner_reduce) { - if (rparams.persistent_kernel) { - reduction_tv->split( - inner_reduce_axis, - rparams.batches_per_block_inner_reduction, - false); - reduction_tv->axis(inner_reduce_axis + 1) - ->parallelize(rparams.block_dim_inner_reduction); - - if (rparams.pad_inner_reduction_to_warp) { - reduction_tv->axis(inner_reduce_axis + 1)->padToMultipleOfWarp(); - } - } else { - reduction_tv->split( - inner_reduce_axis, - NamedScalar::getParallelDim(rparams.block_dim_inner_reduction)); - reduction_tv->axis(inner_reduce_axis + 1) - ->parallelize(rparams.block_dim_inner_reduction); - if (rparams.pad_inner_reduction_to_warp) { - reduction_tv->axis(inner_reduce_axis + 1)->padToMultipleOfWarp(); - } - } - } else { - // No parallelization on reduction dim, fake an unswitch axis for - // rfactor - reduction_tv->split(inner_reduce_axis, 1); - reduction_tv->axis(inner_reduce_axis + 1) - ->parallelize(ParallelType::Unswitch); - } - } - - if (rparams.cross_grid_inner_reduce) { reduction_tv->split( - inner_reduce_axis, - NamedScalar::getParallelDim(rparams.grid_dim_inner_reduction), - false); - reduction_tv->axis(inner_reduce_axis) - ->parallelize(rparams.grid_dim_inner_reduction); + outer_i++, rparams.batches_per_block_inner_reduction, false); + + outer_unswitch(outer_i++); + + if (!rparams.vectorize_inner_reduction && rparams.unroll_inner_reduction) { + outer_unroll(outer_i++, rparams.unroll_factor_inner_reduction); + } + + reduction_tv->axis(outer_i)->parallelize(rparams.block_dim_inner_reduction); + + if (rparams.pad_inner_reduction_to_warp) { + reduction_tv->axis(outer_i)->padToMultipleOfWarp(); + } + + } else { + // Non-persistent format: + // [Grid Split, Remainder, unswitch, unroll, thread dim, vectorize] + if (rparams.vectorize_inner_reduction) { + vectorize(inner_reduce_axis, rparams.unroll_factor_inner_reduction); + } + + if (rparams.cross_block_inner_reduction) { + inner_parallel(inner_reduce_axis, rparams.block_dim_inner_reduction); + if (rparams.pad_inner_reduction_to_warp) { + reduction_tv->axis(inner_reduce_axis + 1)->padToMultipleOfWarp(); + } + } + + if (!rparams.vectorize_inner_reduction && rparams.unroll_inner_reduction) { + inner_unroll(inner_reduce_axis, rparams.unroll_factor_inner_reduction); + } + + 
inner_unswitch(inner_reduce_axis); + if (rparams.cross_grid_inner_reduction) { + if (rparams.split_grid_dim_inner_reduction) { + outer_parallel(inner_reduce_axis, rparams.grid_dim_inner_reduction); + } else { + reduction_tv->axis(inner_reduce_axis) + ->parallelize(rparams.grid_dim_inner_reduction); + } + } } // Outer reduction axis if (rparams.schedule_3D) { - if (rparams.unroll_outer_reduction) { - if (rparams.persistent_kernel) { - reduction_tv->split( - outer_reduce_axis, - rparams.batches_per_block_outer_reduction * - rparams.unroll_factor_outer_reduction, - false); - reduction_tv->split( - outer_reduce_axis, rparams.unroll_factor_outer_reduction); - - reduction_tv->axis(outer_reduce_axis + 1) - ->parallelize(ParallelType::Unroll); - reduction_tv->axis(outer_reduce_axis + 2) - ->parallelize(rparams.block_dim_outer_reduction); - } else { - if (isParallelTypeThread(rparams.block_dim_outer_reduction)) { - reduction_tv->split( - outer_reduce_axis, - NamedScalar::getParallelDim(rparams.block_dim_outer_reduction)); - reduction_tv->split( - outer_reduce_axis, rparams.unroll_factor_outer_reduction); - - reduction_tv->axis(outer_reduce_axis + 1) - ->parallelize(ParallelType::Unroll); - reduction_tv->axis(outer_reduce_axis + 2) - ->parallelize(rparams.block_dim_outer_reduction); - - } else { - // outer reduction is not parallelized, but is unrolled or vectorized: - reduction_tv->split( - outer_reduce_axis, rparams.unroll_factor_outer_reduction); - reduction_tv->axis(outer_reduce_axis + 1) - ->parallelize(ParallelType::Unroll); - } + if (rparams.persistent_kernel) { + // Persistent Format: + // [Grid Split, persistent buffer, unroll, thread dim] + auto outer_i = outer_reduce_axis; + if (rparams.cross_grid_outer_reduction) { + outer_parallel(outer_i++, rparams.grid_dim_outer_reduction); } - } else { - // Parallelize reduction axis, don't unroll it0 - if (rparams.cross_block_outer_reduce) { - if (rparams.persistent_kernel) { - reduction_tv->split( - outer_reduce_axis, - rparams.batches_per_block_outer_reduction, - false); - reduction_tv->axis(outer_reduce_axis + 1) - ->parallelize(rparams.block_dim_outer_reduction); - } else { - reduction_tv->split( - outer_reduce_axis, - NamedScalar::getParallelDim(rparams.block_dim_outer_reduction)); - reduction_tv->axis(outer_reduce_axis + 1) - ->parallelize(rparams.block_dim_outer_reduction); - } - } - } - if (rparams.cross_grid_outer_reduce) { reduction_tv->split( - outer_reduce_axis, - NamedScalar::getParallelDim(rparams.grid_dim_outer_reduction), - false); - reduction_tv->axis(outer_reduce_axis) - ->parallelize(rparams.grid_dim_outer_reduction); + outer_i++, rparams.batches_per_block_outer_reduction, false); + + if (rparams.unroll_outer_reduction) { + outer_unroll(outer_i++, rparams.unroll_factor_outer_reduction); + } + + reduction_tv->axis(outer_i)->parallelize( + rparams.block_dim_outer_reduction); + } else { + // Non-persistent format: + // [Grid Split, Remainder, unroll, thread dim] + if (rparams.cross_block_outer_reduction) { + inner_parallel(outer_reduce_axis, rparams.block_dim_outer_reduction); + } + + if (rparams.unroll_outer_reduction) { + inner_unroll(outer_reduce_axis, rparams.unroll_factor_outer_reduction); + } + + if (rparams.cross_grid_outer_reduction) { + outer_parallel(outer_reduce_axis, rparams.grid_dim_outer_reduction); + } } } // Iteration domain if (has_iter_axis) { + // [Grid Split, unswitch, unroll, thread dim, vectorize] + + if (rparams.vectorize_iter_dom) { + vectorize(iter_axis, rparams.unroll_factor_iter_dom); + } + if 
(isParallelTypeThread(rparams.block_dim_iter_dom)) { - if (rparams.vectorize_iter_dom) { - reduction_tv->split(iter_axis, rparams.unroll_factor_iter_dom); - reduction_tv->axis(iter_axis + 1)->parallelize(ParallelType::Vectorize); - - reduction_tv->split( - iter_axis, NamedScalar::getParallelDim(rparams.block_dim_iter_dom)); - reduction_tv->axis(iter_axis + 1) - ->parallelize(rparams.block_dim_iter_dom); - } else { - if ((rparams.fastest_dim && rparams.multiple_reds_per_blk) || - !rparams.fastest_dim) { - reduction_tv->split( - iter_axis, - NamedScalar::getParallelDim(rparams.block_dim_iter_dom)); - reduction_tv->axis(iter_axis + 1) - ->parallelize(rparams.block_dim_iter_dom); - } - if (rparams.unroll_iter_dom) { - reduction_tv->split(iter_axis, rparams.unroll_factor_iter_dom); - reduction_tv->axis(iter_axis + 1)->parallelize(ParallelType::Unroll); - } - } - } else if (rparams.unroll_iter_dom) { - // Iteration domain is not parallelized but it is unrolled or vectorized - reduction_tv->split(iter_axis, rparams.unroll_factor_iter_dom); - if (rparams.vectorize_iter_dom) { - reduction_tv->axis(iter_axis + 1)->parallelize(ParallelType::Vectorize); - } else { - reduction_tv->axis(iter_axis + 1)->parallelize(ParallelType::Unroll); - } + inner_parallel(iter_axis, rparams.block_dim_iter_dom); } + + if (!rparams.vectorize_iter_dom && rparams.unroll_iter_dom) { + inner_unroll(iter_axis, rparams.unroll_factor_iter_dom); + } + if (rparams.unroll_iter_dom) { - reduction_tv->split(iter_axis, 1); - reduction_tv->axis(iter_axis + 1)->parallelize(ParallelType::Unswitch); + inner_unswitch(iter_axis); } - if (rparams.fastest_dim && rparams.split_grid_dim_iter_dom) { - reduction_tv->split(iter_axis, scheduler_utils::x_grid_limit); - reduction_tv->axis(iter_axis + 1)->parallelize(rparams.grid_dim_iter_dom); - } else { - reduction_tv->axis(iter_axis)->parallelize(rparams.grid_dim_iter_dom); + if (isParallelTypeThread(rparams.grid_dim_iter_dom)) { + if (rparams.split_grid_dim_iter_dom) { + outer_parallel(iter_axis, rparams.grid_dim_iter_dom); + } else { + reduction_tv->axis(iter_axis)->parallelize(rparams.grid_dim_iter_dom); + } } } @@ -563,6 +476,48 @@ void multiReductionInliner( scheduler_utils::computeWithOutputs( red_tv, pos, ComputeAtMode::BestEffort); } + // For topologies where there may not be paths to all inputs/outputs from + // the reductions, we need to take a similar approach to the unrolled + // version and setup of compute at from inputs->outputs that are not + // inputs/outputs of the reductions. 
+ std::vector compute_to; + std::unordered_set outs_of_reds; + { + auto outs_of_red_vec = ir_utils::outputTvsOf(ref_tvs); + outs_of_reds = std::unordered_set( + outs_of_red_vec.begin(), outs_of_red_vec.end()); + } + for (auto out : ir_utils::filterByType(fusion->outputs())) { + // only terminating outputs + if (out->uses().size()) { + continue; + } + if (outs_of_reds.find(out) != outs_of_reds.end()) { + continue; + } + compute_to.push_back(out); + } + + std::vector compute_from; + std::unordered_set inps_of_reds; + { + auto inps_of_red_vec = ir_utils::inputTvsOf(ref_tvs); + inps_of_reds = std::unordered_set( + inps_of_red_vec.begin(), inps_of_red_vec.end()); + } + for (auto inp : ir_utils::filterByType(fusion->inputs())) { + if (inps_of_reds.find(inp) != inps_of_reds.end()) { + continue; + } + compute_from.push_back(inp); + } + + scheduler_utils::computeAtBetween( + compute_from, + compute_to, + -1, + ComputeAtMode::MostInlined, + mapped_to_trivial_reduction); } } @@ -595,12 +550,6 @@ int idPos(const IterDomain* id) { } inner_most--; - // Reduction and block - if (id->isReduction() && id->isBlockDim()) { - return inner_most; - } - inner_most--; - // Reduction and constant if (id->isReduction() && id->extent()->isConstScalar()) { return inner_most; @@ -614,7 +563,7 @@ int idPos(const IterDomain* id) { inner_most--; // Reduction and thread - if (id->isReduction() && id->isThreadDim()) { + if (id->isReduction() && id->isThread()) { return inner_most; } inner_most--; diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 46b574ac6af..4f2982b01f2 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1,10 +1,12 @@ #include +#include #include #include #include #include #include #include +#include #include #include @@ -38,7 +40,8 @@ class SchedulerTopologyChecker { auto all_vals = fusion->usedMathVals(); std::vector reduction_tvs; for (auto tv : ir_utils::filterByType(all_vals)) { - if (tv->hasReduction() && !fusion->hasInput(tv)) { + if (tv->hasReduction() && + !(fusion == tv->fusion() && tv->isFusionInput())) { reduction_tvs.push_back(tv); } } @@ -355,6 +358,50 @@ class SchedulerTopologyChecker { return true; } }; + +bool isConnectedFusionGraph(Fusion* fusion) { + if (fusion->outputs().empty()) { + // Trivial case interpreted as connected + return true; + } + + // A set of connected components on the fusion graph + DisjointSet component_sets; + + // Iterate through all used exprs + for (auto expr : fusion->exprs()) { + TORCH_INTERNAL_ASSERT( + !expr->inputs().empty(), "unknown expr with zero input"); + + // Each expr joins all its inputs and + // outputs to the same component + auto input0 = expr->inputs()[0]; + for (auto input : expr->inputs()) { + component_sets.join(input0, input); + } + for (auto output : expr->outputs()) { + component_sets.join(input0, output); + } + } + + // Join aliased outputs + for (auto alias_it : fusion->ioAlias()) { + component_sets.join(alias_it.first, alias_it.second); + } + + // Check connected-ness: + // If there is no independent compute flow + // on this fusion graph, all outputs will be + // equivalent/connected to the first output. 
+ auto output0 = fusion->outputs()[0]; + for (auto output : fusion->outputs()) { + if (!component_sets.areEquivalent(output0, output)) { + return false; + } + } + return true; +} + } // namespace SchedulerRuntimeInfo::SchedulerRuntimeInfo( @@ -634,39 +681,10 @@ bool SchedulerEntry::sameAs(const SchedulerEntry* other) { } namespace { -template -inline bool isTrivialReduction(REDUCTION_OP* red) { - auto o_tv = red->out()->template as(); - // Assuming graph unscheduled at this point. - for (auto id : o_tv->getRootDomain()) { - if (id->isReduction() && !id->extent()->isOneInt()) { - return false; - } - } - return true; -} - -template -std::vector findReductionOps(Fusion* fusion) { - std::vector red_ops; - for (auto expr : fusion->exprs()) { - if (auto red = dynamic_cast(expr)) { - if (!isTrivialReduction(red)) { - red_ops.push_back(red); - } - } - } - return red_ops; -} - std::vector findTransposeOps(Fusion* fusion) { - std::vector transpose_ops; - for (auto expr : fusion->exprs()) { - if (auto transpose_op = dynamic_cast(expr)) { - transpose_ops.push_back(transpose_op); - } - } - return transpose_ops; + auto exprs = fusion->exprs(); + auto transpose_ops = ir_utils::filterByType(exprs); + return std::vector(transpose_ops.begin(), transpose_ops.end()); } static bool checkPatternEquivalence( @@ -765,9 +783,8 @@ class ReductionScheduler : public SchedulerEntry { } // Make sure reduction axes are consistent through the fusion - if (findReductionOps(fusion).size() + - findReductionOps(fusion).size() > - 1) { + auto reduction_ops = ir_utils::getReductionOps(fusion); + if (reduction_ops.size() > 1) { // Before examining the reduction axes want to quickly // check the reductions have the same axis width // to avoid building root domain map in easier cases @@ -857,9 +874,16 @@ class PointWiseScheduler : public SchedulerEntry { } static bool canScheduleCompileTime(Fusion* fusion) { - auto red_ops = findReductionOps(fusion); - auto welford_ops = findReductionOps(fusion); - return red_ops.empty() && welford_ops.empty(); + // Currently using the same path as the scheduler + // to eliminate mismatch between canSchedule and + // schedule pointwise. + if (!hasReferenceTensorView(fusion)) { + return false; + } + + auto reduction_ops = ir_utils::getReductionOps(fusion); + auto welford_ops = ir_utils::filterByType(reduction_ops); + return reduction_ops.empty() && welford_ops.empty(); } static bool canScheduleRunTime( @@ -900,6 +924,14 @@ class PersistentKernelScheduler : public SchedulerEntry { } static bool canScheduleCompileTime(Fusion* fusion) { + auto reduction_ops = ir_utils::getReductionOps(fusion); + auto welford_ops = ir_utils::filterByType(reduction_ops); + // For persistent schedule we want welford translated to average and + // standard deviation reductions. + if (welford_ops.begin() != welford_ops.end()) { + return false; + } + auto view_tvs = scheduler_utils::getViewTVs(fusion); if (view_tvs.size() > 0) { return false; @@ -1079,8 +1111,13 @@ bool checkCanSchedule( // since for all current use cases // it has to pass all the compile time checks to create a data cache for this // fusion. 
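Sketching the net control flow of the hunk below (not verbatim, but structurally the same):

    //   if (!data_cache) {
    //     if (!isConnectedFusionGraph(fusion)) return false;              // new, scheduler-agnostic
    //     if (!SchedulerType::canScheduleCompileTime(fusion)) return false;
    //   }
    //   return SchedulerType::canScheduleRunTime(fusion, runtime_info, data_cache);

The connectivity test is placed ahead of the per-scheduler compile-time checks, and both are skipped when a data cache already exists, since (per the comment above) a cache is only ever created after the compile-time checks have passed.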
- if (!data_cache && !SchedulerType::canScheduleCompileTime(fusion)) { - return false; + if (!data_cache) { + if (!isConnectedFusionGraph(fusion)) { + return false; + } + if (!SchedulerType::canScheduleCompileTime(fusion)) { + return false; + } } return SchedulerType::canScheduleRunTime(fusion, runtime_info, data_cache); diff --git a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp index 7ce9addf0cb..90b348236cf 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp @@ -287,6 +287,15 @@ class PersistentBufferResolution : public IterVisitor { } if (tv->hasReduction()) { + if (std::any_of( + resolution_points_.begin(), + resolution_points_.end(), + [&tv](TensorView* resolution_point) { + return DependencyCheck::isDependencyOf(resolution_point, tv); + })) { + // If already resolved, don't start a new reduction path. + return; + } on_reduction_path_.emplace(tv); } } @@ -587,7 +596,7 @@ void computeAtBetween( return mapped_to_trivial_reduction.count(id); }); - pos = pos_it == consumer->domain()->domain().end() + auto consumer_pos = pos_it == consumer->domain()->domain().end() ? pos : std::min( (int)std::distance( @@ -596,7 +605,7 @@ void computeAtBetween( (pos < 0 ? pos + (int)consumer->nDims() : pos)); // Assume we don't want to reset computeAt on tensors that have already // performed it. - producer->computeAt(consumer, pos, mode); + producer->computeAt(consumer, consumer_pos, mode); } } } @@ -1038,15 +1047,22 @@ std::vector> cacheAndForkOutputs( } namespace { +// If this is an rfactored reduction domain, actually check the root domain, +// this is because the rfactored reduction tensorview has the vectorized +// dimension, but that means the rfactor domain could have reordered what we +// consider the "inner most" allocated position on it if we consider the rfactor +// dimension. IterDomain* innerMostRootDim(TensorView* tv) { if (tv->nDims() == 0) { return nullptr; } IterDomain* inner_most_id = nullptr; - for (auto it = tv->getMaybeRFactorDomain().rbegin(); - it != tv->getMaybeRFactorDomain().rend(); - it++) { + auto root_domain = tv->hasReduction() && tv->hasRFactor() + ? tv->getRootDomain() + : tv->getMaybeRFactorDomain(); + + for (auto it = root_domain.rbegin(); it != root_domain.rend(); it++) { if ((*it)->isReduction() && tv->isFusionInput()) { continue; } @@ -1084,7 +1100,7 @@ IterDomain* projectIdToRoot( return reference_id; } - auto replay_exprs = ExprSort::getExprs(tv->fusion(), {reference_id}); + auto replay_exprs = StmtSort::getExprs(tv->fusion(), {reference_id}, false); if (replay_exprs.empty()) { return reference_id; } @@ -1193,12 +1209,16 @@ std::unordered_set FindAllMappedDims::from( TensorView* tv, IterDomain* id, bool vectorize_pass) { + auto root_domain = tv->hasReduction() && tv->hasRFactor() + ? 
tv->getRootDomain() + : tv->getMaybeRFactorDomain(); + TORCH_INTERNAL_ASSERT( std::find_if( - tv->getMaybeRFactorDomain().begin(), - tv->getMaybeRFactorDomain().end(), + root_domain.begin(), + root_domain.end(), [&id](IterDomain* root_id) { return root_id == id; }) != - tv->getMaybeRFactorDomain().end(), + root_domain.end(), "Tried to map out ", id, " from TV ", diff --git a/torch/csrc/jit/codegen/cuda/tensor_view.cpp b/torch/csrc/jit/codegen/cuda/tensor_view.cpp index 2bf8967f74e..911bda3da04 100644 --- a/torch/csrc/jit/codegen/cuda/tensor_view.cpp +++ b/torch/csrc/jit/codegen/cuda/tensor_view.cpp @@ -3,10 +3,13 @@ #include #include #include +#include #include #include #include #include +#include +#include // Cleanup #include @@ -24,8 +27,14 @@ DataType aten_opt_type_map(const c10::optional& scalar_type) { } } // namespace -TensorView::TensorView(TensorDomain* domain, DataType dtype, MemoryType mtype) - : Val(ValType::TensorView, dtype), domain_(domain), memory_type_(mtype) { +TensorView::TensorView( + IrBuilderPasskey passkey, + TensorDomain* domain, + DataType dtype, + MemoryType mtype) + : Val(passkey, ValType::TensorView, dtype), + domain_(domain), + memory_type_(mtype) { // Don't do this after transforms if (domain_->domain() == domain_->getRootDomain()) { // Mark the size-1 axes as broadcast to support implicit broadcast semantic @@ -38,10 +47,15 @@ TensorView::TensorView(TensorDomain* domain, DataType dtype, MemoryType mtype) } } -TensorView::TensorView(const std::shared_ptr& tensor_type) - : Val(ValType::TensorView, - aten_opt_type_map(tensor_type->scalarType()), - false) { +TensorView::TensorView( + IrBuilderPasskey passkey, + const std::shared_ptr& tensor_type) + : Val(passkey, + ValType::TensorView, + aten_opt_type_map(tensor_type->scalarType())) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); std::vector sizes; TORCH_CHECK( @@ -51,13 +65,14 @@ TensorView::TensorView(const std::shared_ptr& tensor_type) if (tensor_type->sizes()[i].has_value() && tensor_type->sizes()[i].value() == 1) { // If size is known to be 1, assuem it needs to be broadcasted. 
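As a concrete example of this branch (the profiled sizes are assumptions): a TensorType recorded with sizes (1, 128) produces

    //   dim 0: IterDomain(0, 1, Serial, BroadcastWithStride)   extent one, broadcast
    //   dim 1: IterDomain(0, Int())                            extent left symbolic

Only the value 1 is baked into the fusion; every other extent stays symbolic, which preserves the implicit-broadcast semantics mentioned in the constructor above while keeping the fusion reusable for other concrete sizes.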
- sizes.push_back(new IterDomain( - new Int(0), - new Int(1), + sizes.push_back(IrBuilder::create( + passkey.ir_container_->zeroVal(), + passkey.ir_container_->oneVal(), ParallelType::Serial, IterType::BroadcastWithStride)); } else { - sizes.push_back(new IterDomain(new Int(0), new Int())); + sizes.push_back(IrBuilder::create( + passkey.ir_container_->zeroVal(), IrBuilder::create())); } } @@ -92,8 +107,16 @@ TensorView::TensorView(const std::shared_ptr& tensor_type) } } - domain_ = new TensorDomain(sizes, contig_info); - name_ = fusion_->registerVal(this); + domain_ = IrBuilder::create(sizes, contig_info); +} + +TensorView::TensorView( + IrBuilderPasskey passkey, + const std::shared_ptr& jit_value) + : TensorView(passkey, jit_value->type()->cast()) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); } TensorView::TensorView(const TensorView* src, IrCloner* ir_cloner) @@ -102,7 +125,9 @@ TensorView::TensorView(const TensorView* src, IrCloner* ir_cloner) compute_at_pos_(src->compute_at_pos_), max_producer_pos_(src->max_producer_pos_), memory_type_(src->memory_type_), - swizzle_type_(src->swizzle_type_) { + swizzle_type_(src->swizzle_type_), + is_double_buffered_(src->is_double_buffered_), + cpu_scalar_(src->cpu_scalar_) { for (const auto id : src->axesToSwizzle()) { axes_to_swizzle_.push_back(ir_cloner->clone(id)); } @@ -152,6 +177,18 @@ std::vector::size_type TensorView::nDims() const { return domain()->nDims(); } +// sets cpu_scalar_ value, which is special handling for CPU based zero-dim +// tensors (i.e. CPU Tensors that only have one value). This is only used if +// on an input value, otherwise ignored. This is important as special handling +// because these "scalars" should be type promoted as a tensor, but we want to +// avoid explicit copying of the data, so we want to pass the data value as a +// standard kernel argument value. 
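A sketch of the kind of input this flag is for (the value is illustrative):

    //   at::Tensor alpha = at::scalar_tensor(2.0);   // CPU, dim() == 0, numel() == 1
    //   // is_cpu_scalar(alpha) holds, so rather than copying the tensor to the
    //   // device, its value can be passed as an ordinary kernel argument; the
    //   // matching fusion input is then marked with setCpuScalar(true).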
+void TensorView::setCpuScalar(bool is_cpu_scalar) { + TORCH_INTERNAL_ASSERT( + nDims() == 0, "Only 0-dim tensors can be marked as a cpu scalar."); + cpu_scalar_ = is_cpu_scalar; +} + IterDomain* TensorView::axis(int pos) const { TORCH_INTERNAL_ASSERT( nDims() > 0, "Tried to access an axis in a 0-dim TensorView"); @@ -167,6 +204,9 @@ IterDomain* TensorView::axis(int pos) const { } void TensorView::setComputeAt(unsigned int pos, bool decrease) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); if (pos <= compute_at_pos_ && !decrease) { return; } @@ -182,6 +222,9 @@ void TensorView::setComputeAt(unsigned int pos, bool decrease) { } void TensorView::setMaxProducer(unsigned int pos, bool decrease) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); if (pos <= max_producer_pos_ && !decrease) { return; } @@ -200,6 +243,9 @@ TensorView* TensorView::computeAt( TensorView* consumer, int position, ComputeAtMode mode) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); // Make sure this and consumer are not the same tensor, that's illegal TORCH_CHECK(!sameAs(consumer), "Cannot call this->computeAt(this, ...)"); @@ -228,6 +274,9 @@ TensorView* TensorView::computeWith( TensorView* consumer, int position, ComputeAtMode mode) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); // Make sure this and consumer are not the same tensor, that's illegal TORCH_CHECK(!sameAs(consumer), "Cannot call this->computeAt(this, ...)"); @@ -290,7 +339,7 @@ TensorView* TensorView::split( unsigned int factor, bool inner_split, bool trim_out_of_bounds) { - split(axis, new Int(factor), inner_split, trim_out_of_bounds); + split(axis, IrBuilder::create(factor), inner_split, trim_out_of_bounds); return this; } @@ -336,6 +385,9 @@ TensorView* TensorView::merge(int axis_o, int axis_i) { } TensorView* TensorView::reorder(const std::unordered_map& old2new_) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); TORCH_INTERNAL_ASSERT( !(nDims() == 0 && old2new_.size() > 0), "Tried to reorder a 0-dim TensorView"); @@ -383,6 +435,9 @@ TensorView* TensorView::reorder(const std::unordered_map& old2new_) { TensorView* TensorView::swizzle( SwizzleType type, const std::vector& axes) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); swizzle_type_ = type; // Clear previously set swizzle axes if any @@ -432,6 +487,9 @@ TensorView* TensorView::swizzle( } TensorView* TensorView::rFactor(const std::vector& axes) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); // TODO: I think we should do this but // NVFuserTest.FusionSmemBlockGemmCache_CUDA prevents it from going in at the // moment. @@ -462,7 +520,8 @@ TensorView* TensorView::rFactor(const std::vector& axes) { auto consumer_domain = domain_pair.second; // This domain will be the consumer, so create the producer - TensorView* producer = new TensorView(producer_domain, getDataType().value()); + TensorView* producer = + IrBuilder::create(producer_domain, getDataType().value()); // Set domain of consumer setDomain(consumer_domain); @@ -470,14 +529,14 @@ TensorView* TensorView::rFactor(const std::vector& axes) { // Setup dependency chain, inserting producer before this op. 
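A usage-level sketch of what this is setting up (names and the split factor are illustrative):

    //   auto tv1 = sum(tv0, {1});       // a single reduction axis
    //   tv1->split(1, 128);             // reduction axis -> two axes
    //   auto rf = tv1->rFactor({...});  // factor part of the reduction out

rFactor turns the one reduction into a two-step chain: the producer created next performs a partial reduction of the original input, and this tensor (now the consumer) reduces the producer's partial results to the final value, which is exactly the pair of ReductionOps built below.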
// Expr* producer_definition = - new ReductionOp( + IrBuilder::create( this_definition->getReductionOpType(), this_definition->init(), producer, this_definition->in()); // Expr* consumer_definition = - new ReductionOp( + IrBuilder::create( this_definition->getReductionOpType(), this_definition->init(), consumer, @@ -489,6 +548,9 @@ TensorView* TensorView::rFactor(const std::vector& axes) { TensorView* TensorView::welfordRfactorHelper( TensorView* tv, const std::vector& axes) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); // Hack: // Semantically we should always keep the outputs of welfordOp scheduled // the same but the user end cannot guarantee that. @@ -520,7 +582,8 @@ TensorView* TensorView::welfordRfactorHelper( std::vector new_contig( tv->domain()->contiguity().begin(), tv->domain()->contiguity().end()); // replace tensor domain of target tv - tv->setDomain(new TensorDomain(tv->getRootDomain(), new_id, new_contig)); + tv->setDomain(IrBuilder::create( + tv->getRootDomain(), new_id, new_contig)); } // Split tensor view into 2 parts @@ -532,7 +595,7 @@ TensorView* TensorView::welfordRfactorHelper( // This domain will be the consumer, so create the producer TensorView* producer = - new TensorView(producer_domain, tv->getDataType().value()); + IrBuilder::create(producer_domain, tv->getDataType().value()); // Set domain of consumer tv->setDomain(consumer_domain); @@ -545,6 +608,9 @@ WelfordResult TensorView::rFactor( TensorView* avg, TensorView* var, TensorView* n) { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); TORCH_INTERNAL_ASSERT(nDims() > 0, "Tried to rFactor a 0-dim TensorView"); FusionGuard fg(fusion()); TORCH_CHECK( @@ -588,7 +654,7 @@ WelfordResult TensorView::rFactor( // Setup dependency chain, inserting producer before this op. // Expr* producer_definition = - new WelfordOp( + IrBuilder::create( producer_avg, producer_var, producer_n, /*out var/avg/count */ @@ -600,7 +666,7 @@ WelfordResult TensorView::rFactor( wop->inN()); // Expr* consumer_definition = - new WelfordOp( + IrBuilder::create( avg, var, n, @@ -615,6 +681,9 @@ WelfordResult TensorView::rFactor( } TensorView* TensorView::cache_before() { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); FusionGuard fg(fusion()); TORCH_CHECK( @@ -652,8 +721,10 @@ TensorView* TensorView::cache_before() { // This domain will be the consumer which needs a new domain, so replace the // producers domain with this domain. - TensorView* producer = new TensorView( - new TensorDomain( + TensorView* producer = IrBuilder::create( + container(), + IrBuilder::create( + container(), domain()->getRootDomain(), domain()->getRFactorDomain(), domain()->domain(), @@ -671,8 +742,10 @@ TensorView* TensorView::cache_before() { new_root_domain[i++] = dom->clone(); } - consumer->setDomain(new TensorDomain( - new_root_domain, std::vector(new_root_domain.size(), true))); + consumer->setDomain(IrBuilder::create( + container(), + new_root_domain, + std::vector(new_root_domain.size(), true))); // Insert producer - Cache_Before (CB) - before this TV. 
// Before: Prev TV -> [Definition Op] -> This TV @@ -684,7 +757,7 @@ TensorView* TensorView::cache_before() { ir_utils::replaceValInExpr(definition(), this, producer); // Expr* producer_uses = - new UnaryOp(UnaryOpType::Set, consumer, producer); + IrBuilder::create(container(), UnaryOpType::Set, consumer, producer); // definition_ is no longer valid // setDefinition(nullptr); @@ -697,6 +770,9 @@ TensorView* TensorView::cache_before() { } TensorView* TensorView::cache_fork() { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); FusionGuard fg(fusion()); // Before: [Expr] -> This TV (Global Output) -> [Usage Expr] @@ -704,7 +780,7 @@ TensorView* TensorView::cache_fork() { // (Fork) -> [Set Expr] -> New TV (Global Output) TORCH_CHECK( - fusion()->hasOutput(this) && !this->uses().empty(), + this->isFusionOutput() && !this->uses().empty(), "Error adding cache_fork ", this, " this TensorView must be an output with subsequent uses"); @@ -717,14 +793,16 @@ TensorView* TensorView::cache_fork() { // This domain will be the producer, so create the consumer auto root_domain = TensorDomain::noReductions(getMaybeRFactorDomain()); - TensorView* new_output = new TensorView( - new TensorDomain( + TensorView* new_output = IrBuilder::create( + container(), + IrBuilder::create( + container(), IterDomain::clone(root_domain), std::vector(root_domain.size(), true)), getDataType().value()); // Create write operation from this TV to new output - new UnaryOp(UnaryOpType::Set, new_output, this); + IrBuilder::create(container(), UnaryOpType::Set, new_output, this); // The new TV becomes an output. // New TV has global memory type. @@ -739,13 +817,14 @@ TensorView* TensorView::cache_fork() { } TensorView* TensorView::cache_after() { + TORCH_INTERNAL_ASSERT( + !container()->isA(), + "Function invalid for kernel container."); FusionGuard fg(fusion()); - const bool kIsFusionInput = fusion()->hasInput(this); - // Get all the uses for this Tensorview TORCH_CHECK( - !fusion()->hasOutput(this), + !isFusionOutput(), "Error adding cache_after ", this, " we restrict using cache_after on an output."); @@ -759,7 +838,7 @@ TensorView* TensorView::cache_after() { // It also did additional transformation when this tensor is an // input and the outputs of its consumers have computeAt. Make sure // we no longer rely on that behavior. 
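Taken together, the cache_* helpers form a small staging API (the scheduling below is illustrative):

    //   auto in_cache  = tv_in->cache_after();    // input -> [Set] -> cache -> uses
    //   auto out_cache = tv_out->cache_before();  // defs -> cache -> [Set] -> output
    //   in_cache->setMemoryType(MemoryType::Shared);

cache_fork covers the remaining case, where a global output also feeds further computation, by forking off a new global output fed through a Set expression.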
- if (kIsFusionInput) { + if (isFusionInput()) { for (const auto& expr : uses()) { for (TensorView* output : ir_utils::filterByType(expr->outputs())) { @@ -782,9 +861,12 @@ TensorView* TensorView::cache_after() { } // This domain will be the producer, so create the consumer - TensorView* consumer = new TensorView( - new TensorDomain( - new_root_domain, std::vector(new_root_domain.size(), true)), + TensorView* consumer = IrBuilder::create( + container(), + IrBuilder::create( + container(), + new_root_domain, + std::vector(new_root_domain.size(), true)), getDataType().value()); // Set domain of producer - No Change @@ -800,14 +882,14 @@ TensorView* TensorView::cache_after() { } // Expr* consumer_definition = - new UnaryOp(UnaryOpType::Set, consumer, producer); + IrBuilder::create(container(), UnaryOpType::Set, consumer, producer); return consumer; } void TensorView::setMemoryType(MemoryType mt) { memory_type_ = mt; - if (fusion()->hasInput(this) || fusion()->hasOutput(this)) { + if (isFusionInput() || isFusionOutput()) { TORCH_INTERNAL_ASSERT( mt == MemoryType::Global, "Tried to set an input or output to the fusion to a non-global memory type."); @@ -832,7 +914,23 @@ void TensorView::clearReductionIterDomains() { } } - setDomain(new TensorDomain(new_root, new_contig)); + setDomain(IrBuilder::create(container(), new_root, new_contig)); +} + +void TensorView::doubleBuffer() { + // Early correctness checking. May miss eventual errors as the + // checks depend on memory types and parallelization, which may not + // be finalized until lowering. + validateDoubleBufferedTensor(this); + is_double_buffered_ = true; +} + +bool TensorView::isEmptyTensor() const { + auto& root_domain = getMaybeRFactorDomain(); + return std::all_of( + root_domain.begin(), root_domain.end(), [](IterDomain* id) { + return id->extent()->isZeroInt(); + }); } TensorViewBuilder& TensorViewBuilder::ndims(size_t ndims) { @@ -872,7 +970,8 @@ TensorView* TensorViewBuilder::build() const { std::vector domain(ndims_, nullptr); for (const auto i : c10::irange(ndims_)) { if (shape_.empty() || shape_[i] == -1) { - domain[i] = new IterDomain(new Int(0), new Int()); + domain[i] = IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), IrBuilder::create()); } else { TORCH_CHECK( shape_[i] >= 0, @@ -880,19 +979,22 @@ TensorView* TensorViewBuilder::build() const { "For a tensor representing a single scalar use ndims = 0 with no sizes set."); if (shape_[i] == 1) { // If size is known to be 1, assume it needs to be broadcasted. 
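Summarizing the branch below, each requested size maps onto a root IterDomain as

    //   shape_[i] == -1 (or no shape given)  ->  IterDomain(0, Int())        symbolic extent
    //   shape_[i] ==  1                      ->  IterDomain(0, 1, Serial, BroadcastWithStride)
    //   shape_[i] ==  N                      ->  IterDomain(0, Int(N))       concrete extent

so, for instance, TensorViewBuilder().ndims(2).build() yields a fully symbolic 2-D tensor, while an explicit shape of {1, 256} bakes in a broadcast dimension plus a concrete extent of 256 (the shapes here are only examples).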
- domain[i] = new IterDomain( - new Int(0), - new Int(1), + domain[i] = IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + FusionGuard::getCurFusion()->oneVal(), ParallelType::Serial, IterType::BroadcastWithStride); } else { - domain[i] = new IterDomain(new Int(0), new Int(shape_[i])); + domain[i] = IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + IrBuilder::create(shape_[i])); } } } // Create the final TensorView - return new TensorView(new TensorDomain(domain, contiguity_), dtype_); + return IrBuilder::create( + IrBuilder::create(domain, contiguity_), dtype_); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/transform_iter.cpp b/torch/csrc/jit/codegen/cuda/transform_iter.cpp index 54136616268..bae77943b33 100644 --- a/torch/csrc/jit/codegen/cuda/transform_iter.cpp +++ b/torch/csrc/jit/codegen/cuda/transform_iter.cpp @@ -228,7 +228,7 @@ BestEffortReplay::BestEffortReplay( } // Grab expr history of iter domains in target_domain - std::vector target_exprs = ExprSort::getExprs( + std::vector target_exprs = StmtSort::getExprs( FusionGuard::getCurFusion(), std::vector(target_domain.begin(), target_domain.end())); @@ -239,7 +239,7 @@ BestEffortReplay::BestEffortReplay( // replay_domain map. // Map replay domain's IterDomains to the Exprs they're used in - std::vector replay_exprs = ExprSort::getExprs( + std::vector replay_exprs = StmtSort::getExprs( FusionGuard::getCurFusion(), std::vector(replay_domain.begin(), replay_domain.end())); @@ -561,7 +561,7 @@ struct ConsumerForwardingInfo { auto consumer_bcast_ids_not_in_producer = consumer_bcast_roots_not_in_producer; - std::vector consumer_history = ExprSort::getExprs( + std::vector consumer_history = StmtSort::getExprs( FusionGuard::getCurFusion(), std::vector( consumer->domain()->domain().begin(), @@ -706,7 +706,7 @@ BestEffortReplay BestEffortReplay::replayCasP( } // Grab all exprs used to make the forwarded compliments - auto compliment_exprs = ExprSort::getExprs( + auto compliment_exprs = StmtSort::getExprs( FusionGuard::getCurFusion(), {compliments.begin(), compliments.end()}); // Figure out if there are any leaves in compliment_exprs that aren't diff --git a/torch/csrc/jit/codegen/cuda/transform_iter.h b/torch/csrc/jit/codegen/cuda/transform_iter.h index cde502d636e..f1c4ae378b5 100644 --- a/torch/csrc/jit/codegen/cuda/transform_iter.h +++ b/torch/csrc/jit/codegen/cuda/transform_iter.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/transform_replay.cpp b/torch/csrc/jit/codegen/cuda/transform_replay.cpp index d0d03532cd6..7ea96f74bf1 100644 --- a/torch/csrc/jit/codegen/cuda/transform_replay.cpp +++ b/torch/csrc/jit/codegen/cuda/transform_replay.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -49,23 +50,26 @@ class ReplaySelf : public ReplayTransformations { // Manually replay the split, following the output of the operations. // This is so rfactor ops are replayed correctly. - IterDomain* ido = new IterDomain( - new Int(0), + IterDomain* ido = IrBuilder::create( + s->container(), + s->container()->zeroVal(), s->innerSplit() ? remainder->as() : s->factor(), s->outer()->getParallelType(), s->outer()->getIterType(), s->outer()->isRFactorProduct()); // inner IterDomain - IterDomain* idi = new IterDomain( - new Int(0), + IterDomain* idi = IrBuilder::create( + s->container(), + s->container()->zeroVal(), s->innerSplit() ? 
s->factor() : remainder->as(), s->inner()->getParallelType(), s->inner()->getIterType(), s->inner()->isRFactorProduct()); // Generate the split node - new Split( + IrBuilder::create( + s->container(), ido, idi, mapped, @@ -112,14 +116,16 @@ class ReplaySelf : public ReplayTransformations { Val* merged_id_size = mul(id_outer_mapped->extent(), id_inner_mapped->extent()); - IterDomain* merged_id = new IterDomain( - new Int(0), + IterDomain* merged_id = IrBuilder::create( + m->container(), + m->container()->zeroVal(), merged_id_size->as(), m->out()->getParallelType(), m->outer()->getIterType(), m->out()->isRFactorProduct()); - new Merge(merged_id, id_outer_mapped, id_inner_mapped); + IrBuilder::create( + m->container(), merged_id, id_outer_mapped, id_inner_mapped); // Remove inputs from the leaf IDs leaf_ids_.erase(id_outer_mapped); @@ -197,7 +203,8 @@ TensorDomain* TransformReplay::fullSelfReplay( "Error during replay, didn't replay an axis."); new_rfactor_domain[i++] = it->second; } - return new TensorDomain( + return IrBuilder::create( + self->container(), new_self_root->getRootDomain(), new_rfactor_domain, new_domain, @@ -205,8 +212,11 @@ TensorDomain* TransformReplay::fullSelfReplay( } } - return new TensorDomain( - new_self_root->getRootDomain(), new_domain, new_self_root->contiguity()); + return IrBuilder::create( + self->container(), + new_self_root->getRootDomain(), + new_domain, + new_self_root->contiguity()); } // Producer could have rfactor axes which consumer may want replayed. We can @@ -407,7 +417,8 @@ std::pair TransformReplay::replayPasC( new_IDs.push_back(id); } } - TensorDomain* replayed = new TensorDomain( + TensorDomain* replayed = IrBuilder::create( + producer->container(), producer->getRootDomain(), producer->getRFactorDomain(), new_IDs, @@ -604,7 +615,8 @@ std::pair TransformReplay::replayCasP( if (used_IDs.find(id) == used_IDs.end()) new_IDs.push_back(id); - TensorDomain* replayed = new TensorDomain( + TensorDomain* replayed = IrBuilder::create( + consumer->container(), consumer->getRootDomain(), consumer->getRFactorDomain(), new_IDs, diff --git a/torch/csrc/jit/codegen/cuda/transform_replay.h b/torch/csrc/jit/codegen/cuda/transform_replay.h index 92898b54ba7..1fd3d110200 100644 --- a/torch/csrc/jit/codegen/cuda/transform_replay.h +++ b/torch/csrc/jit/codegen/cuda/transform_replay.h @@ -1,7 +1,7 @@ #pragma once +#include #include -#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp b/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp index 8ac28cf3a2c..5939ffee289 100644 --- a/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp +++ b/torch/csrc/jit/codegen/cuda/transform_rfactor.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -52,23 +53,26 @@ class ReplayRFactor : public ReplayTransformations { // Manually replay the split, making reduction = false and rfactor = true // outer IterDomain - IterDomain* ido = new IterDomain( - new Int(0), + IterDomain* ido = IrBuilder::create( + s->container(), + IrBuilder::create(s->container(), 0), s->innerSplit() ? remainder->as() : s->factor(), ParallelType::Serial, rfactor_outer ? IterType::Reduction : IterType::Iteration, true); // broadcast // inner IterDomain - IterDomain* idi = new IterDomain( - new Int(0), + IterDomain* idi = IrBuilder::create( + s->container(), + IrBuilder::create(s->container(), 0), s->innerSplit() ? s->factor() : remainder->as(), ParallelType::Serial, rfactor_inner ? 
IterType::Reduction : IterType::Iteration, true); // Generate the split node - new Split(ido, idi, mapped, s->factor(), s->innerSplit()); + IrBuilder::create( + s->container(), ido, idi, mapped, s->factor(), s->innerSplit()); // Remove mapped id from leaf IDs leaf_ids_.erase(mapped); @@ -115,14 +119,16 @@ class ReplayRFactor : public ReplayTransformations { Val* merged_id_size = mul(id_outer_mapped->extent(), id_inner_mapped->extent()); - IterDomain* merged_id = new IterDomain( - new Int(0), + IterDomain* merged_id = IrBuilder::create( + m->container(), + IrBuilder::create(m->container(), 0), merged_id_size->as(), ParallelType::Serial, rfactor_output ? IterType::Reduction : IterType::Iteration, true); - new Merge(merged_id, id_outer_mapped, id_inner_mapped); + IrBuilder::create( + m->container(), merged_id, id_outer_mapped, id_inner_mapped); // Remove inputs from the leaf IDs leaf_ids_.erase(id_outer_mapped); @@ -238,7 +244,8 @@ TensorDomain* TransformRFactor::runReplay( for (auto id : orig_td_root) { // If this is an rfactor root, it will be a reduction in this stage if (rfactor_root_axes.find(id) != rfactor_root_axes.end()) { - new_root[i] = new IterDomain( + new_root[i] = IrBuilder::create( + id->container(), id->start(), id->extent(), id->stopOffset(), @@ -248,7 +255,8 @@ TensorDomain* TransformRFactor::runReplay( // If this is not an rfactor root, but a reduction root, it should be // turned into an iteration domain } else if (id->isReduction()) { - new_root[i] = new IterDomain( + new_root[i] = IrBuilder::create( + id->container(), id->start(), id->extent(), id->stopOffset(), @@ -296,7 +304,8 @@ TensorDomain* TransformRFactor::runReplay( if (dom->isRFactorProduct()) rfactor_root.push_back(dom); - return new TensorDomain( + return IrBuilder::create( + orig_td->container(), new_root, rfactor_root, new_domain, @@ -400,8 +409,11 @@ TensorDomain* TransformRFactor::runReplay2( } } - return new TensorDomain( - new_root, new_domain, std::vector(new_root.size(), true)); + return IrBuilder::create( + orig_td->container(), + new_root, + new_domain, + std::vector(new_root.size(), true)); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/transform_rfactor.h b/torch/csrc/jit/codegen/cuda/transform_rfactor.h index 551f67905b0..593eb287d0b 100644 --- a/torch/csrc/jit/codegen/cuda/transform_rfactor.h +++ b/torch/csrc/jit/codegen/cuda/transform_rfactor.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/torch/csrc/jit/codegen/cuda/transform_view.cpp b/torch/csrc/jit/codegen/cuda/transform_view.cpp index ea4d188c092..433e34a11eb 100644 --- a/torch/csrc/jit/codegen/cuda/transform_view.cpp +++ b/torch/csrc/jit/codegen/cuda/transform_view.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -44,11 +45,31 @@ class Transform { size_t index() const { return index_; } + + size_t originalIndex() const { + return original_index_; + } + + size_t newIndex() const { + return new_index_; + } + virtual ~Transform() = default; protected: - Transform(size_t index) : index_(index) {} + Transform(const ViewIndexState& state, size_t index) + : index_(index), + original_index_(state.original_view_index), + new_index_(Transform::computeNewIndex(state)) {} + const size_t index_ = 0; + const size_t original_index_ = 0; + const size_t new_index_ = 0; + + static size_t computeNewIndex(const ViewIndexState& state) { + return state.original_view_index - state.trivial_reduction_offset + + state.split_offset - state.merge_offset + state.broadcast_offset; + 
} }; //! Base class for all view tranformations - Merge, Split, Keep @@ -61,9 +82,11 @@ class ViewTransform : public Transform { std::vector& rfactor_domain) = 0; ~ViewTransform() override = default; + virtual bool isOriginalAxisDynamic() const = 0; + protected: ViewTransform(const ViewIndexState& state) - : Transform(ViewTransform::computeIndex(state)) {} + : Transform(state, ViewTransform::computeIndex(state)) {} static size_t computeIndex(const ViewIndexState& state) { return state.original_view_index - state.trivial_reduction_offset; @@ -71,6 +94,7 @@ class ViewTransform : public Transform { }; namespace { +typedef std::vector Sizes; const size_t kEmptyAxis = 0; const size_t kSingletonAxis = 1; @@ -86,6 +110,10 @@ class MergeTransform final : public ViewTransform { << std::endl; } + bool isOriginalAxisDynamic() const override { + return false; + } + void createRfactorDomain( const std::vector& new_root_domain, std::vector& rfactor_domain) override { @@ -108,14 +136,15 @@ class MergeTransform final : public ViewTransform { auto merged_extent = mul(merged_id->extent(), new_root_domain[index_ + 1]->extent()); - auto new_merged_id = new IterDomain( - new Int(0), + auto new_merged_id = IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), merged_extent, ParallelType::Serial, IterType::Iteration, true); - new Merge(new_merged_id, merged_id, new_root_domain[index_ + 1]); + IrBuilder::create( + new_merged_id, merged_id, new_root_domain[index_ + 1]); rfactor_domain.push_back(new_merged_id); } @@ -140,6 +169,10 @@ class SplitTransform final : public ViewTransform { << " ARG: " << split_factor_ << std::endl; } + bool isOriginalAxisDynamic() const override { + return false; + } + void createRfactorDomain( const std::vector& new_root_domain, std::vector& rfactor_domain) override { @@ -150,7 +183,7 @@ class SplitTransform final : public ViewTransform { "\t Domain Size:\t", new_root_domain.size()); - auto factor = new Int(split_factor_); + auto factor = IrBuilder::create(split_factor_); IterDomain* id = nullptr; if (is_last_axis_rfactor_) { @@ -164,18 +197,22 @@ class SplitTransform final : public ViewTransform { Val* remainder = ceilDiv(id->extent(), factor); // outer loop IterDomain - IterDomain* factor_id = new IterDomain( - new Int(0), factor, id->getParallelType(), id->getIterType(), true); + IterDomain* factor_id = IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), + factor, + id->getParallelType(), + id->getIterType(), + true); // inner loop IterDomain - IterDomain* remainder_id = new IterDomain( - new Int(0), + IterDomain* remainder_id = IrBuilder::create( + FusionGuard::getCurFusion()->zeroVal(), remainder->as(), ParallelType::Serial, IterType::Iteration, true); - new Split(factor_id, remainder_id, id, factor, false); + IrBuilder::create(factor_id, remainder_id, id, factor, false); rfactor_domain.push_back(factor_id); rfactor_domain.push_back(remainder_id); @@ -195,6 +232,10 @@ class KeepTransform final : public ViewTransform { output << "Keep Index: " << index_ << std::endl; } + bool isOriginalAxisDynamic() const override { + return true; + } + void createRfactorDomain( const std::vector& new_root_domain, std::vector& rfactor_domain) override { @@ -214,17 +255,11 @@ class KeepTransform final : public ViewTransform { class BroadcastTransform final : public Transform { public: BroadcastTransform(const ViewIndexState& state) - : Transform(BroadcastTransform::computeIndex(state)) {} + : Transform(state, Transform::computeNewIndex(state)) {} void toString(std::stringstream& 
output) const override { output << "Bcast Index: " << index_ << std::endl; } - - private: - static size_t computeIndex(const ViewIndexState& state) { - return state.original_view_index - state.trivial_reduction_offset + - state.split_offset - state.merge_offset + state.broadcast_offset; - } }; //! For any implicit broadcast dimensions in the original view, we remove @@ -232,7 +267,7 @@ class BroadcastTransform final : public Transform { class TrivialReductionTransform final : public Transform { public: TrivialReductionTransform(const ViewIndexState& state) - : Transform(TrivialReductionTransform::computeIndex(state)) {} + : Transform(state, TrivialReductionTransform::computeIndex(state)) {} void toString(std::stringstream& output) const override { output << "1-Red Index: " << index_ << std::endl; @@ -249,10 +284,11 @@ class TrivialReductionTransform final : public Transform { class AnalyzeViewTransformation { public: AnalyzeViewTransformation( - const std::vector root_domain, - const std::vector& original_view, - const std::vector& new_view) - : root_domain_(root_domain), + const Sizes& original_view, + const Sizes& new_view, + std::vector root_domain = {}) + : default_implicit_broadcast_(root_domain.empty()), + root_domain_(root_domain), original_view_(original_view), new_view_(new_view), transform_view_(original_view) { @@ -264,6 +300,24 @@ class AnalyzeViewTransformation { TORCH_INTERNAL_ASSERT(kOriginalNumElements == kNewNumElements); } + AnalyzeViewConstraint constraint() { + findTransformation(); + TORCH_INTERNAL_ASSERT( + validate(), + "Analyze View Transformation failed to find valid transformation.\n", + toString()); + std::vector original_constraint( + original_view_.begin(), original_view_.end()); + std::vector new_constraint(new_view_.begin(), new_view_.end()); + for (auto& vt : view_transforms_) { + if (vt->isOriginalAxisDynamic()) { + original_constraint[vt->originalIndex()] = -1; + new_constraint[vt->newIndex()] = -1; + } + } + return {original_constraint, new_constraint}; + } + AnalyzeViewResult run() { findTransformation(); TORCH_INTERNAL_ASSERT( @@ -382,6 +436,15 @@ class AnalyzeViewTransformation { return true; } + bool isImplicitBroadcast(size_t original_view_index) const { + if (default_implicit_broadcast_) { + return original_view_[original_view_index] == 1; + } else { + TORCH_INTERNAL_ASSERT(!root_domain_.empty()); + return root_domain_[original_view_index]->isImplicitBroadcast(); + } + } + //! This utility class merges a fixed set of axes together //! according to some invariant. Implicit broadcast axes cannot be //! merged with standard iterDomains, so they are handled separately @@ -400,8 +463,7 @@ class AnalyzeViewTransformation { bool any_merge = false; for (size_t idx = 0; idx < num_merge_axes_; ++idx) { - if (avt_->root_domain_[state_.original_view_index] - ->isImplicitBroadcast()) { + if (avt_->isImplicitBroadcast(state_.original_view_index)) { avt_->addTrivialReductionTransform(); } else { avt_->addMergeTransform( @@ -603,9 +665,10 @@ class AnalyzeViewTransformation { std::vector> trivial_reduction_transforms_; + bool default_implicit_broadcast_ = true; const std::vector root_domain_; - const std::vector& original_view_; - const std::vector& new_view_; + const Sizes& original_view_; + const Sizes& new_view_; // transform_view is a mutable view and is initialized with the original_view. // It is used to track the current state of the original tensor domain. 
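A worked example of what the analysis produces (the sizes are arbitrary): viewing an original shape of {2, 3, 4} as {6, -1}

    //   inferNewViewShape: 2 * 3 * 4 = 24 elements; the known new sizes multiply to 6,
    //                      so the -1 slot becomes 24 / 6 = 4   ->  new view {6, 4}
    //   transforms:        Merge(original axes 0 and 1), then Keep(original axis 2)
    //   constraint():      only the Keep axis is dynamic, so it is recorded as -1 in
    //                      both the original and the new constraint vectors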
@@ -622,7 +685,7 @@ class AnalyzeViewTransformation { // If transform size != original size for an axis, then the transformation // uses the last rfactor domain. Otherwise, it is a root domain // transformation. - std::vector transform_view_; + Sizes transform_view_; }; //! Create new TensorDomain with a modified rfactor domain using the specified @@ -644,7 +707,7 @@ TensorDomain* createViewDomain( t->createRfactorDomain(new_root_domain, rfactor_domain); } - return new TensorDomain( + return IrBuilder::create( new_root_domain, rfactor_domain, rfactor_domain, @@ -652,11 +715,19 @@ TensorDomain* createViewDomain( } //! Infer -1 value in new view sizes from original view sizes -std::vector inferNewViewShape( - const std::vector& original_view, +std::pair inferNewViewShape( + const std::vector& original_sizes, const std::vector& new_sizes) { - std::vector new_view(new_sizes.size()); + bool valid_original_sizes = std::all_of( + original_sizes.begin(), original_sizes.end(), [](int64_t dim) { + return dim > 0; + }); + TORCH_INTERNAL_ASSERT(valid_original_sizes); + Sizes original_view(original_sizes.begin(), original_sizes.end()); + Sizes new_view(new_sizes.size()); + + // TODO: refactor int64_t dynamic_index = -1; size_t new_size_num_elements = 1; for (size_t idx = 0; idx < new_sizes.size(); ++idx) { @@ -665,6 +736,7 @@ std::vector inferNewViewShape( dynamic_index == -1, "Only one dimension can by inferred.") dynamic_index = idx; } else { + TORCH_INTERNAL_ASSERT(new_sizes[idx] > 0); new_size_num_elements *= new_sizes[idx]; new_view[idx] = new_sizes[idx]; } @@ -676,7 +748,7 @@ std::vector inferNewViewShape( new_view[dynamic_index] = kNumElements / new_size_num_elements; } - return new_view; + return {original_view, new_view}; } } // namespace @@ -690,22 +762,24 @@ AnalyzeViewResult analyzeView( FUSER_PERF_SCOPE("analyzeView"); TORCH_INTERNAL_ASSERT( tv->getMaybeRFactorDomain().size() == original_sizes.size()); - - bool valid_original_sizes = std::all_of( - original_sizes.begin(), original_sizes.end(), [](int64_t dim) { - return dim > 0; - }); - - TORCH_INTERNAL_ASSERT(valid_original_sizes); - - std::vector original_view( - original_sizes.begin(), original_sizes.end()); - auto new_view = inferNewViewShape(original_view, new_sizes); + auto sizes = inferNewViewShape(original_sizes, new_sizes); AnalyzeViewTransformation analyzer( - tv->getRootDomain(), original_view, new_view); + sizes.first /* original_view */, + sizes.second /* new_view */, + tv->getRootDomain()); return analyzer.run(); } +AnalyzeViewConstraint analyzeViewConstraint( + const std::vector& original_sizes, + const std::vector& new_sizes) { + FUSER_PERF_SCOPE("analyzeViewConstraint"); + auto sizes = inferNewViewShape(original_sizes, new_sizes); + AnalyzeViewTransformation analyzer( + sizes.first /* original_view */, sizes.second /* new_view */); + return analyzer.constraint(); +} + //! Create new TensorDomain with a modified rfactor domain using the specified //! 
view transformations TensorDomain* transformView( diff --git a/torch/csrc/jit/codegen/cuda/transform_view.h b/torch/csrc/jit/codegen/cuda/transform_view.h index e7473a1b9b4..f8a986048be 100644 --- a/torch/csrc/jit/codegen/cuda/transform_view.h +++ b/torch/csrc/jit/codegen/cuda/transform_view.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -40,6 +40,11 @@ struct AnalyzeViewResult { std::vector> transforms; }; +struct AnalyzeViewConstraint { + std::vector original_constraint; + std::vector new_constraint; +}; + // Find the transformations necessary to convert TensorView // from original size to new size. AnalyzeViewResult analyzeView( @@ -47,6 +52,11 @@ AnalyzeViewResult analyzeView( const std::vector& original_sizes, const std::vector& new_sizes); +// Find the constraints derived from the view transformations +AnalyzeViewConstraint analyzeViewConstraint( + const std::vector& original_sizes, + const std::vector& new_sizes); + // Generate a new TensorDomain from the given view transformations. // The original root domain is kept in the new TensorDomain, // but a new rfactor domain is created from the view transformations. diff --git a/torch/csrc/jit/codegen/cuda/type.cpp b/torch/csrc/jit/codegen/cuda/type.cpp index 3afb1b540b8..e883421eb1e 100644 --- a/torch/csrc/jit/codegen/cuda/type.cpp +++ b/torch/csrc/jit/codegen/cuda/type.cpp @@ -87,6 +87,9 @@ ValType promote_type(const ValType& t1, const ValType& t2) { (t1 == ValType::Scalar || t1 == ValType::NamedScalar)) { return ValType::Scalar; } + if (t1 == ValType::NamedScalar && t2 == ValType::NamedScalar) { + return ValType::Scalar; + } TORCH_CHECK(false, "Expected promotable ValTypes but got: ", t1, " and ", t2); } @@ -107,7 +110,7 @@ static const char* data_type2string(DataType t) { case DataType::Int32: return "int"; case DataType::Null: - return "nullptr"; + return "null_type"; default: break; } @@ -127,6 +130,10 @@ static const char* val_type2string(ValType t) { return "Scalar"; case ValType::NamedScalar: return "NamedScalar"; + case ValType::Predicate: + return "Predicate"; + case ValType::TensorIndex: + return "TensorIndex"; default: TORCH_INTERNAL_ASSERT(false, "No string found for val type."); } @@ -144,12 +151,38 @@ static const char* expr_type2string(ExprType t) { return "ReductionOp"; case ExprType::BroadcastOp: return "BroadcastOp"; + case ExprType::WelfordOp: + return "WelfordOp"; + case ExprType::TransposeOp: + return "TransposeOp"; case ExprType::ShiftOp: return "ShiftOp"; + case ExprType::GatherOp: + return "GatherOp"; + case ExprType::ViewOp: + return "ViewOp"; case ExprType::Split: return "Split"; case ExprType::Merge: return "Merge"; + case ExprType::Allocate: + return "Allocate"; + case ExprType::Sync: + return "Sync"; + case ExprType::InitMagicZero: + return "InitMagicZero"; + case ExprType::UpdateMagicZero: + return "UpdateMagicZero"; + case ExprType::ForLoop: + return "ForLoop"; + case ExprType::IfThenElse: + return "IfThenElse"; + case ExprType::GridReduction: + return "GridReduction"; + case ExprType::GridBroadcast: + return "GridBroadcast"; + case ExprType::GridWelford: + return "GridWelford"; default: TORCH_INTERNAL_ASSERT(false, "No string found for expr type."); } @@ -281,7 +314,6 @@ bool needFloatSuffix(BinaryOpType t) { case BinaryOpType::Atan2: case BinaryOpType::Div: case BinaryOpType::Fmod: - case BinaryOpType::Pow: return true; default: return false; @@ -522,6 +554,7 @@ static const char* supported_casts2string( case supported_switch_pair(DataType::Float, DataType::Int): case 
supported_switch_pair(DataType::Double, DataType::Int): return "(int64_t)"; + case supported_switch_pair(DataType::Int, DataType::Int32): case supported_switch_pair(DataType::Float, DataType::Int32): case supported_switch_pair(DataType::Double, DataType::Int32): return "(int32_t)"; diff --git a/torch/csrc/jit/codegen/cuda/type.h b/torch/csrc/jit/codegen/cuda/type.h index 43aadb62006..ea7e8bd04d3 100644 --- a/torch/csrc/jit/codegen/cuda/type.h +++ b/torch/csrc/jit/codegen/cuda/type.h @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include @@ -32,6 +32,8 @@ enum class ValType { TensorView, Scalar, NamedScalar, + Predicate, + TensorIndex, }; // Manual - The user provides the Bool value. Predicate generation is bypassed. @@ -73,6 +75,15 @@ enum class ExprType { ViewOp, Split, Merge, + Allocate, + Sync, + InitMagicZero, + UpdateMagicZero, + ForLoop, + IfThenElse, + GridReduction, + GridBroadcast, + GridWelford, }; enum class UnaryOpType { @@ -257,8 +268,11 @@ std::string stringifyThread(const ParallelType); std::string typePrefix(const DataType); // TODO: ThreadDim should be BlockDim and BlockDim should be GridDim +// Returns if parallel type is TID[x, y, z] TORCH_CUDA_CU_API bool isParallelTypeThreadDim(ParallelType); +// Returns if parallel type is BID[x, y, z] TORCH_CUDA_CU_API bool isParallelTypeBlockDim(ParallelType); +// Returns if parallel type is a grid or block parallelization dimension TORCH_CUDA_CU_API bool isParallelTypeThread(ParallelType); TORCH_CUDA_CU_API bool isParallelTypeVectorize(ParallelType); diff --git a/torch/csrc/jit/codegen/cuda/type_inference.cpp b/torch/csrc/jit/codegen/cuda/type_inference.cpp index 8c7d7d36a06..a8facc6a45b 100644 --- a/torch/csrc/jit/codegen/cuda/type_inference.cpp +++ b/torch/csrc/jit/codegen/cuda/type_inference.cpp @@ -141,6 +141,7 @@ class NaiveTypePropagator { } // binary operations that forward meta info and broadcast shape: case aten::gelu_backward: + case aten::tanh_backward: case aten::mul: case aten::div: case aten::min: @@ -414,19 +415,14 @@ class NaiveTypePropagator { node->output()->setType(out_type->withDim(c10::nullopt)); break; } - /* - // TODO: Enable view in parser by detecting non-alias view operation - case aten::view: - case aten::reshape: { + case prim::unsqueeze_copy: + case prim::squeeze_copy: + case prim::reshape_copy: + case prim::view_copy: { auto out_type = node->input(0)->type()->cast(); - auto size_optional = constant_as>(node->input(1)); - TORCH_INTERNAL_ASSERT( - size_optional.has_value(), "The size parameter is required."); - auto new_size = size_optional->vec(); - node->output()->setType(out_type->withSizes(new_size)); + node->output()->setType(out_type); break; } - */ case aten::type_as: { const auto type0 = getInputTensorType(node, 0); const auto type1 = getInputTensorType(node, 1); diff --git a/torch/csrc/jit/codegen/cuda/type_promotion.cpp b/torch/csrc/jit/codegen/cuda/type_promotion.cpp index 016e8825acf..68a38e67378 100644 --- a/torch/csrc/jit/codegen/cuda/type_promotion.cpp +++ b/torch/csrc/jit/codegen/cuda/type_promotion.cpp @@ -55,13 +55,14 @@ at::native::ResultTypeState updateResultTypeState( TORCH_INTERNAL_ASSERT( !c10::isComplexType(scalar), "NvFuser does not support complex data types."); + at::native::ResultTypeState new_state = in_state; c10::ScalarType current = scalar; if (c10::isFloatingType(scalar)) { current = c10::typeMetaToScalarType(at::get_default_dtype()); } new_state.wrappedResult = - promoteTypesSkipUndefined(in_state.wrappedResult, scalar); + 
promoteTypesSkipUndefined(in_state.wrappedResult, current); return new_state; } @@ -195,11 +196,16 @@ std::vector promoteValues( Val* optionalCast(DataType dtype, Val* v) { TORCH_INTERNAL_ASSERT(v->getDataType().has_value()); + // Avoid casting Float/Int scalar to any corresponding FloatingPoint/Integral + // type in fusion. Instead, we cast them directly. The exception is Bool, + // which is always casted to the desired type. const bool kSameDtype = v->getDataType().value() == dtype; const bool kIsScalarFloat = !v->isA() && isFloatingPointType(dtype); + const bool kIsScalarInt = !v->isA() && isIntegralType(dtype); if (kSameDtype || - (kIsScalarFloat && isFloatingPointType(v->getDataType().value()))) { + (kIsScalarFloat && isFloatingPointType(v->getDataType().value())) || + (kIsScalarInt && isIntegralType(v->getDataType().value()))) { return v; } else { return castOp(dtype, v); diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp index 67c8359b502..127078b45f7 100644 --- a/torch/csrc/jit/codegen/cuda/utils.cpp +++ b/torch/csrc/jit/codegen/cuda/utils.cpp @@ -143,6 +143,19 @@ void debugPrint(const c10::TensorTypePtr& type) { } #pragma clang diagnostic pop +bool is_cpu_scalar(const at::Tensor& tensor) { + return tensor.device().is_cpu() && tensor.numel() == 1 && tensor.dim() == 0; +} + +bool is_cpu_scalar(const c10::TensorType& tensor_type) { + auto opt_device = tensor_type.device(); + auto opt_dim = tensor_type.dim(); + auto opt_numel = tensor_type.numel(); + return opt_device.has_value() && opt_device.value().is_cpu() && + opt_dim.has_value() && opt_numel.has_value() && opt_dim.value() == 0 && + opt_numel.value() == 1; +} + bool isDebugDumpEnabled(DebugDumpOption option) { const static auto dump_options = parseDebugDumpOptions(); return dump_options.at(option); @@ -158,6 +171,14 @@ bool disableRNGUnrolling() { return disable_rng_unroll ? atoi(disable_rng_unroll) : false; } +std::vector getTensorSizes(TensorTypePtr const& tensor_type) { + TORCH_INTERNAL_ASSERT(tensor_type != nullptr, "Input must be a Tensor."); + auto optional_sizes = tensor_type->sizes().concrete_sizes(); + TORCH_INTERNAL_ASSERT( + optional_sizes.has_value(), "Missing size information for the tensor."); + return optional_sizes.value(); +} + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h index f56d0f8d52e..c035cdeae24 100644 --- a/torch/csrc/jit/codegen/cuda/utils.h +++ b/torch/csrc/jit/codegen/cuda/utils.h @@ -1,7 +1,8 @@ #pragma once -#include +#include #include +#include namespace torch { namespace jit { @@ -10,6 +11,9 @@ namespace cuda { void debugPrint(const c10::TensorTypePtr& type); +bool is_cpu_scalar(const at::Tensor& tensor); +bool is_cpu_scalar(const c10::TensorType& tensor_type); + //! Types of debug print-outs //! //! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable @@ -116,6 +120,8 @@ constexpr unsigned int switch_pair(T t1, T t2) { return ((unsigned int)t1 << _WORD_SHIFT) + (unsigned int)t2; } +std::vector getTensorSizes(TensorTypePtr const& tensor_type); + } // namespace cuda } // namespace fuser } // namespace jit diff --git a/torch/csrc/jit/ir/alias_analysis.h b/torch/csrc/jit/ir/alias_analysis.h index fa840621682..0129938568c 100644 --- a/torch/csrc/jit/ir/alias_analysis.h +++ b/torch/csrc/jit/ir/alias_analysis.h @@ -152,11 +152,11 @@ class AliasDb { * this. */ // Copy `existing`s aliasing info to `new_value`, and remove `existing`. 
- void replaceWithNewValue(Value* existing, Value* new_value); + TORCH_API void replaceWithNewValue(Value* existing, Value* new_value); // Copy `from`s aliasing info to `to`. - void copyValue(Value* from, Value* to); + TORCH_API void copyValue(Value* from, Value* to); // Create a new `value` that does not alias anything else. - void createValue(const Value* value); + TORCH_API void createValue(const Value* value); // Enable more precise treatment of prim::TupleConstruct. void enablePreciseTupleContainerAnalysis(); diff --git a/torch/jit/_script.py b/torch/jit/_script.py index 9ad8934d55b..ca07e63cb61 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -1308,6 +1308,10 @@ def script(obj, optimize=None, _frames_up=0, _rcb=None, obj = obj.__original_fn _rcb = _jit_internal.createResolutionCallbackFromClosure(obj) + # some functions are explicitly marked as not supported in script mode + if hasattr(obj, "__script_unsupported"): + raise RuntimeError("TorchScript error: " + obj.__script_unsupported) + _check_directly_compile_overloaded(obj) maybe_already_compiled_fn = _try_get_jit_cached_function(obj) if maybe_already_compiled_fn: From 6f29313e38881edeb8e877ed40626f56cb00c34a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Feb 2022 17:29:31 -0800 Subject: [PATCH 026/199] [PyTorch] Use DimVector in at::matmul (#72230) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72230 Here's a small PR that only fixes the extra heap allocations for shapes. Hopefully won't get stuck like #64387. ghstack-source-id: 149069527 Test Plan: CI Reviewed By: ngimel Differential Revision: D33962610 fbshipit-source-id: 51e200f5237bdf225bfb2445e1e36bacd0e65e92 (cherry picked from commit 027537f32965d23fc78a36fec71be41cd5cbce3d) --- aten/src/ATen/native/LinearAlgebra.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index aed94f10705..f43c52576d0 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1627,8 +1627,7 @@ Tensor matmul( Tensor t2 = dim_tensor2 == 1 ? tensor2.unsqueeze(-1) : tensor2; auto size1 = tensor1.sizes(); auto size2 = t2.sizes(); - std::vector output_size; - output_size.insert(output_size.end(), size1.begin(), size1.end() - 1); + DimVector output_size(size1.begin(), size1.end() - 1); if (dim_tensor2 > 1) { output_size.push_back(size2[dim_tensor2 - 1]); } @@ -1660,7 +1659,8 @@ Tensor matmul( return has_out ? out.set_(res) : res; } else { - std::vector shape = tensor2.sizes().slice(0, dim_tensor2 - 2).vec(); + c10::IntArrayRef shape_array = tensor2.sizes().slice(0, dim_tensor2 - 2); + DimVector shape(shape_array.begin(), shape_array.end()); shape.push_back(p); Tensor res = res_T.reshape(shape).contiguous(); @@ -1677,29 +1677,29 @@ Tensor matmul( IntArrayRef batch_tensor2(tensor2.sizes().data(), std::max(dim_tensor2 - 2, 0)); // expand the batch portion (i.e. 
cut off matrix dimensions and expand rest) - std::vector expand_batch_portion = infer_size(batch_tensor1, batch_tensor2); + DimVector expand_batch_portion = infer_size_dimvector(batch_tensor1, batch_tensor2); - std::vector tensor1_expand_size(expand_batch_portion); - tensor1_expand_size.insert(tensor1_expand_size.end(), {n, m1}); + DimVector tensor1_expand_size(expand_batch_portion); + tensor1_expand_size.push_back(n); + tensor1_expand_size.push_back(m1); - std::vector tensor2_expand_size(expand_batch_portion); - tensor2_expand_size.insert(tensor2_expand_size.end(), {m2, p}); + DimVector tensor2_expand_size(expand_batch_portion); + tensor2_expand_size.push_back(m2); + tensor2_expand_size.push_back(p); const int64_t expand_batch_product = c10::multiply_integers(expand_batch_portion); - std::vector tensor1_bmm_view({expand_batch_product}); - tensor1_bmm_view.insert(tensor1_bmm_view.end(), {n, m1}); + std::array tensor1_bmm_view = {expand_batch_product, n, m1}; - std::vector tensor2_bmm_view({expand_batch_product}); - tensor2_bmm_view.insert(tensor2_bmm_view.end(), {m2, p}); + std::array tensor2_bmm_view = {expand_batch_product, m2, p}; // flatten expanded batches Tensor tensor1_expanded = tensor1.expand(tensor1_expand_size).reshape(tensor1_bmm_view); Tensor tensor2_expanded = tensor2.expand(tensor2_expand_size).reshape(tensor2_bmm_view); // reshape batches back into result - std::vector output_shape(expand_batch_portion); + DimVector output_shape(expand_batch_portion); if (dim_tensor1 > 1) { output_shape.push_back(n); } From f2f4847e160396069e854c670448234df4014aba Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Feb 2022 18:12:50 -0800 Subject: [PATCH 027/199] [PyTorch] MHA: add debug shape checks (#72457) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72457 No cost to adding this basic self-check on our operations. ghstack-source-id: 149067335 Test Plan: CI Reviewed By: zrphercule Differential Revision: D33954672 fbshipit-source-id: f57b3c2463db403431f884db56063cec2ca93ef2 (cherry picked from commit 3fb79f3cf8e2de8cfc4d9efc1cec0f9ed7a7ecd3) --- aten/src/ATen/native/attention.cpp | 50 +++++++++++++++++++++++--- aten/src/ATen/native/cuda/attention.cu | 48 +++++++++++++++++++++++-- 2 files changed, 91 insertions(+), 7 deletions(-) diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp index 599f0f866e2..24797ade6b9 100644 --- a/aten/src/ATen/native/attention.cpp +++ b/aten/src/ATen/native/attention.cpp @@ -185,6 +185,8 @@ Tensor bmm_nn(const Tensor& a, const Tensor& b) { Tensor transform_0213(const Tensor& a) { // TODO: check perf vs dedicated kernel. 
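For orientation, transform_0213 is the head-merge step of multi-head attention: it takes the attention context laid out as [B, num_head, T, dim_per_head], permutes it to [B, T, num_head, dim_per_head], and collapses the last two dims into D = num_head * dim_per_head. With illustrative sizes:

    //   B = 2, num_head = 4, T = 5, dim_per_head = 8   (so D = 32)
    //   [2, 4, 5, 8] --permute(0,2,1,3)--> [2, 5, 4, 8] --view--> [2, 5, 32]

The debug_assert_shape checks added in this commit assert exactly these intermediate shapes throughout multi_head_self_attention_cpu.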
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1)); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3)); return a.permute({0, 2, 1, 3}) .contiguous() .view({a.size(0), a.size(2), a.size(1) * a.size(3)}); @@ -196,6 +198,13 @@ Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) { return r_.view({a.size(0), a.size(1), r_.size(1)}); } +void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim()); + for (auto idx : c10::irange(shape.size())) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]); + } +} + } // namespace Tensor multi_head_self_attention_cpu( @@ -209,30 +218,63 @@ Tensor multi_head_self_attention_cpu( // query shape: [B, T, D] // qkv_weight shape: [3 * D, D] + const auto D = query.sizes()[2]; + + TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor"); + TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor"); + TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query"); + TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal"); + TORCH_CHECK(D % num_head == 0, "D must divide evenly by num_head"); + +#ifndef NDEBUG + const auto B = query.sizes()[0]; + const auto T = query.sizes()[1]; + const auto dim_per_head = D / num_head; +#endif + // shape: [B, T, 3 x D] auto qkv = gemm_nt(query, qkv_weight); +#ifndef NDEBUG + debug_assert_shape(qkv, {B, T, 3 * D}); +#endif // shape: 3 x [B, num_head, T, dim_per_head] auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - auto q = std::get<0>(q_k_v); - auto k = std::get<1>(q_k_v); - auto v = std::get<2>(q_k_v); + const auto& q = std::get<0>(q_k_v); + const auto& k = std::get<1>(q_k_v); + const auto& v = std::get<2>(q_k_v); +#ifndef NDEBUG + debug_assert_shape(q, {B, num_head, T, dim_per_head}); + debug_assert_shape(k, {B, num_head, T, dim_per_head}); + debug_assert_shape(v, {B, num_head, T, dim_per_head}); +#endif // shape: [B, num_head, T, T] auto qkt = bmm_nt(q, k); +#ifndef NDEBUG + debug_assert_shape(qkt, {B, num_head, T, T}); +#endif // shape: [B, num_head, T, T] masked_softmax_dropout(qkt, mask); // shape: [B, num_head, T, dim_per_head] auto attn_ctx = bmm_nn(qkt, v); +#ifndef NDEBUG + debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head}); +#endif // shape: [B, T, D] auto attn = transform_0213(attn_ctx); +#ifndef NDEBUG + debug_assert_shape(attn, {B, T, D}); +#endif // shape: [B, T, D] auto proj = gemm_nt_bias(attn, proj_weight, proj_bias); - +#ifndef NDEBUG + debug_assert_shape(proj, {B, T, D}); +#endif return proj; } diff --git a/aten/src/ATen/native/cuda/attention.cu b/aten/src/ATen/native/cuda/attention.cu index 8dad56fac0e..6e2dbe4e9b4 100644 --- a/aten/src/ATen/native/cuda/attention.cu +++ b/aten/src/ATen/native/cuda/attention.cu @@ -209,6 +209,14 @@ Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) { return r_.view({a.size(0), a.size(1), r_.size(1)}); } +void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim()); + for (auto idx : c10::irange(shape.size())) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be 
", shape[idx], " but got ", t.sizes()[idx]); + } +} + + } // namespace Tensor multi_head_self_attention_cuda( @@ -222,29 +230,63 @@ Tensor multi_head_self_attention_cuda( // query shape: [B, T, D] // qkv_weight shape: [3 * D, D] + const auto D = query.sizes()[2]; + + TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor"); + TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor"); + TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query"); + TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal"); + TORCH_CHECK(D % num_head == 0, "D must divide evenly by num_head"); + +#ifndef NDEBUG + const auto B = query.sizes()[0]; + const auto T = query.sizes()[1]; + const auto dim_per_head = D / num_head; +#endif + // shape: [B, T, 3 x D] auto qkv = gemm_nt(query, qkv_weight); +#ifndef NDEBUG + debug_assert_shape(qkv, {B, T, 3 * D}); +#endif // shape: 3 x [B, num_head, T, dim_per_head] auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - auto q = std::get<0>(q_k_v); - auto k = std::get<1>(q_k_v); - auto v = std::get<2>(q_k_v); + const auto& q = std::get<0>(q_k_v); + const auto& k = std::get<1>(q_k_v); + const auto& v = std::get<2>(q_k_v); +#ifndef NDEBUG + debug_assert_shape(q, {B, num_head, T, dim_per_head}); + debug_assert_shape(k, {B, num_head, T, dim_per_head}); + debug_assert_shape(v, {B, num_head, T, dim_per_head}); +#endif // shape: [B, num_head, T, T] auto qkt = bmm_nt(q, k); +#ifndef NDEBUG + debug_assert_shape(qkt, {B, num_head, T, T}); +#endif // shape: [B, num_head, T, T] masked_softmax_dropout(qkt, mask); // shape: [B, num_head, T, dim_per_head] auto attn_ctx = bmm_nn(qkt, v); +#ifndef NDEBUG + debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head}); +#endif // shape: [B, T, D] auto attn = transform_0213(attn_ctx); +#ifndef NDEBUG + debug_assert_shape(attn, {B, T, D}); +#endif // shape: [B, T, D] auto proj = gemm_nt_bias(attn, proj_weight, proj_bias); +#ifndef NDEBUG + debug_assert_shape(proj, {B, T, D}); +#endif return proj; } From d707114a2f376c15874d3ca6d07455f1c6f8f242 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Feb 2022 18:12:50 -0800 Subject: [PATCH 028/199] [PyTorch] Fix ASAN errors in CPU MHA impl (#72458) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72458 Now ASAN stops flagging errors during the MHA test (though the results are still incorrect). 
ghstack-source-id: 149067334 Test Plan: buck test mode/dbg-asan-ubsan mode/no-gpu //caffe2/test:nn -- -r test_native_multihead_attention_cpu_float32 Reviewed By: zrphercule Differential Revision: D33956797 fbshipit-source-id: 6824987f8a9ad55c4409eda5b3a2a5cbc535312f (cherry picked from commit f5a9dfdff9ac121b36758a7a34eca40875b85bb0) --- aten/src/ATen/native/attention.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp index 24797ade6b9..b9d34e3d98f 100644 --- a/aten/src/ATen/native/attention.cpp +++ b/aten/src/ATen/native/attention.cpp @@ -79,17 +79,17 @@ std::tuple transform_bias_rescale_qkv( q_data.store(&q_k_v_data [0 * B * num_head * T * dim_per_head + b * num_head * T * dim_per_head + - num_head * T * dim_per_head + + nh * T * dim_per_head + t * dim_per_head + dh]); k_data.store(&q_k_v_data [1 * B * num_head * T * dim_per_head + b * num_head * T * dim_per_head + - num_head * T * dim_per_head + + nh * T * dim_per_head + t * dim_per_head + dh]); v_data.store(&q_k_v_data [2 * B * num_head * T * dim_per_head + b * num_head * T * dim_per_head + - num_head * T * dim_per_head + + nh * T * dim_per_head + t * dim_per_head + dh]); } } @@ -134,7 +134,7 @@ void masked_softmax_dropout( using Vec = vec::Vectorized; auto V = vec::Vectorized::size(); - scalar_t* input_data = attn_scores_data + i * T; + scalar_t* input_data = attn_scores_data + i; auto max_input = Vec(std::numeric_limits::lowest()); // TODO: handle epilogue for (auto t = 0; t < T; t += V) { From f5d078088bca8e2df823d6219a22d774f6cd4c75 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Feb 2022 18:12:50 -0800 Subject: [PATCH 029/199] [PyTorch] MHA: fix dim_per_head / V bug (#72459) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72459 This was pointed out in a comment on the original diff, but not fixed. ghstack-source-id: 149067331 Test Plan: cosine similarity with the existing MHA impl result on CPU + float32 goes from 0.2457 to 0.5097 Reviewed By: zrphercule Differential Revision: D33987869 fbshipit-source-id: b560ade85f577e83bcaf5b37da2e89d8646d5909 (cherry picked from commit 47511a2138a35b5e71ef3562a6e93cb59d965ab2) --- aten/src/ATen/native/attention.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp index b9d34e3d98f..a0c5e0ecb7a 100644 --- a/aten/src/ATen/native/attention.cpp +++ b/aten/src/ATen/native/attention.cpp @@ -57,7 +57,8 @@ std::tuple transform_bias_rescale_qkv( using Vec = vec::Vectorized; auto V = vec::Vectorized::size(); // TODO: handle epilogue - for (auto dh = 0; dh < dim_per_head / V; dh += V) { + TORCH_INTERNAL_ASSERT(dim_per_head % V == 0, "epilogue not implemented yet"); + for (auto dh = 0; dh < dim_per_head; dh += V) { auto d = nh * dim_per_head + dh; // load auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]); From 49611a33297140a9c736b8630142af8438c526ac Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Feb 2022 18:12:50 -0800 Subject: [PATCH 030/199] [PyTorch] MHA: simplify gemm_nt (#72460) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72460 Just call existing matmul (which, IIUC, handles batching itself) rather than doing a few view ops. (Please let me know if this is actually a bad idea and why!) 
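As a quick illustration (not part of this diff; shapes are assumed), the two forms agree for the 3-D-by-2-D case gemm_nt handles:

```python
import torch

a = torch.randn(2, 3, 4)   # [B, T, D], toy shapes
b = torch.randn(5, 4)      # [out_features, D]

# Previous form: flatten the batch, run one 2-D GEMM, reshape back.
old = (a.view(a.size(0) * a.size(1), a.size(2)) @ b.t()).view(a.size(0), a.size(1), b.size(0))
# New form: matmul broadcasts the 2-D operand over the leading batch dim.
new = torch.matmul(a, b.t())

assert torch.allclose(old, new)
```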
ghstack-source-id: 149067333 Test Plan: CI Reviewed By: ngimel Differential Revision: D33961843 fbshipit-source-id: ace37ad3110e1134db6c8b638ae302f0d556e00a (cherry picked from commit 258231c0f951bd701da179eaedc1ef795416c53f) --- aten/src/ATen/native/attention.cpp | 5 +---- aten/src/ATen/native/cuda/attention.cu | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp index a0c5e0ecb7a..d90911852f6 100644 --- a/aten/src/ATen/native/attention.cpp +++ b/aten/src/ATen/native/attention.cpp @@ -14,10 +14,7 @@ namespace native { namespace { Tensor gemm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto b_ = b.transpose(1, 0); - auto c_ = at::native::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), b.size(0)}); + return at::native::matmul(a, b.t()); } // compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias diff --git a/aten/src/ATen/native/cuda/attention.cu b/aten/src/ATen/native/cuda/attention.cu index 6e2dbe4e9b4..0b9414f3b67 100644 --- a/aten/src/ATen/native/cuda/attention.cu +++ b/aten/src/ATen/native/cuda/attention.cu @@ -23,10 +23,7 @@ namespace native { namespace { Tensor gemm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto b_ = b.transpose(1, 0); - auto c_ = at::native::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), b.size(0)}); + return at::native::matmul(a, b.t()); } template From ee79e4c6b298a9515dc81bf574920c7926920e94 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Feb 2022 18:12:50 -0800 Subject: [PATCH 031/199] [PyTorch] MHA: guard epilogue TODOs w/checks & implement 1 (#72461) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72461 TODO in the CPU implementation about missing epilogues on vectorized loops. 
Guard some and implement the one that was failing ghstack-source-id: 149067336 Test Plan: cosine similarity w/existing impl is unchanged for the CPU implementation (which is surprising; should expect improvement IIUC) Reviewed By: zrphercule, ngimel Differential Revision: D33988259 fbshipit-source-id: 72739b7ea210c6e51a76f356a77e49ea00095f49 (cherry picked from commit e1ea8aa405fba4b19d6549bca19a79b5e7841049) --- aten/src/ATen/native/attention.cpp | 32 +++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp index d90911852f6..9895b9cd125 100644 --- a/aten/src/ATen/native/attention.cpp +++ b/aten/src/ATen/native/attention.cpp @@ -53,9 +53,8 @@ std::tuple transform_bias_rescale_qkv( auto b = i; using Vec = vec::Vectorized; auto V = vec::Vectorized::size(); - // TODO: handle epilogue - TORCH_INTERNAL_ASSERT(dim_per_head % V == 0, "epilogue not implemented yet"); - for (auto dh = 0; dh < dim_per_head; dh += V) { + auto dh = 0; + for (; dh < dim_per_head; dh += V) { auto d = nh * dim_per_head + dh; // load auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]); @@ -90,6 +89,30 @@ std::tuple transform_bias_rescale_qkv( nh * T * dim_per_head + t * dim_per_head + dh]); } + if (dh != dim_per_head) { + for (dh = std::max(0, dh - V); dh < dim_per_head; dh++) { + auto d = nh * dim_per_head + dh; + auto q_bias = qkv_bias_data[d + 0 * D]; + auto k_bias = qkv_bias_data[d + 1 * D]; + auto v_bias = qkv_bias_data[d + 2 * D]; + auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias; + auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias; + auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias; + q_data = q_data / sqrt_dim_per_head; + q_k_v_data[0 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + + t * dim_per_head + dh] = q_data; + q_k_v_data[1 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + + t * dim_per_head + dh] = k_data; + q_k_v_data[2 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + + t * dim_per_head + dh] = v_data; + } + } } }); }); @@ -135,6 +158,7 @@ void masked_softmax_dropout( scalar_t* input_data = attn_scores_data + i; auto max_input = Vec(std::numeric_limits::lowest()); // TODO: handle epilogue + TORCH_CHECK(T % V == 0, "epilogue not implemented yet"); for (auto t = 0; t < T; t += V) { auto v = Vec::loadu(&input_data[t]); max_input = vec::maximum(max_input, v); @@ -145,6 +169,7 @@ void masked_softmax_dropout( hmax = std::max(max_input[i], hmax); } accscalar_t hsum = 0; + TORCH_CHECK(T % V == 0, "epilogue not implemented yet"); for (auto t = 0; t < T; t += V) { auto v = Vec::loadu(&input_data[t]); // TODO: vectorize in accscalar_t? @@ -153,6 +178,7 @@ void masked_softmax_dropout( } } auto inv_denominator = 1.0 / hsum; + TORCH_CHECK(T % V == 0, "epilogue not implemented yet"); for (auto t = 0; t < T; t += V) { Vec v = Vec::loadu(&input_data[t]); From 00769060bc610dd40a92faef3b1994c2716ce2fe Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Feb 2022 18:12:50 -0800 Subject: [PATCH 032/199] [PyTorch] MHA: just use existing softmax on CPU (#72462) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72462 Eliminating one potential source of bugs. 
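The rewrite relies on the fact that, with no mask (the dropout path aside), the hand-rolled max/exp/normalize loop is just a softmax over the last dimension. A minimal sketch of that invariant, with assumed shapes:

```python
import torch

scores = torch.randn(2, 4, 6, 6)  # [B, num_head, T, T], toy shapes

# Hand-rolled, numerically stabilized softmax over the last dim.
shifted = scores - scores.amax(dim=-1, keepdim=True)
manual = shifted.exp()
manual = manual / manual.sum(dim=-1, keepdim=True)

assert torch.allclose(manual, torch.softmax(scores, dim=-1), atol=1e-6)
```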
ghstack-source-id: 149067329 Test Plan: CI Reviewed By: zrphercule Differential Revision: D34006432 fbshipit-source-id: 55fda186636dc457db7f3f9c8e18f1627ff33b6a (cherry picked from commit 5d8de9a12200db236d0fedfd3b13b1209fd4bc18) --- aten/src/ATen/native/attention.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp index 9895b9cd125..38db814ceba 100644 --- a/aten/src/ATen/native/attention.cpp +++ b/aten/src/ATen/native/attention.cpp @@ -131,13 +131,16 @@ Tensor bmm_nt(const Tensor& a, const Tensor& b) { } void masked_softmax_dropout( - const Tensor& attn_scores, + Tensor& attn_scores, const c10::optional& attn_mask) { auto B = attn_scores.size(0); auto num_heads = attn_scores.size(1); auto T = attn_scores.size(2); if (attn_mask) { TORCH_CHECK(attn_mask->is_contiguous()); + } else { + at::_softmax_out(attn_scores, attn_scores, 3, false); + return; } AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::Half, From d2d982c739c9f7ea82468d1abd6a97385360d7f9 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Mon, 14 Feb 2022 18:12:50 -0800 Subject: [PATCH 033/199] [PyTorch] Fix MHA grain size computation (#72463) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72463 maxing with 1 makes a lot more sense to me than minning with 1, but I have no idea what I'm doing. ghstack-source-id: 149067332 Test Plan: CI Reviewed By: zrphercule Differential Revision: D33990633 fbshipit-source-id: c706148c357473c929020f5dc65cc5050611af8f (cherry picked from commit 2adf3be11a59387bbab7fc73da236ab5fff7be9c) --- aten/src/ATen/native/attention.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp index 38db814ceba..26dca7ed4ed 100644 --- a/aten/src/ATen/native/attention.cpp +++ b/aten/src/ATen/native/attention.cpp @@ -42,7 +42,7 @@ std::tuple transform_bias_rescale_qkv( const scalar_t sqrt_dim_per_head = std::sqrt(static_cast(dim_per_head)); int64_t grain_size = - std::min(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1); + std::max(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1); parallel_for( 0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) { for (auto i : c10::irange(begin, end)) { From a482aeb0ce034ac9dac1953146772c569c2e6c0a Mon Sep 17 00:00:00 2001 From: Pavithran Ramachandran Date: Mon, 14 Feb 2022 19:42:44 -0800 Subject: [PATCH 034/199] [PyTorchEdge] backport v8 to v7 to support promoted ops as instruction (#71662) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/71662 backport v8 to v7 to support promoted ops as instruction a flag to help export as instruction from v8 and export as operators for v7 and below Test Plan: ``` buck test caffe2/test/cpp/jit:jit -- LiteInterpreterTest.BackPortByteCodeModelAllVersions Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/5629499620570927 ✓ ListingSuccess: caffe2/test/cpp/jit:jit : 461 tests discovered (15.693) ✓ Pass: caffe2/test/cpp/jit:jit - LiteInterpreterTest.BackPortByteCodeModelAllVersions (2.712) Summary Pass: 1 ListingSuccess: 1 If you need help understanding your runs, please follow the wiki: https://fburl.com/posting_in_tpx_users Finished test run: https://www.internalfb.com/intern/testinfra/testrun/5629499620570927 ``` ``` buck run mode/opt 
//caffe2/torch/fb/mobile/upgrader_codegen:upgrader_codegen buck test mode/opt //caffe2/test:upgrader_codegen -- mobile.test_upgrader_codegen.TestLiteScriptModule Parsing buck files: finished in 0.8 sec Downloaded 0/2 artifacts, 0.00 bytes, 100.0% cache miss (for updated rules) Building: finished in 01:39.4 min (100%) 11031/11031 jobs, 2/11031 updated Total time: 01:40.2 min More details at https://www.internalfb.com/intern/buck/build/a8b0e417-019c-44ba-be6b-23379411a965 BUILD SUCCEEDED Tpx test run coordinator for Facebook. See https://fburl.com/tpx for details. Running with tpx session id: 44fbfa66-cce8-4277-82ac-f89d79558581 Trace available for this run at /tmp/tpx-20220202-160956.915412/trace.log RemoteExecution session id: reSessionID-44fbfa66-cce8-4277-82ac-f89d79558581-tpx Started reporting to test run: https://www.internalfb.com/intern/testinfra/testrun/281475200877601 ✓ ListingSuccess: caffe2/test:upgrader_codegen : 1 tests discovered (1.249) ✓ Pass: caffe2/test:upgrader_codegen - test_generate_bytecode (mobile.test_upgrader_codegen.TestLiteScriptModule) (1.365) Summary Pass: 1 ListingSuccess: 1 If you need help understanding your runs, please follow the wiki: https://fburl.com/posting_in_tpx_users Finished test run: https://www.internalfb.com/intern/testinfra/testrun/281475200877601 ``` Reviewed By: iseeyuan Differential Revision: D33719098 fbshipit-source-id: e2d2b23d298f98e4d4fcdfc344f7b8c6f92cff26 (cherry picked from commit 81b956c23abc19489b69eee986721252474d00dc) --- caffe2/serialize/versions.h | 32 ++++++---- test/cpp/jit/test_lite_interpreter.cpp | 63 +++++++++++++++---- test/test_mobile_optimizer.py | 4 +- .../mobile/compatibility/backport_manager.cpp | 41 +++++++++++- torch/csrc/jit/mobile/upgrader_mobile.cpp | 36 +++++------ torch/csrc/jit/runtime/interpreter.cpp | 2 + torch/csrc/jit/runtime/interpreter.h | 1 + .../csrc/jit/runtime/interpreter/code_impl.h | 7 ++- torch/csrc/jit/serialization/export.h | 15 ++++- .../jit/serialization/export_bytecode.cpp | 3 +- .../csrc/jit/serialization/export_bytecode.h | 1 + .../csrc/jit/serialization/export_module.cpp | 11 ++++ 12 files changed, 163 insertions(+), 53 deletions(-) diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h index fa18e46b2c6..40cbd9dea87 100644 --- a/caffe2/serialize/versions.h +++ b/caffe2/serialize/versions.h @@ -110,22 +110,28 @@ constexpr uint64_t kMinProducedFileFormatVersion = 0x3L; // 0x2L: (Comment missing) // 0x3L: (Comment missing) // 0x4L: (update) Added schema to function tuple. Forward-compatible change. -// 0x5L: (update) Update bytecode is sharing constant tensor files from torchscript, and only serialize -// extra tensors that are not in the torchscript constant table. Also update tensor storage schema adapting -// to the unify format, the root key of tensor storage is updated from {index} to -// {the_pointer_value_the_tensor.storage}, for example: `140245072983168.storage` -// Forward-compatibility change. -// 0x6L: Implicit opereator versioning using number of specified argument. -// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 -// for details. -// 0x7L: Enable support for operators with default arguments plus out arguments. -constexpr uint64_t kProducedBytecodeVersion = 0x7L; +// 0x5L: (update) Update bytecode is sharing constant tensor files from +// torchscript, and only serialize extra tensors that are not in the +// torchscript constant table. 
Also update tensor storage schema adapting to +// the unify format, the root key of tensor storage is updated from {index} to +// {the_pointer_value_the_tensor.storage}, for example: +// `140245072983168.storage` Forward-compatibility change. 0x6L: Implicit +// opereator versioning using number of specified argument. Refer to the +// summary of https://github.com/pytorch/pytorch/pull/56845 for details. 0x7L: +// Enable support for operators with default arguments plus out arguments. +// 0x8L: Emit promoted operators as instructions +constexpr uint64_t kProducedBytecodeVersion = 0x8L; + +// static_assert( +// kProducedBytecodeVersion >= kProducedFileFormatVersion, +// "kProducedBytecodeVersion must be higher or equal to +// kProducedFileFormatVersion."); // Introduce kMinSupportedBytecodeVersion and kMaxSupportedBytecodeVersion // for limited backward/forward compatibility support of bytecode. If -// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion (in loader), -// we should support this model_version. For example, we provide a wrapper to -// handle an updated operator. +// kMinSupportedBytecodeVersion <= model_version <= kMaxSupportedBytecodeVersion +// (in loader), we should support this model_version. For example, we provide a +// wrapper to handle an updated operator. constexpr uint64_t kMinSupportedBytecodeVersion = 0x3L; constexpr uint64_t kMaxSupportedBytecodeVersion = 0x8L; diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index 0e40e48514d..5e00eafa738 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -571,19 +571,34 @@ namespace { void compareModelOutput( c10::ArrayRef actual_result_list, - const std::vector& expect_result_list) { + const std::vector& expect_result_list) { AT_ASSERT(actual_result_list.size() == expect_result_list.size()); - AT_ASSERT(actual_result_list[0].toTensor().equal(expect_result_list[0])); AT_ASSERT( - actual_result_list[1].toTensor().dim() == expect_result_list[1].dim()); - AT_ASSERT(actual_result_list[2].toTensor().equal(expect_result_list[2])); - AT_ASSERT(actual_result_list[3].toTensor().equal(expect_result_list[3])); + actual_result_list[0].toTensor().equal(expect_result_list[0].toTensor())); + AT_ASSERT( + actual_result_list[1].toTensor().dim() == + expect_result_list[1].toTensor().dim()); + AT_ASSERT( + actual_result_list[2].toTensor().equal(expect_result_list[2].toTensor())); + AT_ASSERT( + actual_result_list[3].toTensor().equal(expect_result_list[3].toTensor())); + ASSERT_EQ( + actual_result_list[4].toStringRef(), expect_result_list[4].toStringRef()); + ASSERT_EQ(actual_result_list[5].toBool(), expect_result_list[5].toBool()); + ASSERT_EQ(actual_result_list[6].toBool(), expect_result_list[6].toBool()); + ASSERT_EQ(actual_result_list[7].toBool(), expect_result_list[7].toBool()); + AT_ASSERT( + actual_result_list[8].toTensor().equal(expect_result_list[8].toTensor())); + ASSERT_EQ( + actual_result_list[9].toStringRef(), expect_result_list[9].toStringRef()); + ASSERT_EQ(actual_result_list[10].toInt(), expect_result_list[10].toInt()); + ASSERT_EQ(actual_result_list[11].toBool(), expect_result_list[11].toBool()); } void runAndCheckTorchScriptModel( std::stringstream& input_model_stream, const std::vector& input_data, - const std::vector& expect_result_list, + const std::vector& expect_result_list, const int64_t expect_version) { auto actual_version = _get_model_bytecode_version(input_model_stream); AT_ASSERT(actual_version == 
expect_version); @@ -600,7 +615,7 @@ void runAndCheckTorchScriptModel( void runAndCheckBytecodeModel( std::stringstream& input_model_stream, const std::vector& input_data, - const std::vector& expect_result_list, + const std::vector& expect_result_list, const int64_t expect_version) { auto actual_version = _get_model_bytecode_version(input_model_stream); AT_ASSERT(actual_version == expect_version); @@ -618,7 +633,7 @@ void runAndCheckBytecodeModel( void backportAllVersionCheck( std::stringstream& test_model_file_stream, std::vector& input_data, - std::vector& expect_result_list, + std::vector& expect_result_list, const int64_t expect_from_version) { auto from_version = _get_model_bytecode_version(test_model_file_stream); AT_ASSERT(from_version == expect_from_version); @@ -668,6 +683,9 @@ TEST(LiteInterpreterTest, BackPortByteCodeModelAllVersions) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) module.register_parameter("bias", torch::ones({20}), false); module.define(R"( + def fn(self, x:float=1.0): + return x + def forward(self, input): x1 = torch.zeros(2, 2) x2 = torch.empty_like(torch.empty(2, 2)) @@ -677,8 +695,22 @@ TEST(LiteInterpreterTest, BackPortByteCodeModelAllVersions) { x = 2 * torch.ones(1) h = torch.ones(1) torch.add(x, h, out=x) - return (x1, x2, x3, x) - )"); + device = torch.ones(1, 1).cpu().device.type + is_cuda = x1.is_cuda + bool_val = True + check_is = [] is None + check_is_not = [1] is not None + check_not = not bool_val + num_to_tensor = torch.tensor([self.fn()]) + d = {"a": "abc"} + check_dict_index = d["a"] + check_dim = x1.dim() + return ( + x1, x2, x3, x, device, is_cuda, check_is, + check_is_not, num_to_tensor, check_dict_index, + check_dim, check_not + ) + )"); torch::jit::Module module_freeze = freeze(module); @@ -686,12 +718,21 @@ TEST(LiteInterpreterTest, BackPortByteCodeModelAllVersions) { module_freeze._save_for_mobile(input_model_stream); std::vector input_data = std::vector({torch::ones({1, 1, 28, 28})}); - std::vector expect_result_list; + std::vector expect_result_list; expect_result_list.emplace_back(at::ones({2, 2}, ScalarType::Float) * 0); expect_result_list.emplace_back(at::ones({2, 2}, ScalarType::Float)); expect_result_list.emplace_back( at::ones({1, 20, 24, 24}, ScalarType::Float) * 26); expect_result_list.emplace_back(3 * at::ones({1})); + // "cpu" False, False, True, tensor(1), "abc", 2, False) + expect_result_list.emplace_back(c10::IValue("cpu")); + expect_result_list.emplace_back(c10::IValue(false)); + expect_result_list.emplace_back(c10::IValue(false)); + expect_result_list.emplace_back(c10::IValue(true)); + expect_result_list.emplace_back(c10::IValue(at::ones({1}))); + expect_result_list.emplace_back(c10::IValue("abc")); + expect_result_list.emplace_back(c10::IValue(2)); + expect_result_list.emplace_back(c10::IValue(false)); backportAllVersionCheck( input_model_stream, diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py index bb42702f536..9b728df8808 100644 --- a/test/test_mobile_optimizer.py +++ b/test/test_mobile_optimizer.py @@ -151,7 +151,7 @@ class TestOptimizer(TestCase): bn_scripted_module = torch.jit.script(bn_test_module) bn_scripted_module.eval() - self.assertEqual(len(torch.jit.export_opnames(bn_scripted_module)), 14) + self.assertEqual(len(torch.jit.export_opnames(bn_scripted_module)), 11) FileCheck().check_count("prim::CallMethod[name=\"forward\"]", 2, exactly=True) \ .run(str(get_forward(bn_scripted_module._c).graph)) @@ -252,7 +252,7 @@ class TestOptimizer(TestCase): 
bn_no_forward_scripted_module = torch.jit.script(bn_test_no_forward_module) bn_no_forward_scripted_module.eval() - self.assertEqual(len(torch.jit.export_opnames(bn_no_forward_scripted_module)), 14) + self.assertEqual(len(torch.jit.export_opnames(bn_no_forward_scripted_module)), 11) FileCheck().check_count("prim::CallMethod[name=\"forward\"]", 2, exactly=True) \ .run(bn_no_forward_scripted_module.foo.graph) diff --git a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp index 5cf6fea9701..09377093e0b 100644 --- a/torch/csrc/jit/mobile/compatibility/backport_manager.cpp +++ b/torch/csrc/jit/mobile/compatibility/backport_manager.cpp @@ -27,6 +27,7 @@ constexpr int64_t kBytecodeVersionV4 = 0x4L; constexpr int64_t kBytecodeVersionV5 = 0x5L; constexpr int64_t kBytecodeVersionV6 = 0x6L; constexpr int64_t kBytecodeVersionV7 = 0x7L; +constexpr int64_t kBytecodeVersionV8 = 0x8L; } // namespace /********************** Utility Functions **********************/ @@ -434,7 +435,8 @@ std::stringstream backport_v6_to_v5(std::stringstream& input_model_stream) { { BytecodeEmitModeGuard argNumGuard( true /*emit_default_input_instructions*/, - false /*enable_defaults_args_with_out_args*/); + false /*enable_defaults_args_with_out_args*/, + false /*enable_emit_promoted_ops*/); torch_script._save_for_mobile( intermediate_model_stream, extra_files, hasBytecodeDebug); } @@ -501,7 +503,8 @@ std::stringstream backport_v7_to_v6(std::stringstream& input_model_stream) { { BytecodeEmitModeGuard argNumGuard( false /*emit_default_input_instructions*/, - false /*enable_defaults_args_with_out_args*/); + false /*enable_defaults_args_with_out_args*/, + false /*enable_emit_promoted_ops*/); torch_script._save_for_mobile( intermediate_model_stream, extra_files, hasBytecodeDebug); } @@ -512,6 +515,39 @@ std::stringstream backport_v7_to_v6(std::stringstream& input_model_stream) { return output_model_stream; } +std::stringstream backport_v8_to_v7(std::stringstream& input_model_stream) { + std::shared_ptr rai = + std::make_shared(&input_model_stream); + auto reader = std::make_shared(rai); + // extra_files are kept + auto records = reader->getAllRecords(); + bool hasBytecodeDebug = reader->hasRecord("mobile_debug_handles.pkl"); + ExtraFilesMap extra_files; + for (const auto& record : records) { + std::size_t found = record.find_last_of("/\\"); + auto path = record.substr(0, found); + if ("extra" == path) { + extra_files.emplace(record.substr(found + 1), ""); + } + } + Module torch_script = torch::jit::load(rai, c10::nullopt, extra_files); + std::stringstream intermediate_model_stream; + { + BytecodeEmitModeGuard argNumGuard( + false /*emit_default_input_instructions*/, + true /*enable_defaults_args_with_out_args*/, + false /*enable_emit_promoted_ops*/); + torch_script._save_for_mobile( + intermediate_model_stream, extra_files, hasBytecodeDebug); + } + + // Update the bytecode version (from 8 to 7) + std::stringstream output_model_stream = + update_bytecode_version(intermediate_model_stream, kBytecodeVersionV7); + + return output_model_stream; +} + } // namespace /********************** BackportManager **********************/ @@ -528,6 +564,7 @@ BackportManager::BackportManager() { registerBytecodeBackportFunction(kBytecodeVersionV5, backport_v5_to_v4); registerBytecodeBackportFunction(kBytecodeVersionV6, backport_v6_to_v5); registerBytecodeBackportFunction(kBytecodeVersionV7, backport_v7_to_v6); + registerBytecodeBackportFunction(kBytecodeVersionV8, backport_v8_to_v7); 
} std::unordered_map< diff --git a/torch/csrc/jit/mobile/upgrader_mobile.cpp b/torch/csrc/jit/mobile/upgrader_mobile.cpp index 83e23342d5c..3e876de4766 100644 --- a/torch/csrc/jit/mobile/upgrader_mobile.cpp +++ b/torch/csrc/jit/mobile/upgrader_mobile.cpp @@ -346,7 +346,7 @@ const std::vector& getUpgraderBytecodeList() { Instruction{OpCode::STOREN, 1, 7}, Instruction{OpCode::LOAD, 3, 0}, Instruction{OpCode::LOADC, 0, 0}, - Instruction{OpCode::OP, 0, 0}, + Instruction{OpCode::__IS__, 0, 0}, Instruction{OpCode::JF, 10, 0}, Instruction{OpCode::LOAD, 1, 0}, Instruction{OpCode::LOAD, 2, 0}, @@ -355,17 +355,17 @@ const std::vector& getUpgraderBytecodeList() { Instruction{OpCode::LOAD, 5, 0}, Instruction{OpCode::LOAD, 6, 0}, Instruction{OpCode::LOAD, 7, 0}, - Instruction{OpCode::OP, 1, 0}, + Instruction{OpCode::OP, 0, 0}, Instruction{OpCode::JMP, 10, 0}, Instruction{OpCode::LOAD, 1, 0}, Instruction{OpCode::LOAD, 2, 0}, Instruction{OpCode::LOAD, 3, 0}, - Instruction{OpCode::OP, 2, 0}, + Instruction{OpCode::OP, 1, 0}, Instruction{OpCode::LOAD, 4, 0}, Instruction{OpCode::LOAD, 5, 0}, Instruction{OpCode::LOAD, 6, 0}, Instruction{OpCode::LOAD, 7, 0}, - Instruction{OpCode::OP, 1, 0}, + Instruction{OpCode::OP, 0, 0}, Instruction{OpCode::STORE, 8, 0}, Instruction{OpCode::DROPR, 7, 0}, Instruction{OpCode::DROPR, 6, 0}, @@ -385,7 +385,6 @@ const std::vector& getUpgraderBytecodeList() { 8 ), std::vector({ - OperatorString({"aten::__is__", "", 2}), OperatorString({"aten::linspace", "", 7}), OperatorString({"prim::unchecked_cast", "", 1}), }), // operators list @@ -397,20 +396,20 @@ const std::vector& getUpgraderBytecodeList() { Instruction{OpCode::STOREN, 1, 4}, Instruction{OpCode::LOAD, 3, 0}, Instruction{OpCode::LOADC, 0, 0}, - Instruction{OpCode::OP, 0, 0}, + Instruction{OpCode::__IS__, 0, 0}, Instruction{OpCode::JF, 7, 0}, Instruction{OpCode::LOAD, 1, 0}, Instruction{OpCode::LOAD, 2, 0}, Instruction{OpCode::LOADC, 1, 0}, Instruction{OpCode::LOAD, 4, 0}, - Instruction{OpCode::OP, 1, 0}, + Instruction{OpCode::OP, 0, 0}, Instruction{OpCode::JMP, 7, 0}, Instruction{OpCode::LOAD, 1, 0}, Instruction{OpCode::LOAD, 2, 0}, Instruction{OpCode::LOAD, 3, 0}, - Instruction{OpCode::OP, 2, 0}, - Instruction{OpCode::LOAD, 4, 0}, Instruction{OpCode::OP, 1, 0}, + Instruction{OpCode::LOAD, 4, 0}, + Instruction{OpCode::OP, 0, 0}, Instruction{OpCode::STORE, 5, 0}, Instruction{OpCode::DROPR, 4, 0}, Instruction{OpCode::DROPR, 2, 0}, @@ -427,7 +426,6 @@ const std::vector& getUpgraderBytecodeList() { 5 ), std::vector({ - OperatorString({"aten::__is__", "", 2}), OperatorString({"aten::linspace", "out", 4}), OperatorString({"prim::unchecked_cast", "", 1}), }), // operators list @@ -439,7 +437,7 @@ const std::vector& getUpgraderBytecodeList() { Instruction{OpCode::STOREN, 1, 8}, Instruction{OpCode::LOAD, 3, 0}, Instruction{OpCode::LOADC, 0, 0}, - Instruction{OpCode::OP, 0, 0}, + Instruction{OpCode::__IS__, 0, 0}, Instruction{OpCode::JF, 11, 0}, Instruction{OpCode::LOAD, 1, 0}, Instruction{OpCode::LOAD, 2, 0}, @@ -449,18 +447,18 @@ const std::vector& getUpgraderBytecodeList() { Instruction{OpCode::LOAD, 6, 0}, Instruction{OpCode::LOAD, 7, 0}, Instruction{OpCode::LOAD, 8, 0}, - Instruction{OpCode::OP, 1, 0}, + Instruction{OpCode::OP, 0, 0}, Instruction{OpCode::JMP, 11, 0}, Instruction{OpCode::LOAD, 1, 0}, Instruction{OpCode::LOAD, 2, 0}, Instruction{OpCode::LOAD, 3, 0}, - Instruction{OpCode::OP, 2, 0}, + Instruction{OpCode::OP, 1, 0}, Instruction{OpCode::LOAD, 4, 0}, Instruction{OpCode::LOAD, 5, 0}, Instruction{OpCode::LOAD, 6, 0}, 
Instruction{OpCode::LOAD, 7, 0}, Instruction{OpCode::LOAD, 8, 0}, - Instruction{OpCode::OP, 1, 0}, + Instruction{OpCode::OP, 0, 0}, Instruction{OpCode::STORE, 9, 0}, Instruction{OpCode::DROPR, 8, 0}, Instruction{OpCode::DROPR, 7, 0}, @@ -481,7 +479,6 @@ const std::vector& getUpgraderBytecodeList() { 9 ), std::vector({ - OperatorString({"aten::__is__", "", 2}), OperatorString({"aten::logspace", "", 8}), OperatorString({"prim::unchecked_cast", "", 1}), }), // operators list @@ -493,22 +490,22 @@ const std::vector& getUpgraderBytecodeList() { Instruction{OpCode::STOREN, 1, 5}, Instruction{OpCode::LOAD, 3, 0}, Instruction{OpCode::LOADC, 0, 0}, - Instruction{OpCode::OP, 0, 0}, + Instruction{OpCode::__IS__, 0, 0}, Instruction{OpCode::JF, 8, 0}, Instruction{OpCode::LOAD, 1, 0}, Instruction{OpCode::LOAD, 2, 0}, Instruction{OpCode::LOADC, 1, 0}, Instruction{OpCode::LOAD, 4, 0}, Instruction{OpCode::LOAD, 5, 0}, - Instruction{OpCode::OP, 1, 0}, + Instruction{OpCode::OP, 0, 0}, Instruction{OpCode::JMP, 8, 0}, Instruction{OpCode::LOAD, 1, 0}, Instruction{OpCode::LOAD, 2, 0}, Instruction{OpCode::LOAD, 3, 0}, - Instruction{OpCode::OP, 2, 0}, + Instruction{OpCode::OP, 1, 0}, Instruction{OpCode::LOAD, 4, 0}, Instruction{OpCode::LOAD, 5, 0}, - Instruction{OpCode::OP, 1, 0}, + Instruction{OpCode::OP, 0, 0}, Instruction{OpCode::STORE, 6, 0}, Instruction{OpCode::DROPR, 5, 0}, Instruction{OpCode::DROPR, 4, 0}, @@ -526,7 +523,6 @@ const std::vector& getUpgraderBytecodeList() { 6 ), std::vector({ - OperatorString({"aten::__is__", "", 2}), OperatorString({"aten::logspace", "out", 5}), OperatorString({"prim::unchecked_cast", "", 1}), }), // operators list diff --git a/torch/csrc/jit/runtime/interpreter.cpp b/torch/csrc/jit/runtime/interpreter.cpp index a01da0b8c05..e421815d7e7 100644 --- a/torch/csrc/jit/runtime/interpreter.cpp +++ b/torch/csrc/jit/runtime/interpreter.cpp @@ -1059,12 +1059,14 @@ MobileCode::MobileCode( std::string function_name, bool emit_default_input_instructions, bool support_default_args_before_out, + bool emit_promoted_ops, size_t remaining_bailout_depth) : Code(new interpreter::MobileCodeImpl( graph, std::move(function_name), emit_default_input_instructions, support_default_args_before_out, + emit_promoted_ops, remaining_bailout_depth)) {} MobileCode::~MobileCode() = default; diff --git a/torch/csrc/jit/runtime/interpreter.h b/torch/csrc/jit/runtime/interpreter.h index 12441735ae6..19f997981f4 100644 --- a/torch/csrc/jit/runtime/interpreter.h +++ b/torch/csrc/jit/runtime/interpreter.h @@ -88,6 +88,7 @@ struct TORCH_API MobileCode : Code { std::string function_name, bool emit_default_input_instructions = true, bool support_default_args_before_out = true, + bool emit_promoted_ops = true, size_t remaining_bailout_depth = 0); ~MobileCode(); }; diff --git a/torch/csrc/jit/runtime/interpreter/code_impl.h b/torch/csrc/jit/runtime/interpreter/code_impl.h index 03411a19632..63844c4e981 100644 --- a/torch/csrc/jit/runtime/interpreter/code_impl.h +++ b/torch/csrc/jit/runtime/interpreter/code_impl.h @@ -869,10 +869,12 @@ struct MobileCodeImpl : CodeImpl { std::string function_name, bool emit_default_input_instructions, bool support_default_args_before_out, + bool emit_promoted_ops, size_t remaining_bailout_depth) : CodeImpl(graph, function_name, remaining_bailout_depth, false), emit_default_input_instructions_(emit_default_input_instructions), - support_default_args_before_out_(support_default_args_before_out) { + support_default_args_before_out_(support_default_args_before_out), + 
emit_promoted_ops_(emit_promoted_ops) { // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall) run(); } @@ -965,7 +967,6 @@ struct MobileCodeImpl : CodeImpl { int64_t X = 0, uint64_t N = 0, bool emit_inputs = true) override { - bool emit_promoted_ops_ = false; if (emit_promoted_ops_) { CodeImpl::emitOperatorOrInstruction(node, op, X, N, emit_inputs); } else { @@ -977,6 +978,8 @@ struct MobileCodeImpl : CodeImpl { bool emit_default_input_instructions_; // To support forward compatibility for bytecode version bump from v6 to v7 bool support_default_args_before_out_; + // To support forward compatibility for bytecode version bump from v7 to v8 + bool emit_promoted_ops_; }; } // namespace interpreter diff --git a/torch/csrc/jit/serialization/export.h b/torch/csrc/jit/serialization/export.h index b73817fb23c..17996a8ec05 100644 --- a/torch/csrc/jit/serialization/export.h +++ b/torch/csrc/jit/serialization/export.h @@ -201,6 +201,9 @@ struct TORCH_API BytecodeEmitMode { static bool is_default_args_before_out_args_enabled(); static void set_default_args_before_out_args_enabled(bool enabled); + + static bool is_emit_promoted_ops_enabled(); + static void set_default_emit_promoted_ops_enabled(bool enabled); }; // RAII guard to switch the way JIT emits the bytecode for inputs. @@ -216,24 +219,32 @@ struct TORCH_API BytecodeEmitMode { struct TORCH_API BytecodeEmitModeGuard { BytecodeEmitModeGuard( bool enable_default_value_for_unspecified_arg, - bool enable_default_args_before_out_args) + bool enable_default_args_before_out_args, + bool enable_emit_promoted_ops) : prev_default_value_for_unspecified_arg_mode( BytecodeEmitMode::is_default_value_for_unspecified_arg_enabled()), prev_default_args_before_out_args( - BytecodeEmitMode::is_default_args_before_out_args_enabled()) { + BytecodeEmitMode::is_default_args_before_out_args_enabled()), + prev_default_emit_promoted_ops( + BytecodeEmitMode::is_emit_promoted_ops_enabled()) { BytecodeEmitMode::set_default_value_for_unspecified_arg_enabled( enable_default_value_for_unspecified_arg); BytecodeEmitMode::set_default_args_before_out_args_enabled( enable_default_args_before_out_args); + BytecodeEmitMode::set_default_emit_promoted_ops_enabled( + enable_emit_promoted_ops); } ~BytecodeEmitModeGuard() { BytecodeEmitMode::set_default_value_for_unspecified_arg_enabled( prev_default_value_for_unspecified_arg_mode); BytecodeEmitMode::set_default_args_before_out_args_enabled( prev_default_args_before_out_args); + BytecodeEmitMode::set_default_emit_promoted_ops_enabled( + prev_default_emit_promoted_ops); } bool prev_default_value_for_unspecified_arg_mode; bool prev_default_args_before_out_args; + bool prev_default_emit_promoted_ops; }; TORCH_API IValue to_tuple(std::vector ivalues); diff --git a/torch/csrc/jit/serialization/export_bytecode.cpp b/torch/csrc/jit/serialization/export_bytecode.cpp index 007e29ec7c3..cb2b104e039 100644 --- a/torch/csrc/jit/serialization/export_bytecode.cpp +++ b/torch/csrc/jit/serialization/export_bytecode.cpp @@ -142,7 +142,8 @@ mobile::Code compileGraphToMobileCode( graph, name, compilation_options.enable_default_value_for_unspecified_arg, - compilation_options.enable_default_args_before_out_args); + compilation_options.enable_default_args_before_out_args, + compilation_options.enable_emit_promoted_ops); mobile::Code mobile_code; diff --git a/torch/csrc/jit/serialization/export_bytecode.h b/torch/csrc/jit/serialization/export_bytecode.h index 4fb0b5043f5..96397a56eac 100644 --- a/torch/csrc/jit/serialization/export_bytecode.h +++ 
b/torch/csrc/jit/serialization/export_bytecode.h @@ -20,6 +20,7 @@ struct TORCH_API CompilationOptions { bool incl_interface_call = false; bool enable_default_value_for_unspecified_arg = false; bool enable_default_args_before_out_args = true; + bool enable_emit_promoted_ops = true; int model_version = caffe2::serialize::kProducedBytecodeVersion; }; diff --git a/torch/csrc/jit/serialization/export_module.cpp b/torch/csrc/jit/serialization/export_module.cpp index 23bd357130f..cbfe143c0e7 100644 --- a/torch/csrc/jit/serialization/export_module.cpp +++ b/torch/csrc/jit/serialization/export_module.cpp @@ -44,6 +44,8 @@ CompilationOptions getOptionsFromGlobal() { BytecodeEmitMode::is_default_args_before_out_args_enabled(); compilation_options.enable_default_value_for_unspecified_arg = BytecodeEmitMode::is_default_value_for_unspecified_arg_enabled(); + compilation_options.enable_emit_promoted_ops = + BytecodeEmitMode::is_emit_promoted_ops_enabled(); compilation_options.incl_interface_call = getMobileInterfaceCallExport(); compilation_options.model_version = caffe2::serialize::kProducedBytecodeVersion; @@ -864,5 +866,14 @@ void BytecodeEmitMode::set_default_args_before_out_args_enabled(bool enabled) { emitDefautlArgsWithOutArgs = enabled; } +thread_local bool emitDefaultEmitPromotedOps = + caffe2::serialize::kProducedBytecodeVersion <= 7 ? false : true; +bool BytecodeEmitMode::is_emit_promoted_ops_enabled() { + return emitDefaultEmitPromotedOps; +} +void BytecodeEmitMode::set_default_emit_promoted_ops_enabled(bool enabled) { + emitDefaultEmitPromotedOps = enabled; +} + } // namespace jit } // namespace torch From 511ec7f366dd4a54fb150cc1b8f90614d835f77d Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 14 Feb 2022 22:45:13 -0800 Subject: [PATCH 035/199] Fix `sequence_ops_test` (#72844) Summary: Fuzzing gone bad again: `np.unique([])` returns array or float64, but `np.delete` expects array of int Fixes recent regressions in ONNX tests in OSS CI, see https://github.com/pytorch/pytorch/runs/5188636426?check_suite_focus=true for example Pull Request resolved: https://github.com/pytorch/pytorch/pull/72844 Reviewed By: gmagogsfm Differential Revision: D34235295 Pulled By: malfet fbshipit-source-id: 37ad39ac04f81ac519a5d4e4e8a86901944973bd (cherry picked from commit 683c767e72dad70a12297545a1b9345c89add3c4) --- caffe2/python/operator_test/sequence_ops_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py index 6fbc445a776..cb07a96fa0f 100644 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ b/caffe2/python/operator_test/sequence_ops_test.py @@ -385,7 +385,7 @@ class TestSequenceOps(serial.SerializedTestCase): ["shrunk_data"]) def op_ref(data, indices): - unique_indices = np.unique(indices) + unique_indices = np.unique(indices) if len(indices)>0 else np.array([],dtype=np.int64) sorted_indices = np.sort(unique_indices) shrunk_data = np.delete(data, sorted_indices, axis=0) return (shrunk_data,) From 856157fcee3f5f8c48ca185ecad65457be490a7d Mon Sep 17 00:00:00 2001 From: Shunting Zhang Date: Mon, 14 Feb 2022 23:16:14 -0800 Subject: [PATCH 036/199] (2/2) Make TorchScript Preserve Fully Qualified Class Name for Python Exceptions: frontend change (#70471) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/70471 Reland D33282878 (https://github.com/pytorch/pytorch/commit/911d527b870bb4371da39be0c18a1ce109acb1d5). This is the frontend change. 
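A small usage sketch of what this surfaces, adapted from the tests added below (the exception class here is illustrative):

```python
import torch

class MyValueError(ValueError):
    def __init__(self, msg):
        super(MyValueError, self).__init__(msg)

@torch.jit.script
def fn():
    raise MyValueError("test custom exception")

try:
    fn()
except torch.jit.Error as e:
    # The message now carries the fully qualified Python class name,
    # e.g. "__main__.MyValueError: test custom exception", rather than a
    # generic exception label.
    print(e)
```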
ghstack-source-id: 149114933 Test Plan: Refer to D33282878 (https://github.com/pytorch/pytorch/commit/911d527b870bb4371da39be0c18a1ce109acb1d5). Also check CI Reviewed By: gmagogsfm Differential Revision: D33342569 fbshipit-source-id: 57984ac67ae2c56c38f72d3b1fb69105901fb472 (cherry picked from commit b47cc935ee1fd7aa63aa453a323a637bc2c22f3c) --- test/cpp/jit/test_exception.cpp | 159 ++++++++++++++++ test/jit/myexception.py | 8 + test/jit/test_exception.py | 176 ++++++++++++++++++ test/test_jit.py | 147 --------------- torch/_jit_internal.py | 19 +- torch/csrc/jit/frontend/ir_emitter.cpp | 8 +- torch/csrc/jit/frontend/sugared_value.h | 12 +- .../csrc/jit/python/python_sugared_value.cpp | 5 +- torch/csrc/jit/python/python_sugared_value.h | 10 +- 9 files changed, 385 insertions(+), 159 deletions(-) create mode 100644 test/cpp/jit/test_exception.cpp create mode 100644 test/jit/myexception.py create mode 100644 test/jit/test_exception.py diff --git a/test/cpp/jit/test_exception.cpp b/test/cpp/jit/test_exception.cpp new file mode 100644 index 00000000000..b6b3cbcd679 --- /dev/null +++ b/test/cpp/jit/test_exception.cpp @@ -0,0 +1,159 @@ +/* + * We have a python unit test for exceptions in test/jit/test_exception.py . + * Add a CPP version here to verify that excepted exception types thrown from + * C++. This is hard to test in python code since C++ exceptions will be + * translated to python exceptions. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +namespace py = pybind11; + +TEST(TestException, TestAssertion) { + std::string pythonCode = R"PY( + def foo(): + raise AssertionError("An assertion failed") + )PY"; + auto cu_ptr = torch::jit::compile(pythonCode); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu_ptr->get_function("foo"); + std::cerr << "Graph is\n" << *gf->graph() << std::endl; + + bool is_jit_exception = false; + std::string message; + c10::optional exception_class; + try { + cu_ptr->run_method("foo"); + } catch (JITException& e) { + is_jit_exception = true; + message = e.what(); + exception_class = e.getPythonClassName(); + } + EXPECT_TRUE(is_jit_exception); + EXPECT_FALSE(exception_class); + EXPECT_TRUE( + message.find("RuntimeError: AssertionError: An assertion failed") != + std::string::npos); +} + +struct MyPythonExceptionValue : public torch::jit::SugaredValue { + explicit MyPythonExceptionValue(const py::object& exception_class) { + qualified_name_ = + (py::str(py::getattr(exception_class, "__module__", py::str(""))) + + py::str(".") + + py::str(py::getattr(exception_class, "__name__", py::str("")))) + .cast(); + } + + std::string kind() const override { + return "My Python exception"; + } + + // Simplified from PythonExceptionValue::call + std::shared_ptr call( + const torch::jit::SourceRange& loc, + torch::jit::GraphFunction& caller, + at::ArrayRef args, + at::ArrayRef kwargs, + size_t n_binders) override { + TORCH_CHECK(args.size() == 1); + Value* error_message = args.at(0).value(*caller.graph()); + Value* qualified_class_name = + insertConstant(*caller.graph(), qualified_name_, loc); + return std::make_shared( + error_message, qualified_class_name); + } + + private: + std::string qualified_name_; +}; + +class SimpleResolver : public torch::jit::Resolver { + public: + explicit SimpleResolver() {} + + std::shared_ptr resolveValue( + const std::string& name, + torch::jit::GraphFunction& m, + const torch::jit::SourceRange& loc) override { + // follows toSugaredValue 
(toSugaredValue is defined in caffe2:_C which is + // a python extension. We can not add that as a cpp_binary's dep) + if (name == "SimpleValueError") { + py::object obj = py::globals()["SimpleValueError"]; + return std::make_shared(obj); + } + TORCH_CHECK(false, "resolveValue: can not resolve '", name, "{}'"); + } + + torch::jit::TypePtr resolveType( + const std::string& name, + const torch::jit::SourceRange& loc) override { + return nullptr; + } +}; + +/* + * - The python source code parsing for TorchScript here is learned from + * torch::jit::compile. + * - The code only parses one Def. If there are multiple in the code, those + * except the first one are skipped. + */ +TEST(TestException, TestCustomException) { + py::scoped_interpreter guard{}; + py::exec(R"PY( + class SimpleValueError(ValueError): + def __init__(self, message): + super(SimpleValueError, self).__init__(message) + )PY"); + + std::string pythonCode = R"PY( + def foo(): + raise SimpleValueError("An assertion failed") + )PY"; + + torch::jit::Parser p( + std::make_shared(pythonCode, "", 1)); + auto def = torch::jit::Def(p.parseFunction(/*is_method=*/false)); + std::cerr << "Def is:\n" << def << std::endl; + auto cu = std::make_shared(); + (void)cu->define( + c10::nullopt, + {}, + {}, + {def}, + // class PythonResolver is defined in + // torch/csrc/jit/python/script_init.cpp. It's not in a header file so I + // can not use it. Create a SimpleResolver insteand + {std::make_shared()}, + nullptr); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu->get_function("foo"); + std::cerr << "Graph is\n" << *gf->graph() << std::endl; + bool is_jit_exception = false; + c10::optional exception_class; + std::string message; + try { + cu->run_method("foo"); + } catch (JITException& e) { + is_jit_exception = true; + exception_class = e.getPythonClassName(); + message = e.what(); + } + EXPECT_TRUE(is_jit_exception); + EXPECT_EQ("__main__.SimpleValueError", *exception_class); + EXPECT_TRUE( + message.find("__main__.SimpleValueError: An assertion failed") != + std::string::npos); +} + +} // namespace jit +} // namespace torch diff --git a/test/jit/myexception.py b/test/jit/myexception.py new file mode 100644 index 00000000000..5937bd3c91b --- /dev/null +++ b/test/jit/myexception.py @@ -0,0 +1,8 @@ +r""" +Define exceptions used in test_exception.py. We define them in a +separate file on purpose to make sure the fully qualified exception class name +is captured correctly in suce cases. +""" +class MyKeyError(KeyError): + def __init__(self, msg): + super(KeyError, self).__init__(msg) diff --git a/test/jit/test_exception.py b/test/jit/test_exception.py new file mode 100644 index 00000000000..dce38e3be89 --- /dev/null +++ b/test/jit/test_exception.py @@ -0,0 +1,176 @@ +# Owner(s): ["oncall: jit"] +from torch.testing._internal.common_utils import TestCase +import torch +from torch import nn + +r""" +Test TorchScript exception handling. 
+""" +class TestException(TestCase): + def test_pyop_exception_message(self): + class Foo(torch.jit.ScriptModule): + def __init__(self): + super(Foo, self).__init__() + self.conv = nn.Conv2d(1, 10, kernel_size=5) + + @torch.jit.script_method + def forward(self, x): + return self.conv(x) + foo = Foo() + # testing that the correct error message propagates + with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): + foo(torch.ones([123])) # wrong size + + def test_builtin_error_messsage(self): + with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): + @torch.jit.script + def close_match(x): + return x.masked_fill(True) + + with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " + "supported in TorchScript"): + @torch.jit.script + def unknown_op(x): + torch.set_anomaly_enabled(True) + return x + + def test_exceptions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + if bool(cond): + raise ValueError(3) + return 1 + ''') + + cu.foo(torch.tensor(0)) + with self.assertRaisesRegex(torch.jit.Error, "3"): + cu.foo(torch.tensor(1)) + + def foo(cond): + a = 3 + if bool(cond): + raise ArbitraryError(a, "hi") + if 1 == 2: + raise ArbitraryError + return a + + with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): + torch.jit.script(foo) + + def exception_as_value(): + a = Exception() + print(a) + + with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): + torch.jit.script(exception_as_value) + + @torch.jit.script + def foo_no_decl_always_throws(): + raise RuntimeError("Hi") + + # function that has no declared type but always throws set to None + output_type = next(foo_no_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "NoneType") + + @torch.jit.script + def foo_decl_always_throws(): + # type: () -> Tensor + raise Exception("Hi") + + output_type = next(foo_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "Tensor") + + def foo(): + raise 3 + 4 + + with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): + torch.jit.script(foo) + + # a escapes scope + @torch.jit.script + def foo(): + if 1 == 1: + a = 1 + else: + if 1 == 1: + raise Exception("Hi") + else: + raise Exception("Hi") + return a + self.assertEqual(foo(), 1) + + @torch.jit.script + def tuple_fn(): + raise RuntimeError("hello", "goodbye") + + with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): + tuple_fn() + + @torch.jit.script + def no_message(): + raise RuntimeError + + with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): + no_message() + + def test_assertions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + assert bool(cond), "hi" + return 0 + ''') + + cu.foo(torch.tensor(1)) + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + cu.foo(torch.tensor(0)) + + @torch.jit.script + def foo(cond): + assert bool(cond), "hi" + + foo(torch.tensor(1)) + # we don't currently validate the name of the exception + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + foo(torch.tensor(0)) + + def test_python_op_exception(self): + @torch.jit.ignore + def python_op(x): + raise Exception("bad!") + + @torch.jit.script + def fn(x): + return python_op(x) + + with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): + fn(torch.tensor(4)) + + def test_dict_expansion_raises_error(self): + def fn(self): + d = {"foo": 1, "bar": 
2, "baz": 3} + return {**d} + + with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, + "Dict expansion "): + torch.jit.script(fn) + + def test_custom_python_exception(self): + class MyValueError(ValueError): + def __init__(self, msg): + super(MyValueError, self).__init__(msg) + + @torch.jit.script + def fn(): + raise MyValueError("test custom exception") + + with self.assertRaisesRegex(torch.jit.Error, "jit.test_exception.MyValueError: test custom exception"): + fn() + + def test_custom_python_exception_defined_elsewhere(self): + from jit.myexception import MyKeyError + + @torch.jit.script + def fn(): + raise MyKeyError("This is a user defined key error") + with self.assertRaisesRegex(torch.jit.Error, "jit.myexception.MyKeyError: This is a user defined key error"): + fn() diff --git a/test/test_jit.py b/test/test_jit.py index 37cd9b5d53c..2527fbf941b 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -13013,153 +13013,6 @@ dedent """ self.checkScript(dedent(code), (101,)) - def test_pyop_exception_message(self): - class Foo(torch.jit.ScriptModule): - def __init__(self): - super(Foo, self).__init__() - self.conv = nn.Conv2d(1, 10, kernel_size=5) - - @torch.jit.script_method - def forward(self, x): - return self.conv(x) - foo = Foo() - # testing that the correct error message propagates - with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): - foo(torch.ones([123])) # wrong size - - def test_builtin_error_messsage(self): - with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): - @torch.jit.script - def close_match(x): - return x.masked_fill(True) - - with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " - "supported in TorchScript"): - @torch.jit.script - def unknown_op(x): - torch.set_anomaly_enabled(True) - return x - - def test_exceptions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - if bool(cond): - raise ValueError(3) - return 1 - ''') - - cu.foo(torch.tensor(0)) - with self.assertRaisesRegex(torch.jit.Error, "3"): - cu.foo(torch.tensor(1)) - - def foo(cond): - a = 3 - if bool(cond): - raise ArbitraryError(a, "hi") - if 1 == 2: - raise ArbitraryError - return a - - with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): - torch.jit.script(foo) - - def exception_as_value(): - a = Exception() - print(a) - - with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): - torch.jit.script(exception_as_value) - - @torch.jit.script - def foo_no_decl_always_throws(): - raise RuntimeError("Hi") - - # function that has no declared type but always throws set to None - output_type = next(foo_no_decl_always_throws.graph.outputs()).type() - self.assertTrue(str(output_type) == "NoneType") - - @torch.jit.script - def foo_decl_always_throws(): - # type: () -> Tensor - raise Exception("Hi") - - output_type = next(foo_decl_always_throws.graph.outputs()).type() - self.assertTrue(str(output_type) == "Tensor") - - def foo(): - raise 3 + 4 - - with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): - torch.jit.script(foo) - - # a escapes scope - @torch.jit.script - def foo(): - if 1 == 1: - a = 1 - else: - if 1 == 1: - raise Exception("Hi") - else: - raise Exception("Hi") - return a - self.assertEqual(foo(), 1) - - @torch.jit.script - def tuple_fn(): - raise RuntimeError("hello", "goodbye") - - with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): - tuple_fn() - - @torch.jit.script - def no_message(): 
- raise RuntimeError - - with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): - no_message() - - def test_assertions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - assert bool(cond), "hi" - return 0 - ''') - - cu.foo(torch.tensor(1)) - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - cu.foo(torch.tensor(0)) - - @torch.jit.script - def foo(cond): - assert bool(cond), "hi" - - foo(torch.tensor(1)) - # we don't currently validate the name of the exception - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - foo(torch.tensor(0)) - - def test_python_op_exception(self): - @torch.jit.ignore - def python_op(x): - raise Exception("bad!") - - @torch.jit.script - def fn(x): - return python_op(x) - - with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): - fn(torch.tensor(4)) - - def test_dict_expansion_raises_error(self): - def fn(self): - d = {"foo": 1, "bar": 2, "baz": 3} - return {**d} - - with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, - "Dict expansion "): - torch.jit.script(fn) - def test_module_parameters_and_buffers(self): weights = torch.randn(10, 10) bias = torch.randn(10) diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 20616a978d4..ba570b35391 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -977,7 +977,7 @@ def is_scripting() -> bool: # Retrieves a fully-qualified name (module hierarchy + classname) for a given obj. -def _qualified_name(obj) -> str: +def _qualified_name(obj, mangle_name=True) -> str: # This special case allows us to override the qualified name on a type. # It's currently used in conjunction with tracing, where we create a # fake module to filter only supported attributes. However, since this @@ -1026,13 +1026,16 @@ def _qualified_name(obj) -> str: module_name = module_name.replace("<", "_") module_name = module_name.replace(">", "_") - # __main__ is a builtin module, so rewrite it to "__torch__". - if module_name == "__main__": - module_name = "__torch__" - else: - # Everything else gets a "__torch__" prefix to avoid name collisions - # with the names of user values. - module_name = "__torch__." + module_name + # The PythonExceptionValue C++ class in torch/csrc/jit/python/python_sugared_value.h + # does not need mangle the python class name. + if mangle_name: + # __main__ is a builtin module, so rewrite it to "__torch__". + if module_name == "__main__": + module_name = "__torch__" + else: + # Everything else gets a "__torch__" prefix to avoid name collisions + # with the names of user values. + module_name = "__torch__." + module_name if "." 
in name: raise RuntimeError(f"Could not get qualified name for class '{name}': " diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 20cab7c7499..eac6161c923 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -2478,12 +2478,14 @@ struct to_ir { void emitRaise(const Raise& raise) { auto sv = emitSugaredExpr(raise.expr(), 1); Value* error_message = nullptr; + Value* qualified_class_name = nullptr; if (auto exception_instance = std::dynamic_pointer_cast(sv)) { // The typical case, an instance of the exception class was thrown: // raise RuntimeError("error") error_message = exception_instance->getValue(); + qualified_class_name = exception_instance->getQualifiedClassName(); } else if ( auto exception_class = std::dynamic_pointer_cast(sv)) { // A bare exception was thrown so add an empty message. e.g. @@ -2500,7 +2502,11 @@ struct to_ir { error_message = graph->insert(aten::str, {error_message}); } - graph->insert(prim::RaiseException, {error_message}, {}, raise.range()); + graph->insert( + prim::RaiseException, + {error_message, qualified_class_name}, + {}, + raise.range()); exit_blocks.insert(environment_stack->block()); } diff --git a/torch/csrc/jit/frontend/sugared_value.h b/torch/csrc/jit/frontend/sugared_value.h index f6a3f72a59d..6ddd9bed753 100644 --- a/torch/csrc/jit/frontend/sugared_value.h +++ b/torch/csrc/jit/frontend/sugared_value.h @@ -744,7 +744,10 @@ struct SimpleSelf : public Self { // This is not a SimpleValue so it can not pass through the code paths that // expect a SimpleValue as a sugared value. struct TORCH_API ExceptionMessageValue : public SugaredValue { - explicit ExceptionMessageValue(Value* value) : value_(value) {} + explicit ExceptionMessageValue( + Value* value, + Value* qualified_class_name = nullptr) + : value_(value), qualified_class_name_(qualified_class_name) {} std::string kind() const override { return "exception message"; @@ -754,7 +757,14 @@ struct TORCH_API ExceptionMessageValue : public SugaredValue { return value_; } + // qualified python class name + Value* getQualifiedClassName() { + return qualified_class_name_; + } + + private: Value* value_; + Value* qualified_class_name_; }; struct TORCH_API ExceptionValue : public SugaredValue { diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 87ab27a5552..f014150d8a2 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -914,8 +914,11 @@ std::shared_ptr PythonExceptionValue::call( ->insertNode(caller.graph()->createTuple(message_values)) ->output(); } + Value* qualified_class_name = + insertConstant(*caller.graph(), exception_class_qualified_name_, loc); - return std::make_shared(error_message); + return std::make_shared( + error_message, qualified_class_name); } bool isNamedTupleClass(const py::object& obj) { diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index d3559abda5c..5fef124cf2b 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -328,7 +328,12 @@ struct VISIBILITY_HIDDEN PythonClassValue : public ClassValue { struct VISIBILITY_HIDDEN PythonExceptionValue : public ExceptionValue { explicit PythonExceptionValue(const py::object& exception_class) : ExceptionValue( - py::str(py::getattr(exception_class, "__name__", py::str("")))) {} + py::str(py::getattr(exception_class, 
"__name__", py::str("")))), + exception_class_qualified_name_( + py::str(py::module::import("torch._jit_internal") + .attr("_qualified_name")( + exception_class, + /*mangle_name=*/false))) {} std::string kind() const override { return "Python exception"; @@ -340,6 +345,9 @@ struct VISIBILITY_HIDDEN PythonExceptionValue : public ExceptionValue { at::ArrayRef args, at::ArrayRef kwargs, size_t n_binders) override; + + private: + std::string exception_class_qualified_name_; }; // Python Slice class. From b9ccbe4ff285af8dc89050dac6dbeeaced63e21d Mon Sep 17 00:00:00 2001 From: CodemodService FBSourceClangFormatLinterBot <> Date: Tue, 15 Feb 2022 02:34:49 -0800 Subject: [PATCH 037/199] [AutoAccept][Codemod][FBSourceClangFormatLinter] Daily `arc lint --take CLANGFORMAT` Reviewed By: bilalsou Differential Revision: D34237270 fbshipit-source-id: f33c06e9cbbde8b1fa39b11f9addb716f3762c99 (cherry picked from commit 0db3686e9d2976b0bd69131f4da7d8f5d2891b54) --- torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp index fd7b6fc502a..0e74ce172f9 100644 --- a/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp +++ b/torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp @@ -3136,8 +3136,7 @@ void SegmentedFusion::annotateFP16IntermediateTensors() { } } -std::string toString( - const SegmentCandidateFinderOptions& segment_options) { +std::string toString(const SegmentCandidateFinderOptions& segment_options) { std::stringstream ss; ss << "segmentation phases {\n"; if (segment_options.run_combine_reductions) { From 5b82e4f72bc3f10f820cb27c773eef3a5efcc042 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 15 Feb 2022 06:53:51 -0800 Subject: [PATCH 038/199] stop sccache server after building (#72794) Summary: This is to avoid the directory , where the sccache is installed, couldn't be deleted. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72794 Reviewed By: H-Huang Differential Revision: D34222877 Pulled By: janeyx99 fbshipit-source-id: 2765d6f49b375d15598586ed83ae4c5e667e7226 (cherry picked from commit 551e21ca582c80d88a466b7bfe4eda9dee0c9a5f) --- .jenkins/pytorch/win-test-helpers/build_pytorch.bat | 1 + 1 file changed, 1 insertion(+) diff --git a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat index 4954dcf4f45..ade52f8892c 100644 --- a/.jenkins/pytorch/win-test-helpers/build_pytorch.bat +++ b/.jenkins/pytorch/win-test-helpers/build_pytorch.bat @@ -157,4 +157,5 @@ python setup.py install --cmake && sccache --show-stats && ( sccache --show-stats > stats.txt python -m tools.stats.upload_sccache_stats stats.txt +sccache --stop-server rm stats.txt From 28388b4b43381e9ea08a9c18c99da6b2ef9e28b3 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Tue, 15 Feb 2022 07:11:37 -0800 Subject: [PATCH 039/199] Remove native_functions.yaml dependency from GridSample.{cpp,cu} (#66979) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/66979 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D31856103 Pulled By: malfet fbshipit-source-id: 49e674dcf8555f358fbac72826204ee3bcc28f70 (cherry picked from commit 9c785d94c06b45d660b42df1f78ab6119a182430) --- aten/src/ATen/TensorGeometry.cpp | 24 ++++++- aten/src/ATen/TensorGeometry.h | 17 +++-- aten/src/ATen/TensorUtils.cpp | 19 ----- aten/src/ATen/TensorUtils.h | 7 -- aten/src/ATen/core/TensorAccessor.h | 1 + aten/src/ATen/native/GridSampler.cpp | 20 ++++-- aten/src/ATen/native/GridSampler.h | 6 +- aten/src/ATen/native/SpectralOps.cpp | 1 + .../src/ATen/native/cpu/GridSamplerKernel.cpp | 47 ++++++------ aten/src/ATen/native/cpu/GridSamplerKernel.h | 30 ++++++-- aten/src/ATen/native/cuda/Activation.h | 2 +- aten/src/ATen/native/cuda/GridSampler.cpp | 72 +++++++++++++++++++ aten/src/ATen/native/cuda/GridSampler.cu | 50 ++++++------- aten/src/ATen/native/cuda/GridSampler.cuh | 2 - aten/src/ATen/native/cuda/GridSampler.h | 32 +++++++++ aten/src/ATen/native/cuda/UpSample.cuh | 7 +- .../src/ATen/native/quantized/cpu/qconcat.cpp | 1 + tools/build_variables.bzl | 1 + 18 files changed, 231 insertions(+), 108 deletions(-) create mode 100644 aten/src/ATen/native/cuda/GridSampler.cpp create mode 100644 aten/src/ATen/native/cuda/GridSampler.h diff --git a/aten/src/ATen/TensorGeometry.cpp b/aten/src/ATen/TensorGeometry.cpp index 20ab6bb6690..9490aff534b 100644 --- a/aten/src/ATen/TensorGeometry.cpp +++ b/aten/src/ATen/TensorGeometry.cpp @@ -1,10 +1,30 @@ #include -#include -#include +#include +#include namespace at { +// See TensorGeometry.h on why this is useful now that we cache is_contiguous. 
+bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) { + assert(sizes.size() < static_cast(std::numeric_limits::max())); + auto dim = static_cast(sizes.size()); + int64_t expected_stride = 1; + bool contig_if_nonempty = true; + for (int64_t i = dim - 1; i >= 0; i--) { + if (sizes[i] == 0) { + return true; + } + if (contig_if_nonempty) { + if (sizes[i] != 1 && strides[i] != expected_stride) { + contig_if_nonempty = false; + } + expected_stride *= sizes[i]; + } + } + return contig_if_nonempty; +} + bool TensorGeometry::is_contiguous() const { if (numel_ == 0) { return true; diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index ad3e16da4a6..7762cc94df6 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -1,10 +1,17 @@ #pragma once -#include -#include +#include +#include namespace at { +// Return if the tensor geometry represented by `sizes` and `strides` is contiguous +// Although we cache is_contiguous in tensor now, this is till useful because it +// allows checking if a particular geometry is contiguous without explicitly +// constructing a tensor, e.g., when you want to choose a kernel strategy based +// on whether a subgeometry is contiguous. +TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides); + struct TORCH_API TensorGeometry { TensorGeometry() : storage_offset_(0) {} @@ -21,7 +28,7 @@ struct TORCH_API TensorGeometry { numel_ = expected_stride; } - explicit TensorGeometry(const Tensor& t) + explicit TensorGeometry(const TensorBase& t) : sizes_(t.sizes().vec()) , strides_(t.strides().vec()) , storage_offset_(t.storage_offset()) @@ -32,12 +39,12 @@ struct TORCH_API TensorGeometry { int64_t dim() const { return sizes_.size(); } int64_t size(int64_t dim) const { - dim = maybe_wrap_dim(dim, this->dim()); + dim = c10::maybe_wrap_dim(dim, this->dim()); return sizes_.at(static_cast(dim)); } IntArrayRef sizes() const { return IntArrayRef{ sizes_ }; } int64_t stride(int64_t dim) const { - dim = maybe_wrap_dim(dim, this->dim()); + dim = c10::maybe_wrap_dim(dim, this->dim()); return strides_.at(static_cast(dim)); } IntArrayRef strides() const { return IntArrayRef{ strides_ }; } diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 754c73bb615..392915820d0 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -264,25 +264,6 @@ void * maybe_data_ptr(const TensorArg& tensor) { return tensor->defined() ? (void *)tensor->data_ptr() : nullptr; } -// See TensorUtils.h on why this is useful now that we cache is_contiguous. 
-bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides) { - int64_t dim = sizes.size(); - int64_t expected_stride = 1; - bool contig_if_nonempty = true; - for (int64_t i = dim - 1; i >= 0; i--) { - if (sizes[i] == 0) { - return true; - } - if (contig_if_nonempty) { - if (sizes[i] != 1 && strides[i] != expected_stride) { - contig_if_nonempty = false; - } - expected_stride *= sizes[i]; - } - } - return contig_if_nonempty; -} - void check_dim_size( const Tensor& tensor, int64_t dim, diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index f018c33f1ae..e8adf16ca18 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -138,13 +138,6 @@ TORCH_API void checkLayout(CheckedFrom c, at::ArrayRef tensors, at::Layo TORCH_API void* maybe_data_ptr(const Tensor& tensor); TORCH_API void* maybe_data_ptr(const TensorArg& tensor); -// Return if the tensor geometry represented by `sizes` and `strides` is contiguous -// Although we cache is_contiguous in tensor now, this is till useful because it -// allows checking if a particular geometry is contiguous without explicitly -// constructing a tensor, e.g., when you want to choose a kernel strategy based -// on whether a subgeometry is contiguous. -TORCH_API bool geometry_is_contiguous(IntArrayRef sizes, IntArrayRef strides); - TORCH_API void check_dim_size( const Tensor& tensor, int64_t dim, diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index 9d65522b5d9..9c60f84a16b 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 99b3d933bd8..c3cdd706750 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -864,8 +864,13 @@ Tensor grid_sampler_2d_cpu(const Tensor& input, const Tensor& grid, } } - return grid_sampler_2d_cpu_kernel( - kCPU, input, grid, interpolation_mode, padding_mode, align_corners); + auto in_size = input.sizes(); + auto grid_size = grid.sizes(); + auto output = at::empty( + {in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options()); + grid_sampler_2d_cpu_kernel( + kCPU, output, input, grid, interpolation_mode, padding_mode, align_corners); + return output; } DEFINE_DISPATCH(grid_sampler_2d_cpu_kernel); @@ -911,8 +916,15 @@ grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, con } } - return grid_sampler_2d_backward_cpu_kernel( - kCPU, grad_output, input, grid, interpolation_mode, padding_mode, align_corners, output_mask); + Tensor grad_input; + if (output_mask[0]) { + grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + grid_sampler_2d_backward_cpu_kernel( + kCPU, grad_input, grad_grid, grad_output, input, grid, + interpolation_mode, padding_mode, align_corners, output_mask); + return std::make_tuple(std::move(grad_input), std::move(grad_grid)); } DEFINE_DISPATCH(grid_sampler_2d_backward_cpu_kernel); diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h index effc322c0d3..412465937aa 100644 --- a/aten/src/ATen/native/GridSampler.h +++ b/aten/src/ATen/native/GridSampler.h @@ -1,7 +1,9 @@ #pragma once -#include -#include +#include +#include +#include +#include namespace at { namespace native { diff --git a/aten/src/ATen/native/SpectralOps.cpp 
b/aten/src/ATen/native/SpectralOps.cpp index 2f5789a8f38..631dd58b0dd 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 4e89a499d23..c866a177263 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -1,11 +1,12 @@ -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include #include -#include +#include +#include +#include +#include +#include +#include #include #include @@ -1146,13 +1147,12 @@ static inline void grid_sample_2d_grid_slice_iterator( // and backward. // See NOTE [ Grid Sample CPU Kernels ] for details. -Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, - int64_t padding_mode, bool align_corners) { +void grid_sampler_2d_cpu_kernel_impl( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { auto N = input.size(0); auto H = grid.size(1); auto W = grid.size(2); - auto output = at::empty({N, input.size(1), H, W}, input.options()); auto spatial_size = H * W; auto grain_size = spatial_size == 0 ? (N + 1) : at::divup(at::internal::GRAIN_SIZE, spatial_size * 4 /* 2d * 2 tensors*/); @@ -1207,18 +1207,18 @@ Tensor grid_sampler_2d_cpu_kernel_impl(const Tensor& input, const Tensor& grid, }); #undef HANDLE_CASE #undef HANDLE_INTERP - - return output; } -std::tuple -grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, - const Tensor& input, - const Tensor& grid, - int64_t interpolation_mode, - int64_t padding_mode, - bool align_corners, - std::array output_mask) { +void grid_sampler_2d_backward_cpu_kernel_impl( + const TensorBase &grad_input, + const TensorBase &grad_grid, + const TensorBase &grad_output_, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + std::array output_mask) { // grad_output should be contiguous most of time. Ensuring that it is // contiguous can greatly simplify this code. auto grad_output = grad_output_.contiguous(); @@ -1228,11 +1228,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, // is always computed.) auto input_requires_grad = output_mask[0]; - Tensor grad_input; - if (input_requires_grad) { - grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); auto N = input.size(0); auto spatial_size = grid.size(1) * grid.size(2); auto grain_size = spatial_size == 0 ? 
(N + 1) @@ -1315,8 +1310,6 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, }); #undef HANDLE_CASE #undef HANDLE_INTERP - - return std::make_tuple(grad_input, grad_grid); } } diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.h b/aten/src/ATen/native/cpu/GridSamplerKernel.h index aa4a24736da..b1830fcd391 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.h +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.h @@ -1,17 +1,33 @@ #pragma once -#include -#include -#include #include -#include -#include +#include +#include + +namespace at { +class TensorBase; +} namespace at { namespace native { -using forward_2d_fn = Tensor(*)(const Tensor &, const Tensor &, int64_t, int64_t, bool); -using backward_2d_fn = std::tuple(*)(const Tensor &, const Tensor &, const Tensor &, int64_t, int64_t, bool, std::array); +using forward_2d_fn = void (*) ( + const TensorBase &output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners); +using backward_2d_fn = void (*) ( + const TensorBase &grad_input, + const TensorBase &grad_grid, + const TensorBase &grad_output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + std::array output_mask); DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel); DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel); diff --git a/aten/src/ATen/native/cuda/Activation.h b/aten/src/ATen/native/cuda/Activation.h index ca0ad3828da..5fc52ff257c 100644 --- a/aten/src/ATen/native/cuda/Activation.h +++ b/aten/src/ATen/native/cuda/Activation.h @@ -1,4 +1,4 @@ - +#pragma once #include #include diff --git a/aten/src/ATen/native/cuda/GridSampler.cpp b/aten/src/ATen/native/cuda/GridSampler.cpp new file mode 100644 index 00000000000..c98ab7b2d31 --- /dev/null +++ b/aten/src/ATen/native/cuda/GridSampler.cpp @@ -0,0 +1,72 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#endif + +namespace at { +namespace native { + +Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode, + bool align_corners) { + auto in_size = input.sizes(); + auto grid_size = grid.sizes(); + auto output = at::empty( + {in_size[0], in_size[1], grid_size[1], grid_size[2]}, input.options()); + launch_grid_sampler_2d_forward_kernel( + output, input, grid, interpolation_mode, padding_mode, align_corners); + return output; +} + +Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, + int64_t interpolation_mode, int64_t padding_mode, + bool align_corners) { + auto in_size = input.sizes(); + auto grid_size = grid.sizes(); + auto output = at::empty( + {in_size[0], in_size[1], grid_size[1], grid_size[2], grid_size[3]}, + input.options()); + launch_grid_sampler_3d_forward_kernel( + output, input, grid, interpolation_mode, padding_mode, align_corners); + return output; +} + +std::tuple +grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, + const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { + Tensor grad_input; + if (output_mask[0]) { + grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + launch_grid_sampler_2d_backward_kernel( + 
grad_input, grad_grid, grad_output, input, + grid, interpolation_mode, padding_mode, align_corners, output_mask); + return std::make_tuple(grad_input, grad_grid); +} + +std::tuple +grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, + const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners) { + auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + launch_grid_sampler_3d_backward_kernel( + grad_input, grad_grid, grad_output, input, + grid, interpolation_mode, padding_mode, align_corners); + return std::make_tuple(grad_input, grad_grid); +} + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu index b358853c997..536422f65dd 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -1,10 +1,13 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS +#include #include #include #include #include #include #include +#include +#include #include namespace at { namespace native { @@ -723,14 +726,12 @@ namespace { } // namespace // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { +void launch_grid_sampler_2d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { auto N = input.size(0); - auto C = input.size(1); auto H = grid.size(1); auto W = grid.size(2); - auto output = at::empty({N, C, H, W}, input.options()); int64_t count = N * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_cuda", [&] { @@ -760,18 +761,16 @@ Tensor grid_sampler_2d_cuda(const Tensor& input, const Tensor& grid, } }); } - return output; } // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { +void launch_grid_sampler_3d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { auto N = input.size(0); auto D = grid.size(1); auto H = grid.size(2); auto W = grid.size(3); - auto output = at::empty({N, input.size(1), D, H, W}, input.options()); int64_t count = N * D * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_cuda", [&] { @@ -801,15 +800,14 @@ Tensor grid_sampler_3d_cuda(const Tensor& input, const Tensor& grid, } }); } - return output; } // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
-std::tuple -grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, - int64_t padding_mode, bool align_corners, - std::array output_mask) { +void launch_grid_sampler_2d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("grid_sampler_2d_backward_cuda"); @@ -822,11 +820,6 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, // is always computed.) auto input_requires_grad = output_mask[0]; - Tensor grad_input; - if (input_requires_grad) { - grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } - auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); int64_t count = N * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_backward_cuda", [&] { @@ -864,14 +857,14 @@ grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, } }); } - return std::make_tuple(grad_input, grad_grid); } // No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple -grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { +void launch_grid_sampler_3d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase& grad_output, const TensorBase& input, + const TensorBase& grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners) { // See Note [Writing Nondeterministic Operations] // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("grid_sampler_3d_backward_cuda"); @@ -879,8 +872,6 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, auto D = grid.size(1); auto H = grid.size(2); auto W = grid.size(3); - auto grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto grad_grid = at::empty_like(grid, LEGACY_CONTIGUOUS_MEMORY_FORMAT); int64_t count = N * D * H * W; if (count > 0) { AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_backward_cuda", [&] { @@ -916,7 +907,6 @@ grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, } }); } - return std::make_tuple(grad_input, grad_grid); } }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cuh b/aten/src/ATen/native/cuda/GridSampler.cuh index 2fdf3bd5491..abc86f21749 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cuh +++ b/aten/src/ATen/native/cuda/GridSampler.cuh @@ -1,5 +1,3 @@ -#include -#include #include namespace at { namespace native { diff --git a/aten/src/ATen/native/cuda/GridSampler.h b/aten/src/ATen/native/cuda/GridSampler.h new file mode 100644 index 00000000000..48411ec1dd7 --- /dev/null +++ b/aten/src/ATen/native/cuda/GridSampler.h @@ -0,0 +1,32 @@ +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at { +namespace native { + +void launch_grid_sampler_2d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void 
launch_grid_sampler_3d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_2d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +void launch_grid_sampler_3d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners); + +}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index b609b42a4d9..f4d85512ba7 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -1,7 +1,10 @@ -#include -#include +#include #include +#include +#include +#include + #include namespace at { diff --git a/aten/src/ATen/native/quantized/cpu/qconcat.cpp b/aten/src/ATen/native/quantized/cpu/qconcat.cpp index 8e09e32c420..4322b3558f5 100644 --- a/aten/src/ATen/native/quantized/cpu/qconcat.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconcat.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index f63e4ea1668..c6a7e5a0791 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -1338,6 +1338,7 @@ aten_cuda_cu_source_list = [ "aten/src/ATen/native/cuda/Activation.cpp", "aten/src/ATen/native/cuda/Blas.cpp", "aten/src/ATen/native/cuda/Equal.cpp", + "aten/src/ATen/native/cuda/GridSampler.cpp", "aten/src/ATen/native/cuda/IndexKernel.cpp", "aten/src/ATen/native/cuda/ReduceOps.cpp", "aten/src/ATen/native/cuda/ScanKernels.cpp", From 111e52c5d71b6f34e950d6be4df678fcef4e9f1e Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Tue, 15 Feb 2022 07:11:37 -0800 Subject: [PATCH 040/199] Remove native_functions.yaml dependency from Sorting.cpp (#66980) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/66980 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D31856101 Pulled By: malfet fbshipit-source-id: a943fc5d60820952f02677cc6e5d988b08ec66a1 (cherry picked from commit 8da294a496b385f459adefa2bf852a06bb59d9f6) --- aten/src/ATen/native/Sorting.cpp | 13 +++ aten/src/ATen/native/Sorting.h | 11 ++- aten/src/ATen/native/SortingUtils.h | 87 ----------------- aten/src/ATen/native/TopKImpl.h | 95 +++++++++++++++++++ aten/src/ATen/native/cpu/SortingKernel.cpp | 32 +++---- .../cpu/kernels/QuantizedOpKernels.cpp | 2 +- 6 files changed, 128 insertions(+), 112 deletions(-) create mode 100644 aten/src/ATen/native/TopKImpl.h diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index ae88547a8aa..e75965382d2 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -45,6 +45,19 @@ namespace native { DEFINE_DISPATCH(sort_stub); DEFINE_DISPATCH(topk_stub); +void _fill_indices(const TensorBase &indices, int64_t dim) { + auto ndim = indices.dim(); + assert(0 <= dim && dim < ndim); + auto dim_size = indices.size(dim); + auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong)); + auto idx_dim_sizes = std::vector(ndim, 1); + auto idx_dim_strides = std::vector(ndim, 0); + idx_dim_sizes[dim] = dim_size; + 
idx_dim_strides[dim] = 1; + auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides); + OptionalTensorRef(indices)->copy_(idx_dim_restrided); +} + namespace { /* Note from TH: diff --git a/aten/src/ATen/native/Sorting.h b/aten/src/ATen/native/Sorting.h index edfc583a50b..91bfcc42e83 100644 --- a/aten/src/ATen/native/Sorting.h +++ b/aten/src/ATen/native/Sorting.h @@ -1,8 +1,11 @@ #pragma once -#include #include +namespace at { +class TensorBase; +} + namespace at { namespace native { @@ -14,11 +17,13 @@ enum class QUANTILE_INTERPOLATION_MODE : uint8_t { NEAREST }; -using sort_fn = void(*)(Tensor& values, Tensor& indices, int64_t dim, bool descending, bool stable); -using topk_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int64_t, int64_t, bool, bool); +using sort_fn = void(*)(const TensorBase &values, const TensorBase &indices, int64_t dim, bool descending, bool stable); +using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool); DECLARE_DISPATCH(sort_fn, sort_stub); DECLARE_DISPATCH(topk_fn, topk_stub); +void _fill_indices(const TensorBase &indices, int64_t dim); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/SortingUtils.h b/aten/src/ATen/native/SortingUtils.h index f3d8805a352..f6065927eba 100644 --- a/aten/src/ATen/native/SortingUtils.h +++ b/aten/src/ATen/native/SortingUtils.h @@ -86,92 +86,5 @@ inline void _allocate_or_resize_output_with_indices( } } - -#ifdef CPU_CAPABILITY -inline namespace CPU_CAPABILITY { -#else -inline namespace DEFAULT { -#endif - -// Core topk loop, shared between CPU and QuantizedCPU -template -void topk_impl_loop( - const int64_t mode_values_stride, - const int64_t mode_indices_stride, - const int64_t tmp_values_stride, - const int64_t k, - const int64_t dim_size, - const bool largest, - const bool sorted, - char** data, const int64_t* strides, const int64_t n) { - - using elem_t = std::pair; - std::vector queue(dim_size); - for (const auto i : c10::irange(n)) { - TensorAccessor mode_values( - reinterpret_cast(data[0] + i * strides[0]), - &k, &mode_values_stride); - TensorAccessor mode_indices( - reinterpret_cast(data[1] + i * strides[1]), - &k, &mode_indices_stride); - TensorAccessor tmp_values( - reinterpret_cast(data[2] + i * strides[2]), - &dim_size, &tmp_values_stride); - - auto n = dim_size; - auto use_partial_sort = k * 64 <= n; - - for (const auto j : c10::irange(n)) { - queue[j].first = tmp_values[j]; - queue[j].second = j; - } - - // we want nan to be sorted as top for numpy compatibility - if (use_partial_sort) { - if (largest) { - std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); - }); - } else { - std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); - }); - } - } else { - if (largest) { - std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), - [](const elem_t& x, const elem_t& y) -> bool { - return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); - }); - if (sorted) { - std::sort(queue.begin(), queue.begin() + k - 1, - [](const elem_t& x, const elem_t& y) -> bool { - return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); - }); - } - } else { - std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(), - [](const 
elem_t& x, const elem_t& y) -> bool { - return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); - }); - if (sorted) { - std::sort(queue.begin(), queue.begin() + k -1, - [](const elem_t& x, const elem_t& y) -> bool { - return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); - }); - } - } - } - - for (const auto j : c10::irange(k)) { - mode_values[j] = queue[j].first; - mode_indices[j] = queue[j].second; - } - } -} - -} // namespace CPU_CAPABILITY } // namespace native } // namespace at diff --git a/aten/src/ATen/native/TopKImpl.h b/aten/src/ATen/native/TopKImpl.h new file mode 100644 index 00000000000..69d5c70236b --- /dev/null +++ b/aten/src/ATen/native/TopKImpl.h @@ -0,0 +1,95 @@ +#pragma once +#include +#include + +namespace at { +namespace native { + +#ifdef CPU_CAPABILITY +inline namespace CPU_CAPABILITY { +#else +inline namespace DEFAULT { +#endif + +// Core topk loop, shared between CPU and QuantizedCPU +template +void topk_impl_loop( + const int64_t mode_values_stride, + const int64_t mode_indices_stride, + const int64_t tmp_values_stride, + const int64_t k, + const int64_t dim_size, + const bool largest, + const bool sorted, + char** data, const int64_t* strides, const int64_t n) { + + using elem_t = std::pair; + std::vector queue(dim_size); + for (const auto i : c10::irange(n)) { + TensorAccessor mode_values( + reinterpret_cast(data[0] + i * strides[0]), + &k, &mode_values_stride); + TensorAccessor mode_indices( + reinterpret_cast(data[1] + i * strides[1]), + &k, &mode_indices_stride); + TensorAccessor tmp_values( + reinterpret_cast(data[2] + i * strides[2]), + &dim_size, &tmp_values_stride); + + auto n = dim_size; + auto use_partial_sort = k * 64 <= n; + + for (const auto j : c10::irange(n)) { + queue[j].first = tmp_values[j]; + queue[j].second = j; + } + + // we want nan to be sorted as top for numpy compatibility + if (use_partial_sort) { + if (largest) { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } else { + std::partial_sort(queue.begin(), queue.begin() + k, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } else { + if (largest) { + std::nth_element(queue.begin(), queue.begin() + k - 1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k - 1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((_isnan(x.first) && !_isnan(y.first)) || (x.first > y.first)); + }); + } + } else { + std::nth_element(queue.begin(), queue.begin() + k -1, queue.end(), + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + if (sorted) { + std::sort(queue.begin(), queue.begin() + k -1, + [](const elem_t& x, const elem_t& y) -> bool { + return ((!_isnan(x.first) && _isnan(y.first)) || (x.first < y.first)); + }); + } + } + } + + for (const auto j : c10::irange(k)) { + mode_values[j] = queue[j].first; + mode_indices[j] = queue[j].second; + } + } +} + +} // namespace CPU_CAPABILITY +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index 8eab924407d..bd4ae3159c9 100644 --- 
a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -1,33 +1,23 @@ -#include +#define TORCH_ASSERT_NO_OPERATORS +#include +#include #include #include #include #include #include #include -#include -#include +#include #include namespace at { namespace native { namespace { -void _fill_indices(Tensor& indices, int64_t dim) { - auto dim_size = indices.size(dim); - auto idx_dim = at::arange(0, dim_size, indices.options().dtype(at::kLong)); - auto idx_dim_sizes = std::vector(indices.dim(), 1); - auto idx_dim_strides = std::vector(indices.dim(), 0); - idx_dim_sizes[dim] = dim_size; - idx_dim_strides[dim] = 1; - auto idx_dim_restrided = idx_dim.as_strided(idx_dim_sizes, idx_dim_strides); - indices.copy_(idx_dim_restrided); -} - template void _dim_apply( - Tensor& values, - Tensor& indices, + const TensorBase &values, + const TensorBase &indices, int64_t dim, const std::string& method_name, const func_t& f) { @@ -95,8 +85,8 @@ struct KeyValueCompDesc { }; static void sort_kernel( - Tensor& values, - Tensor& indices, + const TensorBase &values, + const TensorBase &indices, int64_t dim, bool descending, bool stable) { @@ -143,9 +133,9 @@ static void sort_kernel( } static void topk_kernel( - const Tensor& values, - const Tensor& indices, - const Tensor& self, + const TensorBase &values, + const TensorBase &indices, + const TensorBase &self, int64_t k, int64_t dim, bool largest,
diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 77c9756e366..79ad11dfef3 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include
From 23b98202b5b3fa1267dd92613b8ea15307879d64 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Tue, 15 Feb 2022 07:11:37 -0800 Subject: [PATCH 041/199] Remove native_functions.yaml dependency from DistributionBernoulli.cu (#67721) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/67721 This moves the operator calls (`expand_inplace` and `to(kCUDA)`) into `bernoulli_impl_`, which is shared between CPU and CUDA, so that the CUDA code only needs to generate random numbers and nothing else. The other changes just rearrange includes to avoid including `Tensor.h`.
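The same basic split recurs across these "Remove native_functions.yaml dependency" patches, so a rough sketch of the pattern may help. The file and function names below (Foo.cpp, FooKernel.cu, foo_cuda, launch_foo_kernel) are invented for illustration only and are not the exact upstream signatures.

// Sketch only: hypothetical names, simplified from the real ATen code.
// Foo.cpp, the operator-aware side: this translation unit may include the
// generated per-operator headers, so output allocation and other operator
// calls stay here.
#include <ATen/ATen.h>

// In practice this would live in a small shared header next to the .cu file.
void launch_foo_kernel(const at::TensorBase& output, const at::TensorBase& input);

at::Tensor foo_cuda(const at::Tensor& input) {
  at::Tensor output = at::empty(input.sizes(), input.options());
  launch_foo_kernel(output, input);  // hand plain TensorBase references to the kernel side
  return output;
}

// FooKernel.cu, the kernel-only side: TORCH_ASSERT_NO_OPERATORS keeps the
// generated operator headers out of this file, so it only reads sizes,
// strides and data pointers from the pre-allocated TensorBase arguments.
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/core/TensorBase.h>

void launch_foo_kernel(const at::TensorBase& output, const at::TensorBase& input) {
  // ...compute grid/block dimensions from output.sizes() and launch the CUDA kernel...
}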
Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D32500426 Pulled By: malfet fbshipit-source-id: f855c2055392355e79e3df832fa56d2041ebf24e (cherry picked from commit 7ab7d17e941a6fb10be03f9a9538bc66f195b95f) --- aten/src/ATen/ExpandBase.h | 23 ++++++++++++++++++ aten/src/ATen/ExpandUtils.cpp | 7 ++++++ aten/src/ATen/Tensor.h | 2 +- aten/src/ATen/Utils.h | 23 ------------------ aten/src/ATen/core/Generator.h | 23 ++++++++++++++++++ aten/src/ATen/core/Tensor.cpp | 21 ++++++++++++++++ aten/src/ATen/core/TensorBase.h | 2 ++ aten/src/ATen/cuda/CUDAApplyUtils.cuh | 18 +++++++------- aten/src/ATen/cuda/CUDAGeneratorImpl.h | 2 -- aten/src/ATen/native/Copy.cpp | 2 +- aten/src/ATen/native/Copy.h | 3 ++- aten/src/ATen/native/Distributions.h | 2 -- aten/src/ATen/native/UnaryOps.h | 4 ++-- .../ATen/native/cpu/DistributionKernels.cpp | 12 +++++----- .../ATen/native/cpu/DistributionTemplates.h | 17 ++++++------- .../ATen/native/cuda/DistributionBernoulli.cu | 7 +++--- .../ATen/native/cuda/DistributionTemplates.h | 24 ++++++++----------- aten/src/ATen/native/cuda/Distributions.cu | 2 +- aten/src/ATen/native/cuda/TriangularOps.cu | 6 ++++- 19 files changed, 126 insertions(+), 74 deletions(-) create mode 100644 aten/src/ATen/ExpandBase.h diff --git a/aten/src/ATen/ExpandBase.h b/aten/src/ATen/ExpandBase.h new file mode 100644 index 00000000000..e0a24091da1 --- /dev/null +++ b/aten/src/ATen/ExpandBase.h @@ -0,0 +1,23 @@ +#include + +// Broadcasting utilities for working with TensorBase +namespace at { +namespace internal { +TORCH_API TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size); +} // namespace internal + +inline c10::MaybeOwned expand_size(const TensorBase &self, IntArrayRef size) { + if (size.equals(self.sizes())) { + return c10::MaybeOwned::borrowed(self); + } + return c10::MaybeOwned::owned( + at::internal::expand_slow_path(self, size)); +} +c10::MaybeOwned expand_size(TensorBase &&self, IntArrayRef size) = delete; + +inline c10::MaybeOwned expand_inplace(const TensorBase &tensor, const TensorBase &to_expand) { + return expand_size(to_expand, tensor.sizes()); +} +c10::MaybeOwned expand_inplace(const TensorBase &tensor, TensorBase &&to_expand) = delete; + +} // namespace at diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index 35588ac62a2..a44005a2ef8 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -1,8 +1,15 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include namespace at { +namespace internal { +TensorBase expand_slow_path(const TensorBase &self, IntArrayRef size) { + return OptionalTensorRef(self)->expand(size); +} +} namespace { // NOTE: are_expandable did a similar check, please keep them sync if change is needed diff --git a/aten/src/ATen/Tensor.h b/aten/src/ATen/Tensor.h index 1dfb8bb4ffc..0b3719cca3b 100644 --- a/aten/src/ATen/Tensor.h +++ b/aten/src/ATen/Tensor.h @@ -1,3 +1,3 @@ #pragma once -#include +#include diff --git a/aten/src/ATen/Utils.h b/aten/src/ATen/Utils.h index 9160cbe2fed..36b0785400b 100644 --- a/aten/src/ATen/Utils.h +++ b/aten/src/ATen/Utils.h @@ -91,29 +91,6 @@ std::array check_intlist(ArrayRef list, const char * name, return res; } -/** - * Utility function to static cast input Generator* to - * the backend generator type (CPU/CUDAGeneratorImpl etc.) 
- */ -template -static inline T * check_generator(c10::optional gen) { - TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt"); - TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed"); - TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'"); - return gen->get(); -} - -/** - * Utility function used in tensor implementations, which - * supplies the default generator to tensors, if an input generator - * is not supplied. The input Generator* is also static casted to - * the backend generator type (CPU/CUDAGeneratorImpl etc.) - */ -template -static inline T* get_generator_or_default(const c10::optional& gen, const Generator& default_gen) { - return gen.has_value() && gen->defined() ? check_generator(gen) : check_generator(default_gen); -} - using at::detail::check_size_nonnegative; namespace detail { diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index 1e6e8d54fa7..60323f3d3a0 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -138,6 +138,29 @@ Generator make_generator(Args&&... args) { return Generator(c10::make_intrusive(std::forward(args)...)); } +/** + * Utility function to static cast input Generator* to + * the backend generator type (CPU/CUDAGeneratorImpl etc.) + */ +template +static inline T * check_generator(c10::optional gen) { + TORCH_CHECK(gen.has_value(), "Expected Generator but received nullopt"); + TORCH_CHECK(gen->defined(), "Generator with undefined implementation is not allowed"); + TORCH_CHECK(T::device_type() == gen->device().type(), "Expected a '", T::device_type(), "' device type for generator but found '", gen->device().type(), "'"); + return gen->get(); +} + +/** + * Utility function used in tensor implementations, which + * supplies the default generator to tensors, if an input generator + * is not supplied. The input Generator* is also static casted to + * the backend generator type (CPU/CUDAGeneratorImpl etc.) + */ +template +static inline T* get_generator_or_default(const c10::optional& gen, const Generator& default_gen) { + return gen.has_value() && gen->defined() ? 
check_generator(gen) : check_generator(default_gen); +} + namespace detail { /** diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 9f56923c1cd..fa175165d2e 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -4,6 +4,15 @@ #include #include +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include +#endif + #include namespace at { @@ -29,6 +38,18 @@ const TensorBase& TensorBase::zero_() const { return *this; } +TensorBase TensorBase::to( + at::TensorOptions options, + bool non_blocking, + bool copy, + c10::optional memory_format) const { + Tensor self(*this); + return at::_ops::to_dtype_layout::call( + self, optTypeMetaToScalarType(options.dtype_opt()), + options.layout_opt(), options.device_opt(), + options.pinned_memory_opt(), non_blocking, copy, memory_format); +} + void TensorBase::enforce_invariants() { if (impl_.get() == nullptr) { throw std::runtime_error("TensorImpl with nullptr is not supported"); diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index b05f74259dc..45c8325ecb9 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -142,6 +142,8 @@ class TORCH_API TensorBase { const TensorBase& fill_(const c10::Scalar& scalar) const; const TensorBase& zero_() const; + TensorBase to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const; + bool is_complex() const { return at::isComplexType(this->scalar_type()); } diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index 44e24ab52b9..6a8ca194397 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -378,12 +378,14 @@ kernelPointwiseApply2(detail::TensorInfo a, template -inline bool CUDA_tensor_apply2(at::Tensor a, - at::Tensor b, +inline bool CUDA_tensor_apply2(at::TensorBase a, + at::TensorBase b, const Op op, TensorArgType aType = TensorArgType::ReadWrite, TensorArgType bType = TensorArgType::ReadOnly) { - checkDeviceType("CUDA_tensor_apply2", {a, b}, DeviceType::CUDA); + TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(), + "CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got " + "tensors with type ", a.device().type(), " and ", b.device().type()); int64_t totalElements = a.numel(); if (totalElements != b.numel()) { @@ -413,8 +415,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a, This ensures that each element of the tensor is operated on once and only once. 
*/ - Tensor oldA; - Tensor oldB; + TensorBase oldA; + TensorBase oldB; if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { // Must perform in contiguous space @@ -524,8 +526,8 @@ inline bool CUDA_tensor_apply2(at::Tensor a, template -inline bool CUDA_tensor_apply2(at::Tensor a, - at::Tensor b, +inline bool CUDA_tensor_apply2(const at::TensorBase &a, + const at::TensorBase &b, const Op op, TensorArgType aType = TensorArgType::ReadWrite, TensorArgType bType = TensorArgType::ReadOnly) { diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index 3fddd855646..768f0b7549c 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -1,9 +1,7 @@ #pragma once -#include #include #include -#include #include #include diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index caf2dfe7773..5496facf847 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -258,7 +258,7 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) { return self; } -void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src) { +void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src) { // Called when we are copying into an overlapping index `dst`, but we don't // care which writer wins. Hacky but it works. This is only used by // CUDA_tensor_apply2 in case that there are write overlaps. diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h index 6f688a73e84..14abb32fa5a 100644 --- a/aten/src/ATen/native/Copy.h +++ b/aten/src/ATen/native/Copy.h @@ -6,6 +6,7 @@ namespace at { class Tensor; struct TensorIterator; +class TensorBase; namespace native { @@ -13,7 +14,7 @@ using copy_fn = void (*)(TensorIterator&, bool non_blocking); DECLARE_DISPATCH(copy_fn, copy_stub); -TORCH_API void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src); +TORCH_API void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src); } // namespace native } // namespace at diff --git a/aten/src/ATen/native/Distributions.h b/aten/src/ATen/native/Distributions.h index ebfaf463136..2c334157eba 100644 --- a/aten/src/ATen/native/Distributions.h +++ b/aten/src/ATen/native/Distributions.h @@ -1,7 +1,5 @@ #pragma once -#include -#include #include #include #include diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index 47224d51fc3..574d503961c 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -73,8 +73,8 @@ DECLARE_DISPATCH(unary_fn, trunc_stub); DECLARE_DISPATCH(unary_fn, lgamma_stub); // NB: these are actually defined in Distribution -DECLARE_DISPATCH(void(*)(Tensor&, const Tensor&, c10::optional), bernoulli_tensor_stub); -DECLARE_DISPATCH(void(*)(Tensor&, const double, c10::optional), bernoulli_scalar_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const TensorBase&, c10::optional), bernoulli_tensor_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, c10::optional), bernoulli_scalar_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), cauchy_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), exponential_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), geometric_stub); diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index f6803e5a399..373a96e916d 100644 --- 
a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -1,9 +1,9 @@ #include #include +#include #include #include #include -#include #include #include @@ -25,22 +25,22 @@ static void cauchy_kernel(TensorIteratorBase& iter, double median, double sigma, templates::cpu::cauchy_kernel(iter, median, sigma, generator); } -void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional gen) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p_, generator); } -void bernoulli_scalar_kernel_default(Tensor& self, double p, c10::optional gen) { +void bernoulli_scalar_kernel_default(const TensorBase &self, double p, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::bernoulli_kernel(self, p, generator); } #if !AT_MKL_ENABLED() -void bernoulli_scalar_kernel(Tensor& self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { bernoulli_scalar_kernel_default(self, p, gen); } #else -void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { if (cpuinfo_initialize() && cpuinfo_vendor_intel == cpuinfo_get_processor(0)->core->vendor) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); int64_t seed; @@ -87,7 +87,7 @@ void bernoulli_scalar_kernel(Tensor &self, double p, c10::optional ge // copy_ if using buffer and non contiguous if (!contig) { - self.copy_(tmp_int_tensor); + OptionalTensorRef(self)->copy_(tmp_int_tensor); } }); } else { diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 6c017e15c46..d1228f7983e 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -1,7 +1,8 @@ #pragma once -#include #include +#include +#include #include #include #include @@ -308,25 +309,25 @@ struct ExponentialKernel { // ================================================== Bernoulli ======================================================= template -void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) { +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generator) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_tensor_cpu_self_", [&] { // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); using self_t = scalar_t; auto p_cpu = p_.to(kCPU); - c10::MaybeOwned p = expand_inplace(self, p_cpu); + auto p = expand_inplace(self, p_cpu); auto iter = TensorIteratorConfig() .add_output(self) .add_input(*p) .check_all_same_dtype(false) .build(); - if (p_.scalar_type() == kDouble) { + if (p->scalar_type() == kDouble) { cpu_serial_kernel(iter, [&](const double p_val) -> self_t { at::bernoulli_distribution bernoulli(p_val); return static_cast(bernoulli(generator)); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p_.scalar_type(), "bernoulli_tensor_cpu_p_", [&] { + AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, p->scalar_type(), "bernoulli_tensor_cpu_p_", [&] { using p_t = scalar_t; cpu_serial_kernel(iter, [&](const p_t 
p_val) -> self_t { at::bernoulli_distribution bernoulli(p_val); @@ -338,7 +339,7 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG generator) { } template -void bernoulli_kernel(Tensor& self, double p, RNG generator) { +void bernoulli_kernel(const TensorBase &self, double p, RNG generator) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Bool, at::ScalarType::BFloat16, self.scalar_type(), "bernoulli_scalar_cpu_", [&] { // See Note [Acquire lock when using random generators] std::lock_guard lock(generator->mutex_); @@ -352,10 +353,10 @@ void bernoulli_kernel(Tensor& self, double p, RNG generator) { template struct BernoulliKernel { - void operator()(Tensor& self, double p, c10::optional gen) { + void operator()(const TensorBase &self, double p, c10::optional gen) { bernoulli_kernel(self, p, check_generator(gen)); } - void operator()(Tensor& self, const Tensor& p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu index 8c9c59e7861..a7967122db9 100644 --- a/aten/src/ATen/native/cuda/DistributionBernoulli.cu +++ b/aten/src/ATen/native/cuda/DistributionBernoulli.cu @@ -1,6 +1,5 @@ +#define TORCH_ASSERT_NO_OPERATORS #include -#include -#include #include #include #include @@ -24,12 +23,12 @@ namespace at { namespace native { -void bernoulli_tensor_kernel(Tensor& self, const Tensor& p_, c10::optional gen_) { +void bernoulli_tensor_kernel(const TensorBase &self, const TensorBase &p_, c10::optional gen_) { auto generator = get_generator_or_default(gen_, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(self, p_, generator); } -void bernoulli_scalar_kernel(Tensor& self, double p, c10::optional gen) { +void bernoulli_scalar_kernel(const TensorBase &self, double p, c10::optional gen) { auto iter = TensorIterator::borrowing_nullary_op(self); auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::bernoulli_kernel(iter, p, generator); diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 54324cbbaf5..d7c857cd70e 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -430,7 +430,7 @@ void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transfo // ==================================================== Normal ======================================================== template -void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) { +void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) { auto iter = TensorIterator::borrowing_nullary_op(self); AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] { using accscalar_t = at::acc_type; @@ -446,7 +446,7 @@ void normal_kernel(Tensor& self, double mean_, double std_, RNG gen) { template struct NormalKernel { - void operator()(Tensor& self, double mean, double std, c10::optional gen) { + void operator()(const TensorBase &self, double mean, double std, c10::optional gen) { normal_kernel(self, mean, std, check_generator(gen)); } }; @@ -574,7 +574,7 @@ struct CauchyKernel { template void 
bernoulli_tensor_cuda_kernel( - at::Tensor& ret, const at::Tensor& p, + const TensorBase &ret, const at::TensorBase &p, PhiloxCudaState philox_args) { auto functor = [philox_args] __device__( int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4, @@ -618,7 +618,7 @@ void bernoulli_tensor_cuda_kernel( } template -void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) { +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) { PhiloxCudaState rng_engine_inputs; { // See Note [Acquire lock when using random generators] @@ -626,14 +626,10 @@ void bernoulli_kernel(Tensor& self, const Tensor& p_, RNG gen) { rng_engine_inputs = gen->philox_cuda_state(10); } TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type()); - auto p_CUDA = p_.to(kCUDA); - //cast probabilities tensor to double for double `self` tensor, and to `float` for everything else - if (self.dtype() == at::kDouble) { - p_CUDA = p_CUDA.to(at::kDouble); - } else { - p_CUDA = p_CUDA.to(at::kFloat); - } - c10::MaybeOwned p = expand_inplace(self, p_CUDA); + // cast probabilities tensor to double for double `self` tensor, and to `float` for everything else + const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat; + auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type)); + auto p = expand_inplace(self, p_cuda); AT_DISPATCH_ALL_TYPES_AND3( at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] { if (std::is_same::value) { @@ -662,7 +658,7 @@ struct BernoulliKernel { void operator()(TensorIteratorBase& iter, double p, c10::optional gen) { bernoulli_kernel(iter, p, check_generator(gen)); } - void operator()(Tensor& self, const Tensor& p_, c10::optional gen) { + void operator()(const TensorBase &self, const TensorBase &p_, c10::optional gen) { bernoulli_kernel(self, p_, check_generator(gen)); } }; diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index d7ab78c1812..6669624faff 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu index 3a0f8fb1e4d..1e264a08907 100644 --- a/aten/src/ATen/native/cuda/TriangularOps.cu +++ b/aten/src/ATen/native/cuda/TriangularOps.cu @@ -1,15 +1,19 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include #include #include #include -#include #include #ifndef AT_PER_OPERATOR_HEADERS #include +#include #else #include +#include +#include +#include #endif #include From dc169d53aa266560750ea25ee0cf31c7e614550d Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Tue, 15 Feb 2022 07:11:37 -0800 Subject: [PATCH 042/199] Remove native_functions.yaml dependency from DistributionNormal.cu (#67874) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/67874 Test Plan: Imported from OSS Reviewed By: mruberry Differential Revision: D32500424 Pulled By: malfet fbshipit-source-id: d8f09b8bd7aa1c7ae91403438b412fe5f555c8ff (cherry picked from commit 0b9ee87503329fbce90160190834ba31f942de03) --- aten/src/ATen/native/UnaryOps.h | 4 +-- .../ATen/native/cpu/DistributionKernels.cpp | 2 +- .../ATen/native/cpu/DistributionTemplates.h | 6 ++--- .../ATen/native/cuda/DistributionNormal.cu | 25 +++---------------- 4 files changed, 9 
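(Illustrative aside, not part of the patch: the bernoulli hunks above all make the same mechanical change, taking const TensorBase& instead of Tensor& so the kernel translation units stop depending on the generated at::Tensor method API. A minimal sketch of that signature style follows; the kernel name and body are hypothetical, and it assumes only the TensorBase accessors the patch itself uses, data_ptr<T>() and numel().)

// Sketch only: a hypothetical CPU kernel written against TensorBase,
// mirroring the signature change applied to the bernoulli kernels above.
#include <cstdint>
#include <ATen/core/TensorBase.h>

namespace example {

// Taking `const TensorBase&` keeps this translation unit independent of the
// generated at::Tensor method surface (and thus of native_functions.yaml).
void fill_ones_kernel(const at::TensorBase& self) {
  float* data = self.data_ptr<float>();
  const int64_t n = self.numel();
  for (int64_t i = 0; i < n; ++i) {
    data[i] = 1.0f;  // placeholder body; real kernels dispatch on dtype
  }
}

} // namespace example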
insertions(+), 28 deletions(-) diff --git a/aten/src/ATen/native/UnaryOps.h b/aten/src/ATen/native/UnaryOps.h index 574d503961c..0a9afd9cd4d 100644 --- a/aten/src/ATen/native/UnaryOps.h +++ b/aten/src/ATen/native/UnaryOps.h @@ -6,7 +6,7 @@ namespace at { class Tensor; -struct TensorIterator; +class TensorBase; struct TensorIteratorBase; } @@ -80,7 +80,7 @@ DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, c10::optional), geometric_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), log_normal_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const double, const double, c10::optional), uniform_stub); -DECLARE_DISPATCH(void(*)(Tensor&, const double, const double, c10::optional), normal_stub); +DECLARE_DISPATCH(void(*)(const TensorBase&, const double, const double, c10::optional), normal_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, const uint64_t, const int64_t, c10::optional), random_from_to_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_full_64_bits_range_stub); DECLARE_DISPATCH(void(*)(TensorIteratorBase&, c10::optional), random_stub); diff --git a/aten/src/ATen/native/cpu/DistributionKernels.cpp b/aten/src/ATen/native/cpu/DistributionKernels.cpp index 373a96e916d..9db9733f599 100644 --- a/aten/src/ATen/native/cpu/DistributionKernels.cpp +++ b/aten/src/ATen/native/cpu/DistributionKernels.cpp @@ -117,7 +117,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from, double to, c10::optio templates::cpu::uniform_kernel(iter, from, to, generator); } -void normal_kernel(Tensor& self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { CPUGeneratorImpl* generator = get_generator_or_default(gen, detail::getDefaultCPUGenerator()); templates::cpu::normal_kernel(self, mean, std, generator); } diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index d1228f7983e..37c799803ea 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -106,7 +106,7 @@ static void normal_fill_16_AVX2(float *data, } template -void normal_fill_AVX2(Tensor& self, const float mean, const float std, RNG generator) { +void normal_fill_AVX2(const TensorBase &self, const float mean, const float std, RNG generator) { float *data = self.data_ptr(); auto size = self.numel(); std::lock_guard lock(generator->mutex_); @@ -149,7 +149,7 @@ static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t s } template -void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG generator) { +void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { scalar_t *data = self.data_ptr(); auto size = self.numel(); std::lock_guard lock(generator->mutex_); @@ -173,7 +173,7 @@ void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG gene } template -void normal_kernel(Tensor& self, double mean, double std, RNG generator) { +void normal_kernel(const TensorBase &self, double mean, double std, RNG generator) { auto size = self.numel(); if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) { #ifdef CPU_CAPABILITY_AVX2 diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu index 025c70c4260..28330dbd69a 100644 --- a/aten/src/ATen/native/cuda/DistributionNormal.cu +++ 
b/aten/src/ATen/native/cuda/DistributionNormal.cu @@ -1,30 +1,11 @@ -#include -#include -#include -#include -#include -#include +#define TORCH_ASSERT_NO_OPERATORS #include +#include #include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - namespace at { namespace native { -void normal_kernel(Tensor& self, double mean, double std, c10::optional gen) { +void normal_kernel(const TensorBase &self, double mean, double std, c10::optional gen) { auto generator = get_generator_or_default(gen, cuda::detail::getDefaultCUDAGenerator()); at::native::templates::cuda::normal_kernel(self, mean, std, generator); } From 80f23469dd5cbb4b4d52cfdd628974697aa98e99 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 15 Feb 2022 07:15:04 -0800 Subject: [PATCH 043/199] Revert D34152115: [pytorch][PR] [ROCm] Enable sort operator BF16 support Test Plan: revert-hammer Differential Revision: D34152115 (https://github.com/pytorch/pytorch/commit/aa44480b40075d4f696605f3ab7a03d87372b4e2) Original commit changeset: 53841c91976b Original Phabricator Diff: D34152115 (https://github.com/pytorch/pytorch/commit/aa44480b40075d4f696605f3ab7a03d87372b4e2) fbshipit-source-id: c9b5cc06198032af73cd6390466de2c62576a1e1 (cherry picked from commit eb72533ae9723610cc3974f5c45a827c438460f1) --- aten/src/ATen/cuda/cub.cu | 3 ++ aten/src/ATen/cuda/cub.cuh | 29 ++++--------------- aten/src/ATen/native/cuda/Sort.cu | 10 +++---- test/test_sort_and_select.py | 7 +++++ .../_internal/common_methods_invocations.py | 4 +-- 5 files changed, 22 insertions(+), 31 deletions(-) diff --git a/aten/src/ATen/cuda/cub.cu b/aten/src/ATen/cuda/cub.cu index 8a64da6756c..6915a1c2b98 100644 --- a/aten/src/ATen/cuda/cub.cu +++ b/aten/src/ATen/cuda/cub.cu @@ -57,7 +57,10 @@ AT_INSTANTIATE_SORT_PAIRS(int64_t, 4) AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8) +// BFloat16 is not supported by ROCm's radix sort +#if !AT_ROCM_ENABLED() AT_INSTANTIATE_SORT_PAIRS(c10::BFloat16, 8) +#endif } // namespace detail diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index bf51ccce49c..6ac9905f571 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -45,23 +45,17 @@ #ifdef USE_ROCM #define NO_ROCM(x) -#define ROCM_HIPCUB(x) ::hipcub #else #define NO_ROCM(x) x -#define ROCM_HIPCUB(x) x #endif -#if !CUB_SUPPORTS_NV_BFLOAT16() || \ - (defined(USE_ROCM) && ROCM_VERSION >= 40500) +#if !defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16() -#if !defined(USE_ROCM) namespace at_cuda_detail { -#endif - // backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16 template <> -struct ROCM_HIPCUB(cub)::FpLimits +struct cub::FpLimits { static __host__ __device__ __forceinline__ c10::BFloat16 Max() { unsigned short max_word = 0x7F7F; @@ -74,14 +68,8 @@ struct ROCM_HIPCUB(cub)::FpLimits } }; -template <> -struct ROCM_HIPCUB(cub)::NumericTraits: - ROCM_HIPCUB(cub)::BaseTraits {}; - -#if !defined(USE_ROCM) -} // namespace at_cuda_detail -#endif - +template <> struct cub::NumericTraits: cub::BaseTraits {}; +} #endif #if !defined(USE_ROCM) @@ -105,20 +93,13 @@ struct cuda_type { using type = __half; }; -#if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16() +#if CUB_SUPPORTS_NV_BFLOAT16() template<> struct cuda_type { using type = __nv_bfloat16; }; -#elif (defined(USE_ROCM) && ROCM_VERSION >= 40500) - -template<> -struct cuda_type { - using type = hip_bfloat16; -}; - #endif } // namespace detail diff --git a/aten/src/ATen/native/cuda/Sort.cu 
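(Illustrative aside on the DistributionNormal.cu rewrite shown above, not part of the patch: the file now begins with #define TORCH_ASSERT_NO_OPERATORS, which is intended to turn any accidental include of the generated operator headers into a compile-time error and thereby keep the file off the native_functions.yaml rebuild path. A minimal sketch of such a translation unit follows; the kernel name and body are hypothetical.)

// Sketch of a .cpp/.cu file that opts out of the generated operator headers.
#define TORCH_ASSERT_NO_OPERATORS  // must appear before any ATen include
#include <ATen/core/TensorBase.h>

namespace at { namespace native {

// Only TensorBase-level functionality is used here; pulling in
// ATen/Functions.h (or a generated per-operator header) would now fail to
// compile, which is the point of the macro.
void example_kernel(const TensorBase& self, double mean, double std) {
  (void)self; (void)mean; (void)std;  // placeholder; real work is templated
}

}} // namespace at::native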
b/aten/src/ATen/native/cuda/Sort.cu index 3ceb3001e50..15c89f7b76e 100644 --- a/aten/src/ATen/native/cuda/Sort.cu +++ b/aten/src/ATen/native/cuda/Sort.cu @@ -325,14 +325,14 @@ void launch_stable_sort_kernel( TORCH_CHECK(nbatch > 0, "Cannot sort dimension of length ", nsort); int64_t *indices_ptr = indices.data_ptr(); -#if (defined(USE_ROCM) && ROCM_VERSION < 40500) - constexpr bool is_rocm_bf16_sort_unsupported = true; +#if defined(USE_ROCM) + constexpr bool is_rocm = true; #else - constexpr bool is_rocm_bf16_sort_unsupported = false; + constexpr bool is_rocm = false; #endif AT_DISPATCH_ALL_TYPES_AND3(kBool, kHalf, kBFloat16, self.scalar_type(), "sort", [&]{ - c10::guts::if_constexpr::value)>([&](auto _){ + c10::guts::if_constexpr::value)>([&](auto _){ const scalar_t *self_ptr = self.data_ptr(); scalar_t *values_ptr = values.data_ptr(); int64_t remaining = _(numel); @@ -353,7 +353,7 @@ void launch_stable_sort_kernel( values_ptr += n; indices_ptr += n; } - }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm < 4.5"); }); + }, [&](auto _){ TORCH_CHECK(_(false), "BFloat16 is not supported on ROCm"); }); }); } diff --git a/test/test_sort_and_select.py b/test/test_sort_and_select.py index 840635acbbf..b44b09ffa1d 100644 --- a/test/test_sort_and_select.py +++ b/test/test_sort_and_select.py @@ -135,6 +135,8 @@ class TestSortAndSelect(TestCase): # FIXME: remove torch.bool from unsupported types once support is added for cub sort @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) def test_stable_sort(self, device, dtype): + if TEST_WITH_ROCM and dtype == torch.bfloat16: + return sizes = (100, 1000, 10000) for ncopies in sizes: x = torch.tensor([0, 1] * ncopies, dtype=dtype, device=device) @@ -228,6 +230,8 @@ class TestSortAndSelect(TestCase): # FIXME: remove torch.bool from unsupported types once support is added for cub sort @dtypes(*set(get_all_dtypes()) - {torch.bool, torch.complex64, torch.complex128}) def test_stable_sort_against_numpy(self, device, dtype): + if TEST_WITH_ROCM and dtype == torch.bfloat16: + return if dtype in floating_types_and(torch.float16, torch.bfloat16): inf = float('inf') neg_inf = -float('inf') @@ -291,6 +295,9 @@ class TestSortAndSelect(TestCase): @dtypes(*(get_all_int_dtypes() + get_all_fp_dtypes())) def test_msort(self, device, dtype): + if TEST_WITH_ROCM and dtype == torch.bfloat16: + return + def test(shape): tensor = make_tensor(shape, device, dtype, low=-9, high=9) if tensor.size() != torch.Size([]): diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 45c06edb9a3..411137efb6f 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -13285,7 +13285,7 @@ op_db: List[OpInfo] = [ OpInfo('sort', dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), - dtypesIfROCM=all_types_and(torch.float16, torch.bfloat16), + dtypesIfROCM=all_types_and(torch.float16), sample_inputs_func=sample_inputs_sort, supports_forward_ad=True, supports_fwgrad_bwgrad=True, @@ -13931,7 +13931,7 @@ op_db: List[OpInfo] = [ OpInfo('msort', dtypes=all_types_and(torch.bool, torch.float16, torch.bfloat16), dtypesIfCUDA=all_types_and(torch.float16, torch.bfloat16), - dtypesIfROCM=all_types_and(torch.float16, torch.bfloat16), + dtypesIfROCM=all_types_and(torch.float16), check_batched_gradgrad=False, supports_forward_ad=True, 
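(Illustrative aside on the revert above, not part of the patch: Sort.cu goes back to gating BFloat16 out at compile time on ROCm, via a constexpr flag plus c10::guts::if_constexpr, so the unsupported radix-sort instantiation is never emitted and hitting that dtype becomes a runtime error. The standalone C++17 sketch below shows the same guard shape using plain if constexpr instead of the c10 helper; the names is_rocm_build, bfloat16_t, and launch_sort are made up for the example.)

#include <cstdint>
#include <stdexcept>
#include <type_traits>

#if defined(USE_ROCM)
constexpr bool is_rocm_build = true;
#else
constexpr bool is_rocm_build = false;
#endif

struct bfloat16_t {};  // stand-in for c10::BFloat16 in this sketch

template <typename scalar_t>
void launch_sort(scalar_t* data, int64_t n) {
  if constexpr (is_rocm_build && std::is_same<scalar_t, bfloat16_t>::value) {
    // Unsupported combination: fail loudly, never instantiate the real body.
    throw std::runtime_error("BFloat16 sort is not supported on ROCm");
  } else {
    // ... real radix-sort dispatch would go here ...
    (void)data;
    (void)n;
  }
}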
supports_fwgrad_bwgrad=True, From f43165a75ff6f645aa0d36b57359a8a4225c836c Mon Sep 17 00:00:00 2001 From: Anton Jansson Date: Tue, 15 Feb 2022 07:15:19 -0800 Subject: [PATCH 044/199] Remove duplicate call to objective function in strong wolfe line search in L-BFGS optimizer. (#72773) Summary: With this change, the optimizer is almost twice as fast as before. As the result of the first call is never used, it looks like a copy paste error and therefore can be removed. In addition, this duplicate call is not present in the Python implementation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/72773 Reviewed By: samdow Differential Revision: D34214312 Pulled By: albanD fbshipit-source-id: 4f4de08633c7236f3ccce8a2a74e56500003281b (cherry picked from commit 4a63f812ab8020a11b6f4766f16d641a179f6a56) --- torch/csrc/api/src/optim/lbfgs.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/torch/csrc/api/src/optim/lbfgs.cpp b/torch/csrc/api/src/optim/lbfgs.cpp index d7d8dd002eb..d3143b07ccd 100644 --- a/torch/csrc/api/src/optim/lbfgs.cpp +++ b/torch/csrc/api/src/optim/lbfgs.cpp @@ -232,7 +232,6 @@ std::tuple _strong_wolfe(const Function& obj_fu auto d_norm = val(d.abs().max()); g = g.clone(at::MemoryFormat::Contiguous); // evaluate objective and gradient using initial step - auto obj_func_res = obj_func(x, t, d); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) double f_new; Tensor g_new; @@ -285,7 +284,6 @@ std::tuple _strong_wolfe(const Function& obj_fu f_prev = f_new; g_prev = g_new.clone(at::MemoryFormat::Contiguous); gtd_prev = gtd_new; - obj_func_res = obj_func(x, t, d); std::tie(f_new, g_new) = obj_func(x, t, d); ls_func_evals += 1; gtd_new = g_new.dot(d); @@ -335,9 +333,7 @@ std::tuple _strong_wolfe(const Function& obj_fu } // Evaluate new point - obj_func_res = obj_func(x, t, d); - f_new = std::get<0>(obj_func_res); - g_new = std::get<1>(obj_func_res); + std::tie(f_new, g_new) = obj_func(x, t, d); ls_func_evals += 1; gtd_new = g_new.dot(d); ls_iter += 1; From 32dd4a8639025bd3ed2541736be85bf8c26f5990 Mon Sep 17 00:00:00 2001 From: Huamin Li Date: Tue, 15 Feb 2022 08:07:58 -0800 Subject: [PATCH 045/199] move fx_acc out of pytorch core (#72803) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72803 as title Reviewed By: jfix71 Differential Revision: D34101788 fbshipit-source-id: a9fd84671929af21405c049603e9895ec68de3d8 (cherry picked from commit e98fd1c32d34d26a3a4e8ad8ace041735312422e) --- docs/source/conf.py | 1 - test/fx_acc/test_acc_tracer.py | 2104 ----------------- torch/fx/experimental/const_fold.py | 24 +- torch/fx/experimental/fx_acc/__init__.py | 0 .../fx/experimental/fx_acc/acc_normalizer.py | 444 ---- .../experimental/fx_acc/acc_op_properties.py | 45 - torch/fx/experimental/fx_acc/acc_ops.py | 1924 --------------- torch/fx/experimental/fx_acc/acc_tracer.py | 462 ---- torch/fx/experimental/fx_acc/acc_utils.py | 175 -- torch/fx/passes/graph_manipulation.py | 2 +- torch/testing/_internal/common_fx2trt.py | 2 +- 11 files changed, 24 insertions(+), 5159 deletions(-) delete mode 100644 test/fx_acc/test_acc_tracer.py delete mode 100644 torch/fx/experimental/fx_acc/__init__.py delete mode 100644 torch/fx/experimental/fx_acc/acc_normalizer.py delete mode 100644 torch/fx/experimental/fx_acc/acc_op_properties.py delete mode 100644 torch/fx/experimental/fx_acc/acc_ops.py delete mode 100644 torch/fx/experimental/fx_acc/acc_tracer.py delete mode 100644 torch/fx/experimental/fx_acc/acc_utils.py diff --git a/docs/source/conf.py 
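(Illustrative aside on the lbfgs.cpp change above, not part of the patch: the strong-Wolfe loop used to call obj_func(x, t, d) and immediately discard the result before calling it again through std::tie, so every line-search step paid for an extra closure evaluation; removing the dead call is what makes the optimizer roughly twice as fast. A self-contained sketch of the before/after pattern follows, with simplified types; the real obj_func returns std::tuple<double, Tensor>.)

#include <functional>
#include <tuple>

// Simplified stand-in for the line-search objective closure.
using ObjFn = std::function<std::tuple<double, double>(double /*t*/)>;

void line_search_step(const ObjFn& obj_func, double t,
                      double& f_new, double& g_new, int& ls_func_evals) {
  // Before the fix, the code effectively did:
  //   auto obj_func_res = obj_func(t);        // result never used
  //   std::tie(f_new, g_new) = obj_func(t);   // evaluated a second time
  // After the fix, the objective is evaluated exactly once per step:
  std::tie(f_new, g_new) = obj_func(t);
  ls_func_evals += 1;
}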
b/docs/source/conf.py index 8cf8459614d..0e55297b27b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -243,7 +243,6 @@ coverage_missing_automodule = [ "torch.fft", "torch.for_onnx", "torch.fx.experimental", - "torch.fx.experimental.fx_acc", "torch.fx.experimental.unification", "torch.fx.experimental.unification.multipledispatch", "torch.fx.passes", diff --git a/test/fx_acc/test_acc_tracer.py b/test/fx_acc/test_acc_tracer.py deleted file mode 100644 index f16eef8e528..00000000000 --- a/test/fx_acc/test_acc_tracer.py +++ /dev/null @@ -1,2104 +0,0 @@ -# Owner(s): ["oncall: fx"] - -import unittest -from typing import Callable, List - -import numpy as np -import torch -import torch.fx.experimental.fx_acc.acc_normalizer as acc_normalizer -import torch.fx.experimental.fx_acc.acc_ops as acc_ops -import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer -import torch.fx.experimental.fx_acc.acc_utils as acc_utils -import torch.nn as nn -import torchvision -from parameterized import parameterized, param - -torch.manual_seed(0) - - -class AccTracerTest(unittest.TestCase): - def _make_model_unit_test( - self, - model, - *args, - input_shape=None, - enable_allclose=False, - **kwargs, - ): - """ - Test that the model can be traced correctly and is producing correct - result. - """ - if input_shape is None: - input_shape = [1, 3, 224, 224] - input = torch.randn(input_shape) - traced = acc_tracer.trace(model, [input]) - if enable_allclose: - torch.testing.assert_allclose(model(input), traced(input)) - else: - self.assertTrue(torch.equal(model(input), traced(input))) - traced_again = acc_tracer.trace(traced, [input]) - if enable_allclose: - torch.testing.assert_allclose(model(input), traced_again(input)) - else: - self.assertTrue(torch.equal(model(input), traced_again(input))) - - def _make_acc_op_function_test( - self, - acc_op: Callable, - torch_op, - *args, - input_shape=(2, 3), - validate_same_kwargs=True, - enable_allclose=False, - **kwargs, - ): - """ - Test that acc_op is traced somewhat. 
- """ - - class TestModule(torch.nn.Module): - def __init__(self, torch_op, args, kwargs): - super().__init__() - self._torch_op = torch_op - self._args = args - self._kwargs = kwargs - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self._torch_op(a, *self._args, **self._kwargs) - m = TestModule(torch_op, args, kwargs) - m.eval() - a = torch.randn(*input_shape) - traced = acc_tracer.trace(m, [a]) - ph_a = acc_op_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_op) - self.assertEqual(node.kwargs["input"], ph_a) - if validate_same_kwargs: - for key, value in kwargs.items(): - self.assertEqual(node.kwargs[key], value) - acc_op_node = node - elif node.op == "output": - if acc_op is None: - # If we expect no new acc_op after graph building - # and found we have only output in traced graph - continue - self.assertEqual(acc_op_node, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref_outputs = m(a) - outputs = traced(a) - traced_again = acc_tracer.trace(traced, [a]) - outputs_again = traced_again(a) - if isinstance(ref_outputs, torch.Tensor): - ref_outputs = [ref_outputs] - outputs = [outputs] - outputs_again = [outputs_again] - - for ref_output, output, output_again in zip( - ref_outputs, outputs, outputs_again - ): - if enable_allclose: - torch.testing.assert_allclose( - torch.nan_to_num(ref_output), torch.nan_to_num(output) - ) - torch.testing.assert_allclose( - torch.nan_to_num(ref_output), torch.nan_to_num(output_again) - ) - else: - self.assertTrue( - torch.equal(torch.nan_to_num(ref_output), torch.nan_to_num(output)) - ) - self.assertTrue( - torch.equal( - torch.nan_to_num(ref_output), torch.nan_to_num(output_again) - ) - ) - - def test_sum(self): - self._make_acc_op_function_test(acc_ops.sum, torch.sum) - self._make_acc_op_function_test(acc_ops.sum, torch.sum, dim=(1,), keepdim=True) - - def test_prod(self): - self._make_acc_op_function_test(acc_ops.prod, torch.prod) - self._make_acc_op_function_test(acc_ops.prod, torch.prod, dim=1, keepdim=True) - - def test_mean(self): - self._make_acc_op_function_test(acc_ops.mean, torch.mean) - self._make_acc_op_function_test( - acc_ops.mean, torch.mean, dim=(1,), keepdim=True - ) - - def test_pad(self): - self._make_acc_op_function_test( - acc_ops.pad, torch.nn.functional.pad, pad=(2, 0) - ) - - def test_max(self): - def torch_max(x, *args, **kwargs): - return x.max(*args, **kwargs) - - self._make_acc_op_function_test(acc_ops.max_full_reduce, torch_max) - self._make_acc_op_function_test( - acc_ops.max_dim_reduce, torch_max, dim=1, keepdim=True - ) - self._make_acc_op_function_test( - acc_ops.max_dim_reduce, torch_max, input_shape=(1, 4), dim=1, keepdim=True - ) - self._make_acc_op_function_test( - acc_ops.max_dim_reduce, torch_max, input_shape=(3, 4, 3), dim=2 - ) - - @parameterized.expand( - [ - param("max_maximum", orig_op=torch.max, expected_op=acc_ops.maximum), - param( - "maximum_maximum", orig_op=torch.maximum, expected_op=acc_ops.maximum - ), - param("min_minimum", orig_op=torch.min, expected_op=acc_ops.minimum), - param( - "minimum_minimum", orig_op=torch.minimum, expected_op=acc_ops.minimum - ), - ] - ) - def test_maximum_minimum(self, _: str, orig_op, expected_op): - class TestModule(torch.nn.Module): - def __init__(self, orig_op): - super().__init__() - self.orig_op = orig_op - - def forward(self, input: torch.Tensor, other: torch.Tensor) -> 
torch.Tensor: - return self.orig_op(input, other) - - m = TestModule(orig_op) - input, other = torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [input, other]) - - ph_in = ph_oth = mxm = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "other": - ph_oth = node - else: - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == expected_op: - self.assertEqual(node.kwargs["input"], ph_in) - self.assertEqual(node.kwargs["other"], ph_oth) - mxm = node - elif node.op == "output": - self.assertEqual(mxm, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, other), traced(input, other))) - - def test_conv(self): - """ - Test that a conv is traced as expected. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(8, 7, 3, stride=2) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.conv(a) - - m = TestModule() - input = torch.randn(3, 8, 10, 10) - traced = acc_tracer.trace(m, [input]) - - ph = weight_attr = bias_attr = conv = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "conv.weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "conv.bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.conv2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - self.assertEqual(node.kwargs["stride"], (2, 2)) - self.assertEqual(node.kwargs["padding"], (0, 0)) - self.assertEqual(node.kwargs["dilation"], (1, 1)) - self.assertEqual(node.kwargs["groups"], 1) - conv = node - elif node.op == "output": - self.assertEqual(conv, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_quantized_conv2d(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.quantized.Conv2d(3, 3, 1) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.conv(a) - - m = TestModule() - input = torch.quantize_per_tensor( - torch.randn(1, 3, 1, 1), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - print(traced.graph) - ph = weight_attr = bias_attr = conv = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "conv_weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "conv_bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.quantized_conv2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - conv = node - elif node.op == "output": - self.assertEqual(conv, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_quantized_convrelu2d(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.conv = nn.intrinsic.quantized.ConvReLU2d(3, 3, 1) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return 
self.conv(a) - - m = TestModule() - input = torch.quantize_per_tensor( - torch.randn(1, 3, 1, 1), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - ph = weight_attr = bias_attr = conv = relu = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "conv_weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "conv_bias": - bias_attr = node - elif node.op == "call_function" and node.target == acc_ops.quantized_conv2d: - self.assertEqual(node.target, acc_ops.quantized_conv2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - conv = node - elif node.op == "call_function" and node.target == acc_ops.relu: - self.assertEqual(node.target, acc_ops.relu) - self.assertEqual(node.kwargs["input"], conv) - relu = node - elif node.op == "output": - self.assertEqual(relu, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_embedding_bag(self): - """ - Test that an embedding_bag is traced as expected. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.eb = nn.EmbeddingBag(10, 3, mode="sum", include_last_offset=True) - - def forward(self, inp: torch.Tensor, offsets: torch.Tensor) -> torch.Tensor: - return self.eb(inp, offsets) - - m = TestModule() - inp = torch.LongTensor([1, 2, 4, 5, 4, 3, 2, 9]) - offsets = torch.LongTensor([0, 4]) - traced = acc_tracer.trace(m, [inp, offsets]) - - inp_node = offsets_node = weight_attr = eb_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "inp": - inp_node = node - elif str(node.target) == "offsets": - offsets_node = node - else: - self.fail(f"Unexpected placeholder {node.target}.") - continue - elif node.op == "get_attr" and node.target == "eb.weight": - weight_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.embedding_bag) - # Note: Normalization called from acc_tracer means we use all kwargs. - self.assertEqual(node.kwargs["input"], inp_node) - self.assertEqual(node.kwargs["offsets"], offsets_node) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["mode"], "sum") - self.assertEqual(node.kwargs["include_last_offset"], True) - # The rest of these were unspecified, so verify they fell back - # to their respective default values thanks to normalization. - self.assertEqual(node.kwargs["max_norm"], None) - self.assertEqual(node.kwargs["norm_type"], 2.0) - self.assertEqual(node.kwargs["scale_grad_by_freq"], False) - self.assertEqual(node.kwargs["sparse"], False) - self.assertEqual(node.kwargs["per_sample_weights"], None) - eb_node = node - elif node.op == "output": - self.assertEqual(eb_node, node.args[0]) - - self.assertTrue(torch.equal(m(inp, offsets), traced(inp, offsets))) - - def test_embedding_bag_byte_and_4bit_rowwise_offsets(self): - """ - Test that 4 bit quantized embedding_bag is traced as expected. 
- """ - - class TestModule(nn.Module): - def __init__( - self, - op, - q_weights, - per_index_weights, - ): - super().__init__() - self.emb = op - self.q_weights = q_weights - self.per_index_weights = per_index_weights - - def forward( - self, - indices, - offsets, - ): - return self.emb( - self.q_weights, - indices, - offsets, - mode=0, - per_sample_weights=self.per_index_weights, - include_last_offset=True, - ) - - def run_embedding_bag_test(is_4bit, use_weights): - # generate random indices, offsets, and weights. - num_embeddings = 16 - embedding_dim = 32 - num_lengths = 10 - - weights = torch.from_numpy( - (np.random.random_sample((num_embeddings, embedding_dim)) + 1).astype( - np.float32 - ) - ) - q_weights = ( - torch.ops.quantized.embedding_bag_4bit_prepack(weights) - if is_4bit - else torch.ops.quantized.embedding_bag_byte_prepack(weights) - ) - np_lengths = np.random.randint(0, num_lengths, size=10).astype(np.int32) - - num_lengths = np.sum(np_lengths) - indices = torch.from_numpy( - np.random.randint(low=0, high=num_embeddings, size=num_lengths) - ).int() - - lengths = torch.from_numpy(np_lengths) - offsets = torch.cat([torch.zeros([1]), torch.cumsum(lengths, 0)]).int() - - weights = torch.randint(low=0, high=4, size=indices.size()) - per_sample_weights = weights.to(torch.float32) - - indices = indices.to(torch.int32) - offsets = offsets.to(torch.int32) - inputs = [ - indices, - offsets, - ] - - op = ( - torch.ops.quantized.embedding_bag_4bit_rowwise_offsets - if is_4bit - else torch.ops.quantized.embedding_bag_byte_rowwise_offsets - ) - - m = TestModule( - op, - q_weights, - per_sample_weights, - ) - - traced = acc_tracer.trace(m, inputs) - print(traced.graph) - - expected_target = ( - acc_ops.embedding_bag_4bit_rowwise_offsets - if is_4bit - else acc_ops.embedding_bag_byte_rowwise_offsets - ) - - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "indices": - inp_node = node - elif str(node.target) == "offsets": - offsets_node = node - else: - self.fail(f"Unexpected placeholder {node.target}.") - continue - elif node.op == "get_attr" and node.target == "q_weights": - weight_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, expected_target) - # Note: Normalization called from acc_tracer means we use all kwargs. - self.assertEqual(node.kwargs["indices"], inp_node) - self.assertEqual(node.kwargs["offsets"], offsets_node) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["mode"], 0) - self.assertEqual(node.kwargs["include_last_offset"], True) - # The rest of these were unspecified, so verify they fell back - # to their respective default values thanks to normalization. 
- eb_node = node - elif node.op == "output": - self.assertEqual(eb_node, node.args[0]) - self.assertTrue(torch.equal(m(indices, offsets), traced(indices, offsets))) - - # test 8-bit - run_embedding_bag_test(is_4bit=False, use_weights=True) - # test 4-bit - run_embedding_bag_test(is_4bit=True, use_weights=True) - - def test_quantized_batch_norm2d(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.bn = nn.quantized.BatchNorm2d(3) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.bn(a) - - m = TestModule() - m.eval() - input = torch.quantize_per_tensor( - torch.randn(1, 3, 1, 1), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - ph = weight_attr = bias_attr = bn_mean = bn_var = bn = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "bn.weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "bn.bias": - bias_attr = node - elif node.op == "get_attr" and node.target == "bn.running_mean": - bn_mean = node - elif node.op == "get_attr" and node.target == "bn.running_var": - bn_var = node - elif node.op == "get_attr" and node.target == "bn.scale": - bn_scale = node - elif node.op == "get_attr" and node.target == "bn.zero_point": - bn_zero_point = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.quantized_batch_norm2d) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - self.assertEqual(node.kwargs["running_mean"], bn_mean) - self.assertEqual(node.kwargs["running_var"], bn_var) - self.assertEqual(node.kwargs["acc_out_ty"][6]["scale"], bn_scale) - self.assertEqual( - node.kwargs["acc_out_ty"][6]["zero_point"], bn_zero_point - ) - bn = node - elif node.op == "output": - self.assertEqual(bn, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_linear(self): - """ - Test that a linear is traced as expected, i.e. to the functional level and with - kwarg normalization. Also verify that symbolic shape inference worked as part of - the acc_tracer. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.Linear(3, 5, bias=True) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.linear(a) - - m = TestModule() - test_input = torch.randn(1, 3) - traced = acc_tracer.trace(m, test_input) - ph = weight_attr = bias_attr = linear = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "linear.weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "linear.bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.linear) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - linear = node - elif node.op == "output": - self.assertEqual(linear, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - self.assertTrue(torch.equal(m(test_input), traced(test_input))) - - def test_quantized_linear(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.quantized.Linear(3, 5) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.linear(a) - - m = TestModule() - input = torch.quantize_per_tensor( - torch.randn(2, 3), scale=0.01, zero_point=3, dtype=torch.quint8 - ) - traced = acc_tracer.trace(m, [input]) - ph = weight_attr = bias_attr = linear = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "linear_weight": - weight_attr = node - elif node.op == "get_attr" and node.target == "linear_bias": - bias_attr = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.quantized_linear) - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight_attr) - self.assertEqual(node.kwargs["bias"], bias_attr) - linear = node - elif node.op == "output": - self.assertEqual(linear, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input), traced(input))) - - @parameterized.expand( - [ - param("remove_exceptions_false", remove_exceptions=False), - param("remove_exceptions_true", remove_exceptions=True), - ] - ) - def test_batch_norm(self, _, remove_exceptions): - """ - Test that a batch norm is traced as expected, i.e. to the functional level - and with kwarg normalization. Note that we also expect to see a - ConditionalExceptionWrapper in the graph that the AST rewriter converted - from `if x: raise y`. - - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.bn = torch.nn.BatchNorm2d(2) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.bn(a) - - m = TestModule() - input = torch.randn(2, 2, 1, 1) - # Note: Explicitly not removing exceptions so that we can check they - # were found and exist below. 
- traced = acc_tracer.trace( - m, - [input], - remove_exceptions=remove_exceptions, - ) - - ph = exception_wrapper = weight = bias = mean = var = bn = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "get_attr" and node.target == "bn.weight": - weight = node - elif node.op == "get_attr" and node.target == "bn.bias": - bias = node - elif node.op == "get_attr" and node.target == "bn.running_mean": - mean = node - elif node.op == "get_attr" and node.target == "bn.running_var": - var = node - elif node.op == "call_function" and node.target == acc_ops.batch_norm: - # Note: Normalization called from acc_tracer means we use - # all kwargs. - self.assertEqual(node.kwargs["input"], ph) - self.assertEqual(node.kwargs["weight"], weight) - self.assertEqual(node.kwargs["bias"], bias) - self.assertEqual(node.kwargs["running_mean"], mean) - self.assertEqual(node.kwargs["running_var"], var) - bn = node - elif ( - node.op == "call_module" - and node.target == "bn._conditional_exception_wrapper_ValueError" - ): - exception_wrapper = node - elif node.op == "output": - self.assertEqual(bn, node.args[0]) - - self.assertTrue(remove_exceptions or exception_wrapper is not None) - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_remove_asserts(self): - """ - Test that a Module with asserts has the asserts automatically removed, as - well as calls to a class method that should be dead. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def _test_method(self, a): - return a - - def forward(self, a: torch.Tensor) -> torch.Tensor: - assert torch.equal(self._test_method(a), a) - return a - - m = TestModule() - input = torch.randn(10) - traced = acc_tracer.trace(m, [input], ast_rewriter_allow_list={TestModule}) - # Check we have no call_functions. If remove asserts didn't work - # correctly we would see a call to torch._assert, _test_method, and - # torch.equal. - for node in traced.graph.nodes: - self.assertFalse(node.op == "call_function") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_no_rewrite_leaf_module(self): - """ - Test that when we supply a leaf module, we don't rewrite it - """ - - class TestChildModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return a.relu() - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.child = TestChildModule() - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.child(a) + self.child(a) - - m = TestModule() - input = torch.randn(10) - traced = acc_tracer.trace(m, [input], leaf_module_list={TestChildModule}) - # trace it again just in case - traced = acc_tracer.trace(traced, [input], leaf_module_list={TestChildModule}) - - for _, m in traced.named_children(): - self.assertFalse("__AccRewrittenModule" in str(type(m)), str(type(m))) - - def test_sequential(self): - """ - Test that the tracer works for torch.nn.Sequential. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.model = nn.Sequential(nn.Sigmoid(), nn.ReLU()) - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return self.model(a) - - m = TestModule() - input = torch.randn(10) - traced = acc_tracer.trace(m, [input]) - - for node in traced.graph.nodes: - if node.op == "call_function": - is_sigmoid = node.target == acc_ops.sigmoid - is_relu = node.target == acc_ops.relu - self.assertTrue(is_sigmoid or is_relu) - else: - self.assertTrue(node.op == "placeholder" or node.op == "output") - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_unsqueeze(self): - """ - Test that torch.unsqueeze is traced correctly. - """ - self._make_acc_op_function_test( - acc_ops.unsqueeze, - torch.unsqueeze, - validate_same_kwargs=False, - dim=1, - ) - - def test_stack(self): - """ - Test that torch.stack is traced correctly. - """ - - class TestModule(torch.nn.Module): - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - return torch.stack((a, b), dim=1) - - a, b = torch.randn(4, 5, 6), torch.randn(4, 5, 6) - mod = TestModule() - traced = acc_tracer.trace(mod, [a, b]) - self.assertTrue(torch.equal(mod(a, b), traced(a, b))) - - ph_a = ph_b = unsqueeze_a = unsqueeze_b = cat_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - ph_b = node - elif node.op == "call_function": - if node.target == acc_ops.unsqueeze: - if node.kwargs["input"] is ph_a: - unsqueeze_a = node - else: - self.assertEqual(node.kwargs["input"], ph_b) - unsqueeze_b = node - else: - self.assertEqual(node.target, acc_ops.cat) - self.assertEqual(node.kwargs["tensors"], [unsqueeze_a, unsqueeze_b]) - cat_node = node - elif node.op == "output": - self.assertEqual(cat_node, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - def test_no_raise(self): - """ - self that we can trace `if x: raise y(msg)` when the raise isn't executed. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a, b): - if torch.equal(a, b): - raise AssertionError("a equaled b!") - return a - - m = TestModule() - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=False, - use_acc_normalization=False, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. - ph_a = exception_wrapper = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "call_module": - self.assertEqual( - node.target, "_conditional_exception_wrapper_AssertionError" - ) - exception_wrapper = node - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - - self.assertTrue(exception_wrapper is not None) - - self.assertTrue(torch.equal(m(in_a, in_b), traced(in_a, in_b))) - - def test_yes_raise(self): - """ - Test that we can trace `if x: raise y(msg)` when the raise is executed. - """ - err_str = "a equaled b!" 
- - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.err_str = err_str - - def forward(self, a, b): - if torch.equal(a, b): - raise RuntimeError(self.err_str) - return a - - m = TestModule() - # Note: We must use different inputs here in order for shape_prop to work, as - # otherwise the exception is thrown (as expected/checked below). - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=False, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. - ph_a = exception_wrapper = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "call_module": - self.assertEqual( - node.target, "_conditional_exception_wrapper_RuntimeError" - ) - exception_wrapper = node - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - - self.assertTrue(exception_wrapper is not None) - - def test(mod): - try: - # Note: Use the same input here to ensure the exception is thrown. - mod(in_a, in_a) - self.fail("Shouldn't get here because exception should be thrown.") - except RuntimeError as e: - self.assertEqual(err_str, str(e)) - - test(m) - test(traced) - - def test_remove_raise(self): - """ - Test that we can trace `if x: raise y(msg)` and then remove the exception_wrapper. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a, b): - if torch.equal(a, b): - raise AssertionError("a equaled b!") - return a - - m = TestModule() - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=True, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. - ph_a = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - else: - # Should not encounter any call_modules, e.g. to the - # exception_wrapper. - self.assertFalse(node.op == "call_module") - - # Note: Using input in_a twice for the tracer version, which would - # trigger the raise if it was still there. - self.assertTrue(torch.equal(m(in_a, in_b), traced(in_a, in_a))) - - def test_raise_no_message(self): - """ - Test that we can trace `if x: raise y` when `y` has no message. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a, b): - if torch.equal(a, b): - raise AssertionError - return a - - m = TestModule() - in_a, in_b = torch.randn(5), torch.randn(5) - traced = acc_tracer.trace( - m, - [in_a, in_b], - remove_exceptions=False, - use_acc_normalization=False, - ast_rewriter_allow_list={TestModule}, - ) - - # Verify the structure of the graph, including the existence of the - # exception_wrapper. 
- ph_a = exception_wrapper = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - elif node.op == "call_module": - self.assertEqual( - node.target, "_conditional_exception_wrapper_AssertionError" - ) - exception_wrapper = node - elif node.op == "output": - self.assertEqual(ph_a, node.args[0]) - - self.assertTrue(exception_wrapper is not None) - self.assertTrue(torch.equal(m(in_a, in_b), traced(in_a, in_b))) - - def test_quantized_add(self): - """ - Test that a quantized_add and acc_ops.quantize_per_tensor are traced as expected, - verifying the acc_out_tys are set as expected. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.q_input = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=5, dtype=torch.quint8 - ) - self.q_other = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=10, dtype=torch.quint8 - ) - - def forward(self, input: torch.Tensor, other: torch.Tensor) -> torch.Tensor: - return torch.ops.quantized.add( - self.q_input(input), - self.q_other(other), - scale=0.05, - zero_point=1, - ) - - m = TestModule() - input, other = torch.randn(2, 3, 4), torch.randn(2, 3, 4) - traced = acc_tracer.trace(m, [input, other]) - - input_ph = other_ph = q_input = q_other = q_add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "input": - input_ph = node - else: - self.assertTrue(str(node.target) == "other") - other_ph = node - elif ( - node.op == "call_function" - and node.target == acc_ops.quantize_per_tensor - ): - qparams = { - "scale": 1.0 / 128, - "zero_point": 5, - } - expected_md = acc_utils.build_raw_tensor_meta( - dtype=torch.quint8, - qparams=qparams, - ) - if node.kwargs["input"] == input_ph: - q_input = node - else: - self.assertTrue(node.kwargs["input"] == other_ph) - q_other = node - qparams_copy = qparams.copy() - qparams_copy["zero_point"] = 10 - expected_md = expected_md._replace(qparams=qparams_copy) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - elif node.op == "call_function" and node.target == acc_ops.quantized_add: - self.assertEqual(node.kwargs["input"], q_input) - self.assertEqual(node.kwargs["other"], q_other) - qparams = { - "scale": 0.05, - "zero_point": 1, - } - expected_md = acc_utils.build_raw_tensor_meta(qparams=qparams) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - q_add = node - elif node.op == "output": - self.assertEqual(q_add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, other), traced(input, other))) - - def test_quantized_mul(self): - """ - Test that a quantized_mul and acc_ops.quantize_per_tensor are traced as expected, - verifying the acc_out_tys are set as expected. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.q_input = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=5, dtype=torch.quint8 - ) - self.q_other = torch.nn.quantized.Quantize( - scale=1.0 / 128, zero_point=10, dtype=torch.quint8 - ) - - def forward(self, input: torch.Tensor, other: torch.Tensor) -> torch.Tensor: - return torch.ops.quantized.mul( - self.q_input(input), - self.q_other(other), - scale=0.05, - zero_point=1, - ) - - m = TestModule() - input, other = torch.randn(2, 3, 4), torch.randn(2, 3, 4) - traced = acc_tracer.trace(m, [input, other]) - - input_ph = other_ph = q_input = q_other = q_add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "input": - input_ph = node - else: - self.assertTrue(str(node.target) == "other") - other_ph = node - elif ( - node.op == "call_function" - and node.target == acc_ops.quantize_per_tensor - ): - qparams = { - "scale": 1.0 / 128, - "zero_point": 5, - } - expected_md = acc_utils.build_raw_tensor_meta( - dtype=torch.quint8, - qparams=qparams, - ) - if node.kwargs["input"] == input_ph: - q_input = node - else: - self.assertTrue(node.kwargs["input"] == other_ph) - q_other = node - qparams_copy = qparams.copy() - qparams_copy["zero_point"] = 10 - expected_md = expected_md._replace(qparams=qparams_copy) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - elif node.op == "call_function" and node.target == acc_ops.quantized_mul: - self.assertEqual(node.kwargs["input"], q_input) - self.assertEqual(node.kwargs["other"], q_other) - qparams = { - "scale": 0.05, - "zero_point": 1, - } - expected_md = acc_utils.build_raw_tensor_meta(qparams=qparams) - self.assertEqual(node.kwargs["acc_out_ty"], expected_md) - q_add = node - elif node.op == "output": - self.assertEqual(q_add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, other), traced(input, other))) - - def test_cat(self): - """ - Test that torch.cat is traced correctly. - """ - - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - return torch.cat([a, a, b], 0) - - m = TestModule() - a, b = torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, (a, b)) - - ph_a = ph_b = cat = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - ph_b = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.cat) - self.assertEqual(node.kwargs["tensors"][0], ph_a) - self.assertEqual(node.kwargs["tensors"][1], ph_a) - self.assertEqual(node.kwargs["tensors"][2], ph_b) - self.assertEqual(node.kwargs["dim"], 0) - cat = node - elif node.op == "output": - self.assertEqual(cat, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(a, b), traced(a, b))) - - def test_square(self): - """ - Test that torch.square is traced correctly. - """ - self._make_acc_op_function_test(acc_ops.mul, torch.square) - - def test_reshape(self): - """ - Test that torch.reshape is traced correctly. 
- """ - self._make_acc_op_function_test(acc_ops.reshape, torch.reshape, (1, -1)) - # arg = (1, -1) - self._make_acc_op_function_test(acc_ops.reshape, lambda x: x.reshape(1, -1)) - # arg = ((1, -1)) - self._make_acc_op_function_test(acc_ops.reshape, lambda x: x.reshape((1, -1))) - - def test_transpose(self): - """ - Test that torch.transpose is traced correctly. - """ - self._make_acc_op_function_test( - acc_ops.permute, lambda x: torch.transpose(x, 1, 0) - ) - - def test_permute(self): - """ - Test that torch.permute is traced correctly. - """ - - def torch_permute(a, *dim): - return a.permute(*dim) - - self._make_acc_op_function_test(acc_ops.permute, torch_permute, 1, 0) - - def test_min_full_reduce(self): - """ - Test that test_min_full_reduce is traced correctly. - """ - self._make_acc_op_function_test(acc_ops.min_full_reduce, torch.min) - - def test_matmul(self): - """ - Test that torch.matmul is traced correctly. - """ - - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - return torch.matmul(a, b) - - m = TestModule() - a, b = torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [a, b]) - - ph_a = ph_b = matmul = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - else: - self.assertTrue(str(node.target) == "b") - ph_b = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.matmul) - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - matmul = node - elif node.op == "output": - self.assertEqual(matmul, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(a, b), traced(a, b))) - - def test_bmm(self): - self._make_acc_op_function_test( - acc_ops.matmul, lambda x: torch.bmm(x, x), input_shape=(2, 4, 4) - ) - - def test_tile(self): - return self._make_acc_op_function_test( - acc_ops.tile, lambda x: torch.tile(x, (2, 1, 2)), input_shape=(1, 2) - ) - - def test_dropout(self): - self._make_acc_op_function_test( - None, - lambda x: nn.functional.dropout(x, training=False), - input_shape=(1, 2, 3), - ) - - def test_stochastic_depth(self): - self._make_acc_op_function_test( - None, - lambda x, p, mode, training: torchvision.ops.stochastic_depth( - x, p=p, mode=mode, training=training - ), - input_shape=(1, 2, 3), - p=0.5, - mode="row", - training=False, - ) - - def test_hardsigmoid(self): - self._make_acc_op_function_test( - acc_ops.hardsigmoid, - lambda x: nn.functional.hardsigmoid(x), - input_shape=(3, 4, 5), - ) - - def test_hardtanh(self): - self._make_acc_op_function_test( - acc_ops.hardtanh, - lambda x: nn.functional.hardtanh(x), - input_shape=(3, 4, 5), - ) - - def test_hardswish(self): - class TestModule(nn.Module): - def forward(self, x: torch.Tensor) -> torch.Tensor: - y = nn.functional.hardswish(x) - return y - - m = TestModule() - x = torch.randn(3, 4, 5) - traced = acc_tracer.trace(m, x) - ph_x = hardsigmoid_y = res_y = None - for node in traced.graph.nodes: - if node.op == "placeholder": - ph_x = node - elif node.op == "call_function" and node.target == acc_ops.hardsigmoid: - hardsigmoid_y = node - self.assertEqual(node.kwargs["input"], ph_x) - elif node.op == "call_function" and node.target == acc_ops.mul: - res_y = node - self.assertEqual(node.kwargs["input"], hardsigmoid_y) - self.assertEqual(node.kwargs["other"], ph_x) - elif node.op == "output": - 
self.assertEqual(node.args[0], res_y) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref = m(x) - res = traced(x) - torch.testing.assert_allclose(ref, res) - - def test_add_with_alpha(self): - """ - Test that normalization works for torch add with alpha, which requires special - normalization handling. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor: - a1 = torch.add(a, b) - a2 = torch.add(a, b, alpha=1.0) - a3 = torch.add(a, b, alpha=0.5) - return a1, a2, a3 - - m = TestModule() - input_a = torch.randn(2, 3) - input_b = torch.randn(2, 3) - traced = acc_tracer.trace(m, [input_a, input_b]) - - ph_a = ph_b = add_1 = add_2 = add_3 = mul = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif str(node.target) == "b": - ph_b = node - else: - self.fail(f"Unexpected placeholder {node.target}.") - elif node.op == "call_function" and node.target == acc_ops.mul: - mul = node - self.assertEqual(node.kwargs["input"], ph_b) - self.assertEqual(node.kwargs["other"], 0.5) - elif node.op == "call_function" and node.target == acc_ops.add: - if add_1 is None: - add_1 = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - elif add_2 is None: - add_2 = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - elif add_3 is None: - add_3 = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], mul) - else: - self.fail(f"Unexpected add: {node.format_node()}") - elif node.op == "output": - self.assertEqual(node.args[0][0], add_1) - self.assertEqual(node.args[0][1], add_2) - self.assertEqual(node.args[0][2], add_3) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref = m(input_a, input_b) - res = traced(input_a, input_b) - self.assertTrue(torch.equal(ref[0], res[0])) - self.assertTrue(torch.equal(ref[1], res[1])) - self.assertTrue(torch.equal(ref[2], res[2])) - - def test_leaf_module_list(self): - """ - Test leaf_module_list is working properly. 
- """ - - class LeafModule(nn.Module): - def forward(self, x): - return x - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - self.mod = LeafModule() - - def forward(self, x): - return self.mod(x) - - x = torch.randn(1, 1) - mod = TestModule() - acc_mod = acc_tracer.trace( - mod, - [x], - leaf_module_list={LeafModule}, - ) - ph = leaf_module = None - for node in acc_mod.graph.nodes: - if node.op == "placeholder": - ph = node - elif node.op == "call_module": - leaf_module = node - self.assertEqual(leaf_module.target, "mod") - self.assertEqual(leaf_module.args[0], ph) - elif node.op == "output": - self.assertEqual(node.args[0], leaf_module) - else: - self.fail(f"Unexpected node: {node.format_node()}") - self.assertTrue(torch.equal(mod(x), acc_mod(x))) - - def test_sign(self): - self._make_acc_op_function_test(acc_ops.sign, torch.sign) - - def test_relu(self): - self._make_acc_op_function_test(acc_ops.relu, torch.relu) - - def test_leaky_relu(self): - self._make_acc_op_function_test( - acc_ops.leaky_relu, torch.nn.functional.leaky_relu - ) - - def test_elu(self): - self._make_acc_op_function_test(acc_ops.elu, torch.nn.functional.elu) - - def test_selu(self): - self._make_acc_op_function_test(acc_ops.selu, torch.nn.functional.selu) - - def test_softsign(self): - self._make_acc_op_function_test(acc_ops.softsign, torch.nn.functional.softsign) - - def test_sigmoid(self): - self._make_acc_op_function_test(acc_ops.sigmoid, torch.sigmoid) - - def test_sin(self): - self._make_acc_op_function_test(acc_ops.sin, torch.sin) - - def test_cos(self): - self._make_acc_op_function_test(acc_ops.cos, torch.cos) - - def test_tan(self): - self._make_acc_op_function_test(acc_ops.tan, torch.tan) - - def test_sinh(self): - self._make_acc_op_function_test(acc_ops.sinh, torch.sinh) - - def test_cosh(self): - self._make_acc_op_function_test(acc_ops.cosh, torch.cosh) - - def test_tanh(self): - self._make_acc_op_function_test(acc_ops.tanh, torch.tanh) - - def test_asin(self): - self._make_acc_op_function_test(acc_ops.asin, torch.asin) - - def test_acos(self): - self._make_acc_op_function_test(acc_ops.acos, torch.acos) - - def test_atan(self): - self._make_acc_op_function_test(acc_ops.atan, torch.atan) - - def test_exp(self): - self._make_acc_op_function_test(acc_ops.exp, torch.exp) - - def test_log(self): - self._make_acc_op_function_test(acc_ops.log, torch.log) - - def test_sqrt(self): - self._make_acc_op_function_test(acc_ops.sqrt, torch.sqrt) - - def test_reciprocal(self): - self._make_acc_op_function_test(acc_ops.reciprocal, torch.reciprocal) - - def test_abs(self): - self._make_acc_op_function_test(acc_ops.abs, torch.abs) - - def test_neg(self): - self._make_acc_op_function_test(acc_ops.neg, torch.neg) - - def test_floor(self): - self._make_acc_op_function_test(acc_ops.floor, torch.floor) - - def test_ceil(self): - self._make_acc_op_function_test(acc_ops.ceil, torch.ceil) - - def test_softmax(self): - self._make_acc_op_function_test(acc_ops.softmax, torch.nn.functional.softmax) - - def test_tensor_squeeze(self): - self._make_acc_op_function_test(acc_ops.squeeze, lambda x: x.squeeze()) - - def test_torch_squeeze(self): - self._make_acc_op_function_test(acc_ops.squeeze, lambda x: torch.squeeze(x)) - - def test_operator_mul(self): - self._make_acc_op_function_test(acc_ops.mul, lambda x: x * 7) - - def test_torch_mul(self): - self._make_acc_op_function_test(acc_ops.mul, lambda x: torch.mul(x, 7)) - - def test_div(self): - self._make_acc_op_function_test(acc_ops.div, lambda x: torch.div(x, 2)) - 
self._make_acc_op_function_test(acc_ops.div, lambda x: x / 2) - - def test_floor_div(self): - self._make_acc_op_function_test( - acc_ops.floor_div, lambda x: torch.div(x, 2, rounding_mode="floor") - ) - - def test_trunc_div(self): - self._make_acc_op_function_test( - acc_ops.trunc_div, lambda x: torch.div(x, 2, rounding_mode="trunc") - ) - self._make_acc_op_function_test( - acc_ops.trunc_div, lambda x: torch.floor_divide(x, 2) - ) - - def test_view(self): - """ - Test that Tensor.view is traced correctly. - """ - - self._make_acc_op_function_test(acc_ops.reshape, lambda x: x.view(1, -1)) - - def test_narrow(self): - """ - Test that torch.narrow is traced correctly. - """ - return self._make_acc_op_function_test( - acc_ops.slice_tensor, - torch.narrow, - validate_same_kwargs=False, - dim=1, - start=1, - length=2, - ) - - def test_pow(self): - self._make_acc_op_function_test(acc_ops.pow, torch.pow, exponent=2) - - def test_size(self): - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a): - idx = a.size(1) - return a.shape[idx] - - m = TestModule() - a = torch.randn(2, 1, 4) - traced = acc_tracer.trace(m, [a]) - - ph_a = size_1 = size_2 = getitem_1 = getitem_2 = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(node.target == "a") - ph_a = node - elif node.op == "call_function" and node.target == acc_ops.size: - if size_1: - size_2 = node - self.assertTrue(size_2.kwargs["input"] is ph_a) - else: - size_1 = node - self.assertTrue(size_1.kwargs["input"] is ph_a) - elif node.op == "call_function" and node.target == acc_ops.getitem: - if getitem_1: - getitem_2 = node - self.assertTrue(getitem_2.kwargs["idx"] == getitem_1) - self.assertTrue(getitem_2.kwargs["input"] == size_2) - else: - getitem_1 = node - self.assertTrue(getitem_1.kwargs["idx"] == 1) - self.assertTrue(getitem_1.kwargs["input"] == size_1) - elif node.op == "output": - self.assertEqual(node.args[0], getitem_2) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref = m(a) - res = traced(a) - self.assertEqual(ref, res) - - def test_flatten(self): - """ - Test that torch.flatten is traced correctly. - """ - self._make_acc_op_function_test( - acc_ops.flatten, torch.flatten, start_dim=1, end_dim=1 - ) - self._make_acc_op_function_test(acc_ops.flatten, lambda x: x.flatten()) - - def test_topk_multi_output(self): - """ - Test that torch.topk multi outputs work. 
- """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: torch.Tensor) -> torch.Tensor: - return torch.topk(a, 3)[1] - - m = TestModule() - input_a = torch.randn(10) - traced = acc_tracer.trace(m, [input_a]) - - ph_a = topk = getitem = None - for node in traced.graph.nodes: - if node.op == "placeholder" and str(node.target) == "a": - ph_a = node - elif node.op == "call_function" and node.target == acc_ops.topk: - topk = node - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["k"], 3) - elif node.op == "call_function" and node.target == acc_ops.getitem: - getitem = node - self.assertEqual(node.kwargs["input"], topk) - self.assertEqual(node.kwargs["idx"], 1) - elif node.op == "output": - self.assertEqual(node.args[0], getitem) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input_a), traced(input_a))) - - def test_addmm_with_alpha_beta(self): - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - - def forward( - self, input: torch.Tensor, a: torch.Tensor, b: torch.Tensor - ) -> torch.Tensor: - return torch.addmm(input, a, b, alpha=1.2, beta=1.1) - - m = TestModule() - input, a, b = torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [input, a, b]) - - ph_in = ph_a = ph_b = mm = add = mm_mul = add_mul = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif str(node.target) == "b": - ph_b = node - else: - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.matmul: - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - mm = node - elif node.target == acc_ops.add: - self.assertEqual(node.kwargs["input"], mm_mul) - self.assertEqual(node.kwargs["other"], add_mul) - add = node - elif mm_mul: - self.assertEqual(node.kwargs["input"], ph_in) - self.assertEqual(node.kwargs["other"], 1.1) - add_mul = node - else: - self.assertEqual(node.kwargs["input"], mm) - self.assertEqual(node.kwargs["other"], 1.2) - mm_mul = node - elif node.op == "output": - self.assertEqual(add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - torch.testing.assert_allclose(m(input, a, b), traced(input, a, b)) - - def test_log1p(self): - class TestModule(torch.nn.Module): - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.log1p(input) - - m = TestModule().eval() - input = torch.tensor([[1.2, 0.3, -0.4]]) - traced = acc_tracer.trace(m, [input]) - - ph_in = add = log = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.add: - self.assertEqual(node.kwargs["input"], ph_in) - self.assertEqual(node.kwargs["other"], 1) - add = node - else: - self.assertEqual(node.target, acc_ops.log) - self.assertEqual(node.kwargs["input"], add) - log = node - elif node.op == "output": - self.assertEqual(log, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - torch.testing.assert_allclose(m(input), traced(input)) - - def test_addmm(self): - class TestModule(torch.nn.Module): - def forward( - self, input: torch.Tensor, a: torch.Tensor, b: torch.Tensor - ) -> torch.Tensor: - return torch.addmm(input, a, b) - - m = TestModule() - input, a, b = torch.randn(2, 2), 
torch.randn(2, 2), torch.randn(2, 2) - traced = acc_tracer.trace(m, [input, a, b]) - - ph_in = ph_a = ph_b = mm = add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - if str(node.target) == "a": - ph_a = node - elif str(node.target) == "b": - ph_b = node - else: - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.matmul: - self.assertEqual(node.kwargs["input"], ph_a) - self.assertEqual(node.kwargs["other"], ph_b) - mm = node - else: - self.assertEqual(node.target, acc_ops.add) - self.assertEqual(node.kwargs["input"], mm) - self.assertEqual(node.kwargs["other"], ph_in) - add = node - elif node.op == "output": - self.assertEqual(add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - self.assertTrue(torch.equal(m(input, a, b), traced(input, a, b))) - - def test_gelu(self): - return self._make_acc_op_function_test(acc_ops.gelu, torch.nn.functional.gelu) - - @parameterized.expand( - [ - (1, True), - (1, False), - (None, False), - ] - ) - def test_argmin(self, dim, keepdim): - class TestModule(torch.nn.Module): - def __init__(self, dim, keepdim): - super().__init__() - self.dim = dim - self.keepdim = keepdim - - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.argmin(input, dim=self.dim, keepdim=self.keepdim) - - m = TestModule(dim, keepdim) - input = torch.randn(2, 2) - traced = acc_tracer.trace(m, [input]) - - ph_in = flatten = topk = getitem = squeeze = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.flatten: - self.assertEqual(node.kwargs["input"], ph_in) - flatten = node - elif node.target == acc_ops.topk: - self.assertEqual( - node.kwargs["input"], flatten if flatten else ph_in - ) - topk = node - elif node.target == acc_ops.getitem: - self.assertEqual(node.kwargs["input"], topk) - getitem = node - elif node.target == acc_ops.squeeze: - self.assertEqual(node.kwargs["input"], getitem) - squeeze = node - elif node.op == "output": - self.assertEqual(squeeze if squeeze else getitem, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - if dim is None: - self.assertTrue(flatten is not None) - if not keepdim: - self.assertTrue(squeeze is not None) - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_t(self): - """ - Test Tensor.t() is traced correctly. 
- """ - self._make_acc_op_function_test(acc_ops.permute, lambda x: x.t()) - self._make_acc_op_function_test( - acc_ops.permute, lambda x: x.t(), input_shape=(3,) - ) - - def test_split_size(self): - self._make_acc_op_function_test( - acc_ops.split, - torch.split, - validate_same_kwargs=False, - split_size_or_sections=2, - dim=1, - ) - - def test_split_sections(self): - class TestModule(torch.nn.Module): - def forward(self, input: torch.Tensor) -> torch.Tensor: - return torch.split(input, [2, 5, 3], 1) - - m = TestModule() - input = torch.randn(1, 10) - traced = acc_tracer.trace(m, [input]) - - ph_in = slice_node_0 = slice_node_1 = slice_node_2 = None - tuple_construct_node = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertTrue(str(node.target) == "input") - ph_in = node - elif node.op == "call_function": - if node.target == acc_ops.slice_tensor: - self.assertEqual(node.kwargs["input"], ph_in) - if slice_node_0: - if slice_node_1: - slice_node_2 = node - else: - slice_node_1 = node - else: - slice_node_0 = node - else: - self.assertEqual(node.target, acc_ops.tuple_construct) - self.assertEqual( - node.kwargs["tensors"], - (slice_node_0, slice_node_1, slice_node_2), - ) - tuple_construct_node = node - elif node.op == "output": - self.assertEqual(tuple_construct_node, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - ref_output = m(input) - output = traced(input) - for i, j in zip(ref_output, output): - self.assertTrue(torch.equal(i, j)) - - @parameterized.expand( - [ - ("neg_1", -1, 1, 3), - ("neg_2", -2, 1, 3), - ("neg_4", -4, 1, 1), - ] - ) - def test_negative_slicing(self, _, dim, start, length): - """ - Test that slicing with negative dims works. - """ - self._make_acc_op_function_test( - acc_ops.slice_tensor, - torch.narrow, - input_shape=(2, 3, 4, 5), - validate_same_kwargs=False, - dim=dim, - start=start, - length=length, - ) - - def test_list_input(self): - """ - Test that list inputs are traced correctly. - """ - - class TestModule(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, a: List[torch.Tensor]) -> torch.Tensor: - return a[0] + a[1] - - m = TestModule() - input = [torch.randn(2, 3), torch.randn(2, 3)] - traced = acc_tracer.trace(m, [input]) - - ph = getitem_0 = getitem_1 = add = None - for node in traced.graph.nodes: - if node.op == "placeholder": - self.assertEqual(str(node.target), "a") - ph = node - elif node.op == "call_function" and node.target == acc_ops.getitem: - self.assertTrue(node.kwargs["idx"] == 0 or node.kwargs["idx"] == 1) - if node.kwargs["idx"] == 0: - getitem_0 = node - else: - getitem_1 = node - elif node.op == "call_function": - self.assertEqual(node.target, acc_ops.add) - self.assertEqual(node.kwargs["input"], getitem_0) - self.assertEqual(node.kwargs["other"], getitem_1) - add = node - elif node.op == "output": - self.assertEqual(add, node.args[0]) - else: - self.fail(f"Unexpected node: {node.format_node()}") - - # Check the tensor metadatas are correct given the input is a list. - self.assertTrue(isinstance(ph.meta["tensor_meta"], list)) - self.assertEqual(len(ph.meta["tensor_meta"]), 2) - self.assertEqual(getitem_0.meta["tensor_meta"], ph.meta["tensor_meta"][0]) - self.assertEqual(getitem_1.meta["tensor_meta"], ph.meta["tensor_meta"][1]) - - self.assertTrue(torch.equal(m(input), traced(input))) - - def test_mobilenet_v3(self): - """ - Test that we can trace mobilenet v3 small and run/compare against the untraced version. 
- """ - m = torchvision.models.mobilenet_v3_small(pretrained=True) - self._make_model_unit_test(m, enable_allclose=True) - - def test_mobilenet_v2(self): - """ - Test that we can trace mobilenet v2 small and run/compare against the untraced version. - """ - m = torchvision.models.mobilenet_v2(pretrained=True) - self._make_model_unit_test(m) - - def test_vgg16(self): - """ - Test that we can trace vgg16 and run/compare against the untraced version. - """ - m = torchvision.models.vgg16(pretrained=True) - self._make_model_unit_test(m) - - def test_resnet18(self): - """ - Test that we can trace resnet18 and run/compare against the untraced version. - """ - m = torchvision.models.resnet18(pretrained=True) - self._make_model_unit_test(m) - - def test_resnext50_32x4d(self): - """ - Test that we can trace resnext and run/compare against the untraced version. - """ - m = torchvision.models.resnext50_32x4d(pretrained=True) - self._make_model_unit_test(m) - - def test_cumsum(self): - self._make_acc_op_function_test(acc_ops.cumsum, torch.cumsum, dim=1) - self._make_acc_op_function_test( - acc_ops.cumsum, torch.cumsum, dim=1, dtype=torch.float - ) - - def test_chunk(self): - self._make_acc_op_function_test(acc_ops.chunk, torch.chunk, chunks=2, dim=0) - - def test_retrace_reshape(self): - """ - Retrace reshape to verify it's retraceable. - """ - - class TestModule(torch.nn.Module): - def forward(self, a: torch.Tensor) -> torch.Tensor: - return a.reshape(a.size()[0], 1, 2) - - m = TestModule() - a = torch.randn(2, 2) - gm = acc_tracer.trace(m, [a]) - self.assertTrue(torch.equal(m(a), gm(a))) - gm_retrace = acc_tracer.trace(gm, [a]) - self.assertTrue(torch.equal(m(a), gm_retrace(a))) - - def test_all_acc_ops_registered(self): - self.assertEqual( - acc_normalizer._acc_ops, - { - acc_ops.linear, - acc_ops.max_pool2d, - acc_ops.flatten, - acc_ops.adaptive_avg_pool2d, - acc_ops.avg_pool2d, - acc_ops.add, - acc_ops.min_full_reduce, - acc_ops.min_dim_reduce, - acc_ops.minimum, - acc_ops.cat, - acc_ops.softmax, - acc_ops.sign, - acc_ops.permute, - acc_ops.matmul, - acc_ops.quantize_per_tensor, - acc_ops.quantize_per_channel, - acc_ops.quantized_add, - acc_ops.quantized_mul, - acc_ops.dequantize, - acc_ops.sub, - acc_ops.mul, - acc_ops.div, - acc_ops.floor_div, - acc_ops.trunc_div, - acc_ops.pow, - acc_ops.relu, - acc_ops.leaky_relu, - acc_ops.elu, - acc_ops.selu, - acc_ops.softsign, - acc_ops.tuple_construct, - acc_ops.unsqueeze, - acc_ops.sigmoid, - acc_ops.sum, - acc_ops.prod, - acc_ops.max_full_reduce, - acc_ops.max_dim_reduce, - acc_ops.maximum, - acc_ops.sinh, - acc_ops.cosh, - acc_ops.tanh, - acc_ops.asin, - acc_ops.acos, - acc_ops.atan, - acc_ops.exp, - acc_ops.log, - acc_ops.sqrt, - acc_ops.reciprocal, - acc_ops.abs, - acc_ops.neg, - acc_ops.floor, - acc_ops.ceil, - acc_ops.size, - acc_ops.split, - acc_ops.conv2d, - acc_ops.batch_norm, - acc_ops.embedding_bag, - acc_ops.embedding_bag_byte_rowwise_offsets, - acc_ops.embedding_bag_4bit_rowwise_offsets, - acc_ops.contiguous, - acc_ops.pad, - acc_ops.sin, - acc_ops.cos, - acc_ops.tan, - acc_ops.topk, - acc_ops.getitem, - acc_ops.squeeze, - acc_ops.tile, - acc_ops.reshape, - acc_ops.quantized_linear, - acc_ops.quantized_conv2d, - acc_ops.quantized_batch_norm2d, - acc_ops.to_dtype, - acc_ops.clamp, - acc_ops.layer_norm, - acc_ops.linalg_norm, - acc_ops.slice_tensor, - acc_ops.hardsigmoid, - acc_ops.mean, - acc_ops.hardtanh, - acc_ops.gelu, - acc_ops.cumsum, - acc_ops.chunk, - acc_ops.rescale_quantize_per_tensor, - acc_ops.rescale_quantize_per_channel, - 
acc_ops.nan_to_num, - }, - ) diff --git a/torch/fx/experimental/const_fold.py b/torch/fx/experimental/const_fold.py index a7365ee668f..1ccc498c565 100644 --- a/torch/fx/experimental/const_fold.py +++ b/torch/fx/experimental/const_fold.py @@ -1,7 +1,7 @@ +import re from typing import Callable, Dict, Set, Optional, Union import torch.fx -import torch.fx.experimental.fx_acc.acc_utils as acc_utils from torch.fx.node import map_arg from torch.fx.passes.split_module import split_module @@ -111,6 +111,26 @@ def _inline_module(gm: torch.fx.GraphModule, inline_mod_name: str): gm.graph.eliminate_dead_code() +def get_unique_attr_name_in_module(mod_traced: torch.fx.GraphModule, name: str) -> str: + """ + Make sure the name is unique (in a module) and can represents an attr. + """ + # Delete all characters that are illegal in a Python identifier. + name = re.sub("[^0-9a-zA-Z_]+", "_", name) + if name[0].isdigit(): + name = f"_{name}" + # Now make sure it is in fact unique to the module by incrementing suffix value. + while hasattr(mod_traced, name): + match = re.match(r"(.*)_(\d+)$", name) + if match is None: + name = name + "_1" + else: + base, num = match.group(1, 2) + name = f"{base}_{int(num) + 1}" + + return name + + def split_const_subgraphs( module: Union[torch.nn.Module, torch.fx.GraphModule], skip_folding_node_fn: Optional[Callable[[torch.fx.Node], bool]] = None @@ -222,7 +242,7 @@ def split_const_subgraphs( # folded tensor(s) that result from constant folding. Note that we don't need to # worry about whether this is one or more tensors because the original graph # correctly uses getitem to extract individual tensors if there are multiple folded. - fx_const_folded_attrs_name = acc_utils.get_unique_attr_name_in_module( + fx_const_folded_attrs_name = get_unique_attr_name_in_module( split, "_FX_CONST_FOLDED_ATTRS" ) setattr( diff --git a/torch/fx/experimental/fx_acc/__init__.py b/torch/fx/experimental/fx_acc/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/torch/fx/experimental/fx_acc/acc_normalizer.py b/torch/fx/experimental/fx_acc/acc_normalizer.py deleted file mode 100644 index 5e2d21e86fc..00000000000 --- a/torch/fx/experimental/fx_acc/acc_normalizer.py +++ /dev/null @@ -1,444 +0,0 @@ -import inspect -import re -from typing import NamedTuple, Optional, Callable, Dict, List, Tuple, Union, Any, Set - -import torch -import torch.fx -import torch.fx.experimental.fx_acc.acc_utils as acc_utils -from torch.fx.node import _get_qualified_name - -# Need to keep up-to-date with https://fburl.com/codesearch/7r2hhh53 -ALIAS_MAP = { - "input": ("input", "x", "a", "x1"), - "dim": ("dim", "axis"), - "keepdim": ("keepdim", "keepdims"), - "other": ("other", "x2"), -} - -# Type used for arg replacement tuples. The list represents the argument signature of -# some callable. Each item in the list is a tuple, where for each member of a tuple: -# - The first member is union of either: -# - A tuple of all potential alias kwarg str names of the source signature, or -# - A tuple of a single str representing the single kwarg name allowed. -# - The second member is the str name of the kwarg to map it to. This is either from the -# signature of the acc_op, or for custom mapped nodes from the original unnormalized op. -# - The third member is a bool representing whether this arg is optional, i.e. whether it -# is allowed to not be present in the original input args. 
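For illustration, a minimal sketch of what one of these argument-replacement lists could look like for a hypothetical reduction op whose original signature is (input, dim=None, keepdim=False); the names below are examples only and are not registered anywhere in this patch:

    # Hypothetical example of the tuple format described above; illustrative only.
    example_arg_replacement_tuples = [
        # any alias of the first argument maps to the acc_op kwarg "input"; required
        (("input", "x", "a", "x1"), "input", False),
        # "dim" or its alias "axis" maps to "dim"; optional
        (("dim", "axis"), "dim", True),
        # "keepdim"/"keepdims" maps to "keepdim"; optional
        (("keepdim", "keepdims"), "keepdim", True),
    ]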
-ArgReplacementTuplesType = List[Tuple[Tuple[str, ...], str, bool]] - - -class NormalizationInfo(NamedTuple): - """ - Holds normalization info for some FX node, where the FX node will be mapped either - via new_fn_target and arg_replacement_tuples, or via custom_mapping_fn. - - If via new_fn_target and arg_replacement_tuples: - - new_fn_target is the target function to replace the original node with - (generally some function from acc_ops). - - - arg_replacement_tuples describes how to map the original FX node's args/kwargs to - the new FX node. If set to None, then the kwargs are copied directly from the - original FX node. Else, this is list of three-member tuples, where each tuple - represents a mapping from either an arg or kwarg in the original FX node to the - kwarg it should be mapped to. If for ops registered with `register_acc_op` then - this is a mapping to the the new FX node for the acc_op. Otherwise it is for some - op registered with `register_custom_acc_mapper_fn`, in which case this is a - mapping for the original input node so its args are normalized to kwargs before - being custom normalized to acc_ops. The third member of the tuple is a bool - representing whether this argument is optional; if False and the arg is not - present then an assertion will be thrown. The index of the tuple indicates where - the original arg is in node.args and the string name indicates which original - kwarg it is. - - If via custom_mapping_fn, then custom_mapping_fn is some function that takes the - original FX node as input and returns the FX node that should replace it. This means - it was registered via `register_custom_acc_mapper_fn`. - """ - - new_fn_target: Callable - arg_replacement_tuples: Optional[ArgReplacementTuplesType] - custom_mapping_fn: Optional[Callable] - # either (tensor_meta_field_name, original_field_name, move_to_qparams) or - # (tensor_meta_field_name, orginal_field_name) - # when move_to_qparams is True, we'll move the field to qparams - # dictionary, otherwise it will stay in TensorMeta itself - kwargs_to_move_to_acc_out_ty: Optional[ - List[Union[Tuple[str, str, bool], Tuple[str, str]]] - ] - needs_shapes_for_normalization: bool - - -# Dict from (op, target) to NormalizationInfo for that op. -_normalization_dict: Dict[Tuple[str, Union[str, Callable]], NormalizationInfo] = {} - -# Set of all the acc ops. -_acc_ops: Set[Callable] = set() - - -def _insert_fun( - op_and_target: Tuple[str, Union[str, Callable]], - arg_replacement_tuples: List[Tuple], - new_fn_target: Optional[Callable] = None, - custom_mapping_fn: Optional[Callable] = None, - kwargs_to_move_to_acc_out_ty: Optional[ - List[Union[Tuple[str, str, bool], Tuple[str, str]]] - ] = None, - needs_shapes_for_normalization=False, - allow_normalize_from_torch_package=False, -): - if op_and_target[0] == "call_function": - assert callable(op_and_target[1]) - elif op_and_target[0] == "call_method": - assert isinstance(op_and_target[1], str) - elif op_and_target[0] == "call_module": - assert isinstance(op_and_target[1], type) - - # Finalize arg replacement tuples. - # 1. Check to see if they have the `is_optional` bool, and if not defaulting it to - # False. - # 2. Some kwargs might have aliases. e.g. "a", "x" and "x1" are aliases of "input". - # Here we replace `orig_kwarg` with a tuple of all aliases if it has aliases. 
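To make the alias expansion described above concrete, a small sketch assuming the ALIAS_MAP defined earlier in this file; the helper name is made up for illustration:

    # Sketch only: expand a kwarg name (or tuple of names) into every alias ALIAS_MAP knows.
    def _expand_aliases_example(orig_kwarg):
        if not isinstance(orig_kwarg, tuple):
            orig_kwarg = (orig_kwarg,)
        expanded = set(orig_kwarg)
        for k in orig_kwarg:
            expanded.update(ALIAS_MAP.get(k, ()))
        return tuple(expanded)

    # _expand_aliases_example("other") yields ("other", "x2") in some order, while a
    # name without aliases comes back unchanged as a one-element tuple.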
- final_arg_replacement_tuples = [] - for arg_replacement_tuple in arg_replacement_tuples: - if len(arg_replacement_tuple) == 2: - orig_kwarg, new_kwarg, is_optional = *arg_replacement_tuple, False - else: - assert len(arg_replacement_tuple) == 3 - orig_kwarg, new_kwarg, is_optional = arg_replacement_tuple - - if not isinstance(orig_kwarg, tuple): - orig_kwarg = (orig_kwarg,) - - # Use set to avoid duplicates. - orig_kwarg_set = set(orig_kwarg) - - for k in orig_kwarg: - if k in ALIAS_MAP: - orig_kwarg_set.update(ALIAS_MAP[k]) - final_arg_replacement_tuples.append( - (tuple(orig_kwarg_set), new_kwarg, is_optional) - ) - - assert op_and_target not in _normalization_dict.keys() - norm_info = NormalizationInfo( - new_fn_target=new_fn_target, # type: ignore[arg-type] - arg_replacement_tuples=final_arg_replacement_tuples, - custom_mapping_fn=custom_mapping_fn, - kwargs_to_move_to_acc_out_ty=kwargs_to_move_to_acc_out_ty, - needs_shapes_for_normalization=needs_shapes_for_normalization, - ) - _normalization_dict[op_and_target] = norm_info - - # If allow_normalize_from_torch_package then add another entry to - # _normalization_dict where we look for the qualified name of the target with the - # torch_package module prefix. Note that we leave off any integer at the end of - # "" in order to allow for whatever mangling index is used. - if allow_normalize_from_torch_package: - torch_package_op_and_target = ( - op_and_target[0], # type: ignore[] - f".{_get_qualified_name(op_and_target[1])}", # type: ignore[arg-type] - ) - _normalization_dict[torch_package_op_and_target] = norm_info - - -def _get_dup_signature_tuples(fn: Callable) -> List[Tuple[str, str]]: - """ - Helper that inspects the arg signature of `fn` and returns a list of tuples, where - each tuple is a pair of duplicated names which is used for arg_replacement_tuples. - """ - sig_tuples: List[Tuple[str, str]] = [] - for param in inspect.signature(inspect.unwrap(fn)).parameters: - sig_tuples.append((param, param)) - return sig_tuples - - -def register_acc_op(acc_op: Callable): - """ - For a new acc op, add this as decorator to register it. - """ - _acc_ops.add(acc_op) - return acc_op - - -def register_acc_op_mapping( - op_and_target: Tuple[str, Union[str, Callable]], - arg_replacement_tuples: Optional[ - List[ - Union[ - Tuple[Union[str, Tuple[str, ...]], str], - Tuple[Union[str, Tuple[str, ...]], str, bool], - ] - ] - ] = None, - kwargs_to_move_to_acc_out_ty: Optional[ - List[Union[Tuple[str, str, bool], Tuple[str, str]]] - ] = None, -): - """ - Use this decorator to map a non-acc operator to an acc operator. - - Args: - op_and_target: A tuple that contains op and target of the node that represents the non-acc operator. - arg_replacement_tuples: Please refer to the comment on above for `ArgReplacementTuplesType`. - kwargs_to_move_to_acc_out_ty: The kwargs we want to move out from the non-acc op kwargs to acc_out_ty. - """ - - def insert(new_fn_target: Callable): - # If arg_replacement_tuples is None then assume we use the same signature for - # the acc_op and the original op. 
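A usage sketch of the decorator pair being defined here, mirroring the pattern the acc_ops.py hunk below follows; the op name example_relu is hypothetical:

    # Sketch: register a plain elementwise op. Because no arg_replacement_tuples are
    # given, the mapping reuses the acc_op's own keyword-only signature.
    @register_acc_op_mapping(op_and_target=("call_function", torch.relu))
    @register_acc_op
    def example_relu(*, input):
        return torch.relu(input)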
- if arg_replacement_tuples is None: - final_arg_replacement_tuples = _get_dup_signature_tuples(new_fn_target) - else: - final_arg_replacement_tuples = arg_replacement_tuples # type: ignore[assignment] - - _insert_fun( - op_and_target=op_and_target, - new_fn_target=new_fn_target, - arg_replacement_tuples=final_arg_replacement_tuples, # type: ignore[arg-type] - kwargs_to_move_to_acc_out_ty=kwargs_to_move_to_acc_out_ty, - ) - return new_fn_target - - return insert - - -def register_custom_acc_mapper_fn( - op_and_target: Tuple[str, Union[str, Callable]], - arg_replacement_tuples: List[ - Union[ - Tuple[Union[str, Tuple[str, ...]], str], - Tuple[Union[str, Tuple[str, ...]], str, bool], - ] - ], - needs_shapes_for_normalization=False, - allow_normalize_from_torch_package=False, -): - def insert(custom_mapping_fn: Callable): - _insert_fun( - op_and_target=op_and_target, - custom_mapping_fn=custom_mapping_fn, - arg_replacement_tuples=arg_replacement_tuples, # type: ignore[arg-type] - needs_shapes_for_normalization=needs_shapes_for_normalization, - allow_normalize_from_torch_package=allow_normalize_from_torch_package, - ) - return custom_mapping_fn - - return insert - - -def move_kwargs_to_acc_out_ty( - node_or_normalization_info: Union[NormalizationInfo, torch.fx.Node], - new_kwargs: Dict[str, Any], -): - """ - Given `node_or_normalization_info` which is either NormalizationInfo for a node, or - a node to fetch NormalizationInfo for, check if kwargs_to_move_to_acc_out_ty exists - in the NormalizationInfo, and if so perform the move of kwargs to acc_out_ty. - """ - - if isinstance(node_or_normalization_info, torch.fx.Node): - node = node_or_normalization_info - normalization_info = _normalization_dict.get((node.op, node.target)) - else: - assert isinstance(node_or_normalization_info, NormalizationInfo) - normalization_info = node_or_normalization_info - - assert normalization_info is not None - if normalization_info.kwargs_to_move_to_acc_out_ty is None: - return - - assert acc_utils.is_acc_op_with_kwarg( - normalization_info.new_fn_target, "acc_out_ty" - ) - - # Build a dict representing the new TensorMetadata to use for acc_out_ty, - # and then remove the kwarg from the new_kwargs since it's passed in via - # acc_out_ty instead. - tmd_dict: Dict[str, Any] = {} - qparams: Dict[str, Any] = {} - - for kwarg_replacement_tuple in normalization_info.kwargs_to_move_to_acc_out_ty: - if len(kwarg_replacement_tuple) == 2: - orig_kwarg_name, tmd_field_name, move_to_qparams = *kwarg_replacement_tuple, False # type: ignore[misc] - else: - assert len(kwarg_replacement_tuple) == 3 - orig_kwarg_name, tmd_field_name, move_to_qparams = kwarg_replacement_tuple # type: ignore[misc] - if move_to_qparams: - qparams[tmd_field_name] = new_kwargs[orig_kwarg_name] - else: - tmd_dict[tmd_field_name] = new_kwargs[orig_kwarg_name] - del new_kwargs[orig_kwarg_name] - - tmd_dict["qparams"] = qparams - # Note: allow_partial_spec here because we are only using the tensor metadata tuple - # here to pass specific values into the function. For example, for quantization we - # only need to provide qparams dictionary, but is_quantized is - # not passed in. 
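A worked illustration of the kwarg move this function performs, using made-up values patterned after the quantized op registrations later in this patch:

    # Purely illustrative values; none of these appear verbatim in the patch.
    kwargs_before = {"input": "input_node", "scale": 1.0 / 128, "zero_point": 5}
    kwargs_to_move_to_acc_out_ty = [
        ("scale", "scale", True),        # move_to_qparams=True, so it lands in qparams
        ("zero_point", "zero_point", True),
    ]
    # After the move the quantization params no longer appear as top-level kwargs; they
    # travel inside acc_out_ty instead, roughly:
    #   {"input": "input_node",
    #    "acc_out_ty": build_raw_tensor_meta(qparams={"scale": 1.0 / 128, "zero_point": 5})}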
- new_kwargs["acc_out_ty"] = acc_utils.build_raw_tensor_meta(**tmd_dict) - - -def get_normalized_kwargs( - node: torch.fx.Node, arg_replacement_tuples: ArgReplacementTuplesType -): - new_kwargs = {} - final_arg_is_varg = False - for i, replacement_tuple in enumerate(arg_replacement_tuples): - orig_kwargs_names, new_kwarg_name, is_optional = replacement_tuple - - # Check if this is a varg and if so break/process the rest outside the loop. - if len(orig_kwargs_names) == 1 and orig_kwargs_names[0] == "*": - assert i == len(arg_replacement_tuples) - 1 - final_arg_is_varg = True - break - - # If nothing is found in node.kwargs it means the kwarg is in node.arg - # or it's optional. In this case, we set orig_kwargs_name to None. - assert isinstance(orig_kwargs_names, tuple) - orig_kwargs_name = next( - (key for key in orig_kwargs_names if key in node.kwargs), - None, - ) - - # If can't find in node.kwargs then it should be in the i index - # of node.args. - if orig_kwargs_name is None: - if i < len(node.args): - new_kwargs[new_kwarg_name] = node.args[i] - else: - # Verify the arg we're trying to normalize was optional. - assert is_optional, f"Cannot normalize {orig_kwargs_names} to {new_kwarg_name} for {node.name}" - else: - new_kwargs[new_kwarg_name] = node.kwargs[orig_kwargs_name] - - # If using var args then process the rest of the args now. - if final_arg_is_varg: - var_arg_idx = len(arg_replacement_tuples) - 1 - new_kwarg_name = arg_replacement_tuples[var_arg_idx][1] - rest_of_args = [] - for i in range(var_arg_idx, len(node.args)): - rest_of_args.append(node.args[i]) - new_kwargs[new_kwarg_name] = rest_of_args - - return new_kwargs - - -def normalize(mod: torch.fx.GraphModule, expect_nodes_have_shapes: bool = False): - assert len(_normalization_dict) > 0 - graph = mod.graph - - # For "call_module" node we return _base_class_origin if it's a - # RewrittenModule, otherwise, return its type. For other nodes, - # we return node.target. - def get_target(mod: torch.fx.GraphModule, node: torch.fx.Node): - if node.op != "call_module": - return node.target - - # Find the module that node.target points to - m = dict(mod.named_modules())[node.target] - return getattr(m, "_base_class_origin", type(m)) - - def normalize_to_acc_op( - node: torch.fx.Node, - normalization_info: NormalizationInfo, - normalized_args: Tuple[Any, ...], - normalized_kwargs: Dict[str, Any], - ): - # If there's a custom mapping function then use it. - if normalization_info.custom_mapping_fn is not None: - # For custom mapping, the normalized_kwargs are used for the original op, - # i.e. *before* custom acc_ops normalization. Do that now. - node.args = normalized_args - node.kwargs = normalized_kwargs - new_node = normalization_info.custom_mapping_fn(node, mod) - # If a new node is returned then use it to replace the old node. Otherwise - # the custom mapping function did its own replacement, so return early. - if new_node is None: - return - else: - # If there's kwargs_to_move_to_acc_out_ty then use it to setup acc_out_ty in - # normalized_kwargs, and remove the kwarg from normalized_kwargs. - move_kwargs_to_acc_out_ty(normalization_info, normalized_kwargs) - - # All acc ops are functions. Create a call to the correct acc_ops target using - # the normalized kwargs provided. 
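To make "normalized kwargs" concrete, a minimal sketch of what get_normalized_kwargs above would produce for a traced torch.flatten call, assuming the flatten replacement tuples registered in the acc_ops.py hunk below; the module name is hypothetical:

    import torch
    import torch.fx

    class _FlattenExample(torch.nn.Module):
        def forward(self, x):
            return torch.flatten(x, 1)

    gm = torch.fx.symbolic_trace(_FlattenExample())
    flatten_node = next(n for n in gm.graph.nodes if n.target is torch.flatten)
    # flatten_node.args is (x_placeholder, 1) and flatten_node.kwargs is {}; normalized
    # against flatten's replacement tuples this becomes {"input": x_placeholder,
    # "start_dim": 1}, while the optional "end_dim" is simply omitted.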
- with graph.inserting_before(node): - new_node = graph.create_node( - "call_function", - normalization_info.new_fn_target, - args=normalized_args, - kwargs=normalized_kwargs, - name=node.name, - ) - new_node.meta = node.meta.copy() - - # Finally replace the original node with the normalized node. - node.replace_all_uses_with(new_node) - graph.erase_node(node) - - # Don't wrap the acc_op node just because the original node was wrapped. - if "is_wrapped" in new_node.meta: - del new_node.meta["is_wrapped"] - - for node in graph.nodes: - if node.op in {"placeholder", "get_attr", "output"}: - continue - - normalization_info = _normalization_dict.get((node.op, get_target(mod, node))) - - # Also check if the torch_packaged version of the op was specified to be normalized. - if normalization_info is None and node.op == "call_function": - # Strip off the mangle_index suffix here before checking the map. - target = re.sub( - r"\A", - "", - _get_qualified_name(node.target), - ) - torch_package_op_and_target = (node.op, target) - normalization_info = _normalization_dict.get(torch_package_op_and_target) - - if normalization_info is None: - continue - - # Get the normalized kwargs to be used by normalize_to_acc_op below. If - # normalization_info.arg_replacement_tuples is empty then assume the function - # signature must be left as is. - assert normalization_info.arg_replacement_tuples is not None - if len(normalization_info.arg_replacement_tuples) == 0: - normalized_args = node.args - normalized_kwargs = node.kwargs - else: - normalized_args = () - try: - normalized_kwargs = get_normalized_kwargs( - node, normalization_info.arg_replacement_tuples - ) - except Exception: - print( - f"Error during kwarg normalization for: {node.format_node()}; " - f"arg_replacement_tuples={normalization_info.arg_replacement_tuples}" - ) - raise - - if ( - normalization_info.needs_shapes_for_normalization - and not expect_nodes_have_shapes - ): - # All nodes needing shapes for normalization should be custom mapped. - assert normalization_info.custom_mapping_fn is not None - # For custom mapping, the normalized_kwargs are used for the original op, - # i.e. *before* custom acc_ops normalization. Do that now so that whoever - # consumes the graph next (e.g. shape inference) can use kwargs safely. - node.args = normalized_args - node.kwargs = normalized_kwargs - continue - - try: - normalize_to_acc_op( - node, normalization_info, normalized_args, normalized_kwargs - ) - except Exception: - print(f"Error during normalization for node: {node.format_node()}") - raise - - # If there are any dead nodes left after normalization, eliminate them now. - mod.graph.eliminate_dead_code() diff --git a/torch/fx/experimental/fx_acc/acc_op_properties.py b/torch/fx/experimental/fx_acc/acc_op_properties.py deleted file mode 100644 index a2bc076ec78..00000000000 --- a/torch/fx/experimental/fx_acc/acc_op_properties.py +++ /dev/null @@ -1,45 +0,0 @@ -from collections import defaultdict -from enum import Flag, auto -from typing import Callable, DefaultDict, Set - -import torch -import torch.fx - -class AccOpProperty(Flag): - """ - A collection of static properties for acc_ops. - - * pointwise - op commutes with data restructuring ops such as reshape, - transpose, permute. e.g. op(reshape(x)) == reshape(op(x)). - Alternatively, for tensor x = (x1, x2, ...), there exists a scalar - function f such that op(x) = (f(x1), f(x2), ...). 
- * quantized - op expects quantized inputs and return quantized outputs - * unary - op has exactly one graph dependent input. e.g. relu, - dequantize, sum - """ - pointwise = auto() - quantized = auto() - unary = auto() - -acc_op_properties: DefaultDict[Callable, Set[AccOpProperty]] = defaultdict(set) -acc_ops_with_property: DefaultDict[AccOpProperty, Set[Callable]] = defaultdict(set) - - -def register_acc_op_properties(*properties: AccOpProperty): - """ - Attach properties to acc_op to inform optimization - """ - def decorator(acc_op: Callable): - acc_op_properties[acc_op] |= set(properties) - for prop in properties: - acc_ops_with_property[prop].add(acc_op) - return acc_op - return decorator - - -def add_optimization_properties_to_meta(mod: torch.fx.GraphModule) -> None: - """ - Add acc_op properties to Node.meta to inform optimization - """ - for node in mod.graph.nodes: - node.meta['acc_op_properties'] = acc_op_properties[node.target] diff --git a/torch/fx/experimental/fx_acc/acc_ops.py b/torch/fx/experimental/fx_acc/acc_ops.py deleted file mode 100644 index 636e3517045..00000000000 --- a/torch/fx/experimental/fx_acc/acc_ops.py +++ /dev/null @@ -1,1924 +0,0 @@ -# encoding: utf-8 -import operator -import warnings - -import torch # isort:skip -from typing import Sequence, List, cast - -import torch.fx.experimental.fx_acc.acc_utils as acc_utils -import torch.nn as nn -from torch.fx.experimental.fx_acc.acc_normalizer import ( - register_acc_op, - register_acc_op_mapping, - register_custom_acc_mapper_fn, -) -from torch.fx.experimental.fx_acc.acc_op_properties import ( - AccOpProperty, - register_acc_op_properties, -) -from torch.fx.passes.shape_prop import _extract_tensor_metadata, TensorMetadata - -this_arg_is_optional = True -move_to_qparams = True -dont_move_to_qparams = False - - -@register_acc_op_mapping(op_and_target=("call_function", nn.functional.linear)) -@register_acc_op -def linear(*, input, weight, bias): - return nn.functional.linear(input=input, weight=weight, bias=bias) - - -@register_acc_op_properties(AccOpProperty.quantized) -@register_acc_op -def quantized_linear(*, input, weight, bias, acc_out_ty=None): - assert acc_out_ty is not None - qparams = TensorMetadata(*acc_out_ty).qparams - return nn.quantized.functional.linear( - input, - weight, - bias, - qparams["scale"], - qparams["zero_point"], - ) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping( - op_and_target=("call_method", "flatten"), - arg_replacement_tuples=[ - ("input", "input"), - ("start_dim", "start_dim", this_arg_is_optional), - ("end_dim", "end_dim", this_arg_is_optional), - ], -) -@register_acc_op_mapping(op_and_target=("call_function", torch.flatten)) -@register_acc_op -def flatten(*, input, start_dim=0, end_dim=-1): - return torch.flatten(input=input, start_dim=start_dim, end_dim=end_dim) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping( - op_and_target=("call_method", "squeeze"), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim", this_arg_is_optional), - ], -) -@register_acc_op_mapping( - op_and_target=("call_function", torch.squeeze), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim", this_arg_is_optional), - ], -) -@register_acc_op -def squeeze(*, input, dim=None): - if dim is None: - return input.squeeze() - return input.squeeze(dim=dim) - - -@register_acc_op_mapping(op_and_target=("call_function", nn.functional.max_pool2d)) -@register_acc_op -def max_pool2d( - *, input, kernel_size, stride, padding, dilation, 
ceil_mode, return_indices -): - return nn.functional.max_pool2d( - input=input, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - ceil_mode=ceil_mode, - return_indices=return_indices, - ) - - -@register_acc_op_mapping( - op_and_target=("call_function", nn.functional.adaptive_avg_pool2d) -) -@register_acc_op -def adaptive_avg_pool2d(*, input, output_size): - return nn.functional.adaptive_avg_pool2d(input=input, output_size=output_size) - - -@register_acc_op_mapping(op_and_target=("call_function", nn.functional.avg_pool2d)) -@register_acc_op -def avg_pool2d( - *, - input, - kernel_size, - stride, - padding, - ceil_mode, - count_include_pad, - divisor_override, -): - return nn.functional.avg_pool2d( - input=input, - kernel_size=kernel_size, - stride=stride, - padding=padding, - ceil_mode=ceil_mode, - count_include_pad=count_include_pad, - divisor_override=divisor_override, - ) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.sign)) -@register_acc_op -def sign(*, input): - return torch.sign(input) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def size(*, input): - return input.size() - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", getattr), - arg_replacement_tuples=[], -) -def custom_getattr_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: - """ - Custom function for mapping a call_function getattr to other ops. Currently only - supports loading a getattr called on a torch.Tensor with attr name "shape", which is - supported by mapping it to acc_ops.size(). - """ - # Have to use args here since getattr forces positional args. - input_obj = node.args[0] - attr_name = node.args[1] - assert isinstance(input_obj, torch.fx.Node) - assert ( - input_obj.meta["type"] == torch.Tensor - ), f"Expected torch.Tensor type for {input_obj.meta['type']}" - assert ( - attr_name == "shape" - ), f"Only supporting shape getattr for now, not {attr_name}" - with node.graph.inserting_before(node): - size_node = node.graph.call_function(size, kwargs={"input": input_obj}) - size_node.meta = node.meta.copy() - return size_node - - -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "size"), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim", this_arg_is_optional), - ], -) -def tensor_size_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: - """ - Mapping from Tensor.size() to acc_ops.size. We map size() to acc_ops.size directly - and map size(dim) to acc_ops.size + acc_ops.getitem. 
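A quick sanity check of the equivalence this mapping relies on, using an arbitrary concrete tensor:

    import torch

    # x.size(dim) and indexing into x.size() agree, which is why size(dim) can be
    # rewritten as acc_ops.size followed by acc_ops.getitem.
    x = torch.randn(2, 3, 4)
    assert x.size(1) == x.size()[1] == 3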
- """ - - with node.graph.inserting_before(node): - size_node = node.graph.call_function( - size, kwargs={"input": node.kwargs["input"]} - ) - - if "dim" not in node.kwargs: - size_node.meta = node.meta.copy() - return size_node - - size_node.meta["type"] = torch.Size - getitem_node = node.graph.call_function( - getitem, kwargs={"input": size_node, "idx": node.kwargs["dim"]} - ) - getitem_node.meta = node.meta.copy() - return getitem_node - - -@register_acc_op_properties(AccOpProperty.pointwise) -@register_acc_op_mapping(op_and_target=("call_function", operator.add)) -@register_acc_op_mapping(op_and_target=("call_method", "add")) -@register_acc_op -def add(*, input, other): - return input + other - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_method", "unsqueeze")) -@register_acc_op_mapping(op_and_target=("call_function", torch.unsqueeze)) -@register_acc_op -def unsqueeze(*, input, dim): - return torch.unsqueeze(input=input, dim=dim) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_method", "tile")) -@register_acc_op_mapping(op_and_target=("call_function", torch.tile)) -@register_acc_op -def tile(*, input, dims): - return torch.tile(input=input, dims=dims) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.stack), - arg_replacement_tuples=[ - ("tensors", "tensors"), - ("dim", "dim"), - ], -) -def stack_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: - """ - Map torch.stack to unsqueeze + cat. - """ - with node.graph.inserting_before(node): - inputs = node.kwargs["tensors"] - unsqueeze_nodes = [] - assert isinstance(inputs, Sequence) - for i, t in enumerate(inputs): - new_node = node.graph.create_node( - "call_function", - unsqueeze, - kwargs={"input": t, "dim": node.kwargs["dim"]}, - name=f"{node.name}_unsqueeze_{i}", - ) - new_node.meta["type"] = torch.Tensor - unsqueeze_nodes.append(new_node) - cat_node = node.graph.create_node( - "call_function", - cat, - kwargs={"tensors": unsqueeze_nodes, "dim": node.kwargs["dim"]}, - ) - cat_node.meta = node.meta.copy() - return cat_node - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.clamp)) -@register_acc_op_mapping(op_and_target=("call_method", "clamp")) -@register_acc_op -def clamp(*, input, min=None, max=None): - return torch.clamp(input=input, min=min, max=max) - - -@register_acc_op_mapping(op_and_target=("call_function", torch.cat)) -@register_acc_op -def cat(*, tensors, dim): - return torch.cat(tensors=tensors, dim=dim) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.transpose), - arg_replacement_tuples=[ - ("input", "input"), - ("dim0", "dim0"), - ("dim1", "dim1"), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "transpose"), - arg_replacement_tuples=[ - ("input", "input"), - ("dim0", "dim0"), - ("dim1", "dim1"), - ], -) -def transpose_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: - # Get the dim-permutation/shuffle - shape_as_list = node.meta["tensor_meta"].shape - ranks = len(shape_as_list) - shuffle = list(i for i in range(ranks)) - dim0 = cast(int, node.kwargs["dim0"]) - dim1 = cast(int, node.kwargs["dim1"]) - shuffle[dim0] = dim1 - shuffle[dim1] = dim0 - - # Create the new acc_ops.permute node. Update all uses of the transpose - # node and then delete the transpose node. 
- with node.graph.inserting_after(node): - permute_node = node.graph.call_function( - the_function=permute, - kwargs={ - "input": node.kwargs.get("input"), - "permutation": shuffle, - }, - ) - permute_node.meta = node.meta.copy() - node.replace_all_uses_with(permute_node) - - permute_node.graph.erase_node(node) - return permute_node - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_method", "contiguous")) -@register_acc_op -def contiguous(*, input): - return input.contiguous() - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.softmax)) -@register_acc_op -def softmax(*, input, dim, dtype): - """ - _stacklevel are ignored here. - """ - return torch.nn.functional.softmax(input=input, dim=dim, dtype=dtype) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.addmm), - arg_replacement_tuples=[ - ("input", "input"), - ("mat1", "mat1"), - ("mat2", "mat2"), - ("beta", "beta"), - ("alpha", "alpha"), - ], -) -def addmm_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: - """ - Mapping from torch.addmm to acc_ops.mm -> acc_ops.add, if alpha or beta is not 1 - then we also insert acc_ops.mul to the right place. - """ - with node.graph.inserting_before(node): - mm_kwargs = {"input": node.kwargs["mat1"], "other": node.kwargs["mat2"]} - mm_node = node.graph.create_node( - "call_function", matmul, kwargs=mm_kwargs, name=f"{node.name}_mm" - ) - mm_node.meta = node.meta.copy() - - if node.kwargs["alpha"] != 1: - mul_kwargs = {"input": mm_node, "other": node.kwargs["alpha"]} - mm_node = node.graph.create_node( - "call_function", mul, kwargs=mul_kwargs, name=f"{mm_node.name}_mul" - ) - mm_node.meta = node.meta.copy() - - input_node = node.kwargs["input"] - if node.kwargs["beta"] != 1: - mul_kwargs = {"input": input_node, "other": node.kwargs["beta"]} - new_input_node = node.graph.create_node( - "call_function", mul, kwargs=mul_kwargs, name=f"{node.name}_input_mul" - ) - assert isinstance(input_node, torch.fx.Node) - new_input_node.meta = input_node.meta.copy() - input_node = new_input_node - - add_kwargs = {"input": mm_node, "other": input_node} - add_node = node.graph.create_node( - "call_function", add, kwargs=add_kwargs, name=f"{node.name}_add" - ) - add_node.meta = node.meta.copy() - return add_node - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.t), - arg_replacement_tuples=[ - ("input", "input"), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "t"), - arg_replacement_tuples=[ - ("input", "input"), - ], -) -def t_mapper(node: torch.fx.Node, _: nn.Module): - ranks = len(node.meta["tensor_meta"].shape) - shuffle = [1, 0] if (ranks > 1) else [0] - - with node.graph.inserting_before(node): - new_node = node.graph.create_node( - "call_function", - permute, - kwargs={"input": node.kwargs["input"], "permutation": shuffle}, - ) - new_node.meta = node.meta.copy() - return new_node - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping( - op_and_target=("call_method", "permute"), - arg_replacement_tuples=[ - ("input", "input"), - ("*", "permutation"), - ], -) -@register_acc_op_mapping( - op_and_target=("call_function", torch.permute), - arg_replacement_tuples=[ - ("input", "input"), - ("dims", "permutation"), - ], -) -@register_acc_op -def permute(*, input, permutation): - return input.permute(*permutation) - - -@register_custom_acc_mapper_fn( - 
op_and_target=("call_function", torch.square), - arg_replacement_tuples=[ - ("input", "input"), - ], -) -def square_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: - input_node = node.kwargs["input"] - with node.graph.inserting_before(node): - new_node = node.graph.call_function( - mul, kwargs={"input": input_node, "other": input_node} - ) - new_node.meta = node.meta.copy() - return new_node - - -@register_acc_op_mapping( - op_and_target=("call_function", torch.bmm), - arg_replacement_tuples=[ - ("input", "input"), - ("mat2", "other"), - ], -) -@register_acc_op_mapping(op_and_target=("call_function", torch.matmul)) -@register_acc_op -def matmul(*, input, other): - return torch.matmul(input=input, other=other) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", nn.functional.dropout), - arg_replacement_tuples=[("input", "input")], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "detach"), arg_replacement_tuples=[("input", "input")] -) -def dropout_mapper(node: torch.fx.Node, mod: nn.Module): - """ - Remove dropout node and directly map its input to output. - """ - return node.kwargs["input"] - - -try: - from torchvision.ops import stochastic_depth -except Exception as e: - warnings.warn(f"Unable to import torchvision related libraries.: {e}") -else: - - @register_custom_acc_mapper_fn( - op_and_target=("call_function", stochastic_depth), - arg_replacement_tuples=[("input", "input")], - ) - def stochastic_depth_mapper(node: torch.fx.Node, mod: nn.Module): - """ - Remove dropout node and directly map its input to output. - """ - return node.kwargs["input"] - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping( - op_and_target=("call_function", nn.functional.hardtanh), -) -@register_acc_op -def hardtanh(*, input, min_val=-1.0, max_val=1.0): - return nn.functional.hardtanh(input=input, min_val=min_val, max_val=max_val) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", nn.functional.hardsigmoid)) -@register_acc_op -def hardsigmoid(*, input): - return nn.functional.hardsigmoid(input) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", nn.functional.silu), - arg_replacement_tuples=[ - ("input", "input"), - ], -) -def silu(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: - input_node = node.kwargs["input"] - with node.graph.inserting_before(node): - sigmoid_node = node.graph.call_function(sigmoid, kwargs={"input": input_node}) - sigmoid_node.meta = node.meta.copy() - new_node = node.graph.call_function( - mul, kwargs={"input": sigmoid_node, "other": input_node} - ) - new_node.meta = node.meta.copy() - return new_node - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", nn.functional.hardswish), - arg_replacement_tuples=[ - ("input", "input"), - ], -) -def hardswish_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: - input_node = node.kwargs["input"] - with node.graph.inserting_before(node): - new_sigmoid_node = node.graph.call_function( - hardsigmoid, kwargs={"input": input_node} - ) - new_sigmoid_node.meta = node.meta.copy() - new_node = node.graph.call_function( - mul, kwargs={"input": new_sigmoid_node, "other": input_node} - ) - new_node.meta = node.meta.copy() - return new_node - - -@register_acc_op_properties(AccOpProperty.quantized) -@register_acc_op_mapping( - op_and_target=("call_function", torch.ops.quantized.add), - arg_replacement_tuples=[ - ("qa", 
"input"), - ("qb", "other"), - ("scale", "scale"), - ("zero_point", "zero_point"), - ], - kwargs_to_move_to_acc_out_ty=[ - ("scale", "scale", move_to_qparams), - ("zero_point", "zero_point", move_to_qparams), - ], -) -@register_acc_op -def quantized_add(*, input, other, acc_out_ty=None): - assert acc_out_ty is not None - qparams = TensorMetadata(*acc_out_ty).qparams - return torch.ops.quantized.add( - input, - other, - qparams["scale"], - qparams["zero_point"], - ) - - -@register_acc_op_properties(AccOpProperty.quantized) -@register_acc_op_mapping( - op_and_target=("call_function", torch.ops.quantized.mul), - arg_replacement_tuples=[ - ("qa", "input"), - ("qb", "other"), - ("scale", "scale"), - ("zero_point", "zero_point"), - ], - kwargs_to_move_to_acc_out_ty=[ - ("scale", "scale", move_to_qparams), - ("zero_point", "zero_point", move_to_qparams), - ], -) -@register_acc_op -def quantized_mul(*, input, other, acc_out_ty=None): - assert acc_out_ty is not None - qparams = TensorMetadata(*acc_out_ty).qparams - return torch.ops.quantized.mul( - input, - other, - qparams["scale"], - qparams["zero_point"], - ) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_properties(AccOpProperty.quantized) -@register_acc_op_mapping( - op_and_target=("call_function", torch.quantize_per_tensor), - arg_replacement_tuples=[ - ("input", "input"), - ("scale", "scale"), - ("zero_point", "zero_point"), - ("dtype", "dtype"), - ], - kwargs_to_move_to_acc_out_ty=[ - ("scale", "scale", move_to_qparams), - ("zero_point", "zero_point", move_to_qparams), - ("dtype", "dtype", dont_move_to_qparams), - ], -) -@register_acc_op -def quantize_per_tensor(*, input, acc_out_ty=None): - assert acc_out_ty is not None - qparams = TensorMetadata(*acc_out_ty).qparams - dtype = TensorMetadata(*acc_out_ty).dtype - return torch.quantize_per_tensor( - input, qparams["scale"], qparams["zero_point"], dtype - ) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping( - op_and_target=("call_function", torch.quantize_per_channel), - arg_replacement_tuples=[ - ("input", "input"), - ("scales", "scales"), - ("zero_points", "zero_points"), - ("axis", "axis"), - ("dtype", "dtype"), - ], - kwargs_to_move_to_acc_out_ty=[ - ("scales", "scale", move_to_qparams), - ("zero_points", "zero_point", move_to_qparams), - ("axis", "axis", move_to_qparams), - ("dtype", "dtype", dont_move_to_qparams), - ], -) -@register_acc_op -def quantize_per_channel(*, input, acc_out_ty=None): - assert acc_out_ty is not None - qparams = TensorMetadata(*acc_out_ty).qparams - dtype = TensorMetadata(*acc_out_ty).dtype - return torch.quantize_per_channel( - input, - torch.tensor(qparams["scale"]), - torch.tensor(qparams["zero_point"]), - qparams["axis"], - dtype, - ) # type: ignore[call-overload] - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_method", "dequantize")) -@register_acc_op_mapping(op_and_target=("call_function", torch.dequantize)) -@register_acc_op -def dequantize(*, input): - return torch.dequantize(input) - - -@register_acc_op_properties( - AccOpProperty.pointwise, AccOpProperty.unary, AccOpProperty.quantized -) -@register_acc_op -def rescale_quantize_per_tensor(*, input, acc_out_ty=None): - assert acc_out_ty is not None - d = dequantize(input=input) - return quantize_per_tensor(input=d, acc_out_ty=acc_out_ty) - - -@register_acc_op_properties(AccOpProperty.unary, AccOpProperty.quantized) -@register_acc_op -def 
rescale_quantize_per_channel(*, input, acc_out_ty=None): - assert acc_out_ty is not None - d = dequantize(input=input) - return quantize_per_channel(input=d, acc_out_ty=acc_out_ty) - - -@register_acc_op_properties(AccOpProperty.pointwise) -@register_acc_op_mapping(op_and_target=("call_function", operator.sub)) -@register_acc_op -def sub(*, input, other): - return input - other - - -@register_acc_op_properties(AccOpProperty.pointwise) -@register_acc_op_mapping(op_and_target=("call_function", torch.mul)) -@register_acc_op_mapping(op_and_target=("call_function", operator.mul)) -@register_acc_op_mapping(op_and_target=("call_method", "mul")) -@register_acc_op -def mul(*, input, other): - return input * other - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.div), - arg_replacement_tuples=[ - ("input", "input"), - ("other", "other"), - ("rounding_mode", "rounding_mode", this_arg_is_optional), - ], -) -def div_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node: - with node.graph.inserting_before(node): - div_kwargs = dict(node.kwargs) - if "rounding_mode" not in div_kwargs or div_kwargs["rounding_mode"] is None: - div_node = node.graph.call_function( - div, kwargs={"input": div_kwargs["input"], "other": div_kwargs["other"]} - ) - elif div_kwargs["rounding_mode"] == "trunc": - div_node = node.graph.call_function( - trunc_div, - kwargs={"input": div_kwargs["input"], "other": div_kwargs["other"]}, - ) - elif div_kwargs["rounding_mode"] == "floor": - div_node = node.graph.call_function( - floor_div, - kwargs={"input": div_kwargs["input"], "other": div_kwargs["other"]}, - ) - else: - raise RuntimeError( - f"Unhandled div rounding mode {div_kwargs['rounding_mode']}" - ) - div_node.meta = node.meta.copy() - return div_node - - -@register_acc_op_properties(AccOpProperty.pointwise) -@register_acc_op_mapping(op_and_target=("call_function", operator.truediv)) -@register_acc_op -def div(*, input, other): - return input / other - - -@register_acc_op_properties(AccOpProperty.pointwise) -@register_acc_op_mapping(op_and_target=("call_function", operator.floordiv)) -@register_acc_op -def floor_div(*, input, other): - # This is temp fix because currently operator.floor_div for tensors would - # traslate into torch.floor_divide which would throw an error. After it's - # fixed we can stick to `input // other`. - if isinstance(input, torch.Tensor) or isinstance(other, torch.Tensor): - return torch.div(input, other, rounding_mode="floor") - return input // other - - -# torch.floor_divide rounds result toward zero, rather than -Inf. 
-# https://github.com/pytorch/pytorch/issues/43874 -@register_acc_op_mapping(op_and_target=("call_function", torch.floor_divide)) -@register_acc_op_properties(AccOpProperty.pointwise) -@register_acc_op -def trunc_div(*, input, other): - return torch.div(input, other, rounding_mode="trunc") - - -@register_acc_op_properties(AccOpProperty.pointwise) -@register_acc_op_mapping(op_and_target=("call_function", torch.pow)) -@register_acc_op -def pow(*, input, exponent): - return torch.pow(input, exponent) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", nn.functional.relu)) -@register_acc_op_mapping( - op_and_target=("call_function", torch.relu), - arg_replacement_tuples=[("input", "input")], -) -@register_acc_op_mapping( - op_and_target=("call_method", "relu"), - arg_replacement_tuples=[("input", "input")], -) -@register_acc_op -def relu(*, input, inplace=False): - return nn.functional.relu(input=input, inplace=inplace) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping( - op_and_target=("call_function", torch.nn.functional.leaky_relu) -) -@register_acc_op -def leaky_relu(*, input, negative_slope=0.01, inplace=False): - return nn.functional.leaky_relu( - input=input, negative_slope=negative_slope, inplace=inplace - ) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.elu)) -@register_acc_op -def elu(*, input, alpha=1.0, inplace=False): - return nn.functional.elu(input=input, alpha=alpha, inplace=inplace) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.selu)) -@register_acc_op -def selu(*, input, inplace=False): - return nn.functional.selu(input=input, inplace=inplace) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.softsign)) -@register_acc_op -def softsign(*, input): - return nn.functional.softsign(input=input) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.log1p), - arg_replacement_tuples=[ - ("input", "input"), - ], -) -def torch_log1p_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node: - with node.graph.inserting_before(node): - add_kwargs = {"input": node.kwargs["input"], "other": 1.0} - add_node = node.graph.call_function(add, kwargs=add_kwargs) - add_node.meta = node.meta.copy() - log_kwargs = {"input": add_node} - log_node = node.graph.call_function(log, kwargs=log_kwargs) - log_node.meta = node.meta.copy() - return log_node - - -def reduce_op_mapper( - node: torch.fx.Node, mod: torch.fx.GraphModule, func -) -> torch.fx.Node: - with node.graph.inserting_before(node): - kwargs = dict(node.kwargs) - if "dim" in kwargs and isinstance(kwargs["dim"], int): - kwargs["dim"] = (kwargs["dim"],) - new_node = node.graph.call_function(func, kwargs=kwargs) - new_node.meta = node.meta.copy() - return new_node - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def sum(*, input, dim=None, keepdim=False, dtype=None): - if dim is not None: - return torch.sum(input, dim=dim, keepdim=keepdim, dtype=dtype) - else: - return input.sum(dtype=dtype) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "sum"), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim", 
this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ("dtype", "dtype", this_arg_is_optional), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.sum), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim", this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ("dtype", "dtype", this_arg_is_optional), - ], -) -def sum_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node: - return reduce_op_mapper(node, mod, sum) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def prod(*, input, dim=None, keepdim=False, dtype=None): - if dim is not None: - return torch.prod(input, dim=dim, keepdim=keepdim, dtype=dtype) - else: - return input.prod(dtype=dtype) - -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "prod"), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim", this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ("dtype", "dtype", this_arg_is_optional), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.prod), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim", this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ("dtype", "dtype", this_arg_is_optional), - ], -) -def prod_mapper(node: torch.fx.Node, mod: torch.fx.GraphModule) -> torch.fx.Node: - func = prod - with node.graph.inserting_before(node): - kwargs = dict(node.kwargs) - new_node = node.graph.call_function(func, kwargs=kwargs) - new_node.meta = node.meta.copy() - return new_node - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def mean(*, input, dim=None, keepdim=False, dtype=None): - if dim is not None: - return torch.mean(input, dim=dim, keepdim=keepdim, dtype=dtype) - else: - return input.mean(dtype=dtype) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "mean"), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim", this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ("dtype", "dtype", this_arg_is_optional), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.mean), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim", this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ("dtype", "dtype", this_arg_is_optional), - ], -) -def mean_mapper(node, mod): - return reduce_op_mapper(node, mod, mean) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "max"), - arg_replacement_tuples=[ - ("input", "input"), - (("dim", "other"), "dim_or_other", this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.max), - arg_replacement_tuples=[ - ("input", "input"), - (("dim", "other"), "dim_or_other", this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "min"), - arg_replacement_tuples=[ - ("input", "input"), - (("dim", "other"), "dim_or_other", this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.min), - arg_replacement_tuples=[ - ("input", "input"), - (("dim", "other"), "dim_or_other", this_arg_is_optional), - ("keepdim", "keepdim", this_arg_is_optional), - ], -) -def add_maximum_minimum_mapper( - node: torch.fx.Node, mod: torch.fx.GraphModule -) -> torch.fx.Node: - # there are effectively 
three versions of torch.max / torch.min - # full reduce: torch.max(input) -> Tensor - # dimensional reduce: torch.max(input, dim, keepdim=False, *, out=None) -> (Tensor, LongTensor) - # elementwise: torch.max(input, other, *, out=None) -> Tensor - - # the mapper function is remapping for both min and max situations - # this helper function makes the choices available clearer and provides an easier way - # to lookup the right function - def target_map(op, target): - if (op, target) in (("call_method", "max"), ("call_function", torch.max)): - return dict( - full_reduce=max_full_reduce, - dim_reduce=max_dim_reduce, - elementwise=maximum, - ) - elif (op, target) in (("call_method", "min"), ("call_function", torch.min)): - return dict( - full_reduce=min_full_reduce, - dim_reduce=min_dim_reduce, - elementwise=minimum, - ) - - with node.graph.inserting_before(node): - new_targets = target_map(node.op, node.target) - max_kwargs = dict() - max_kwargs["input"] = node.kwargs["input"] - if ("dim_or_other" not in node.kwargs) or (node.kwargs["dim_or_other"] is None): - nt = new_targets["full_reduce"] - max_node = node.graph.call_function(nt, kwargs=max_kwargs) - elif isinstance(node.kwargs["dim_or_other"], int): - nt = new_targets["dim_reduce"] - dim = node.kwargs["dim_or_other"] - max_kwargs["dim"] = dim - max_kwargs["keepdim"] = node.kwargs.get("keepdim", False) - max_node = node.graph.call_function(nt, kwargs=max_kwargs) - else: - other = node.kwargs["dim_or_other"] - assert isinstance(other, torch.fx.Node) - # Lowering path for when provided "other", where we do elem-wise max - nt = new_targets["elementwise"] - max_kwargs["other"] = other - max_node = node.graph.call_function(nt, kwargs=max_kwargs) - max_node.meta = node.meta.copy() - return max_node - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def max_full_reduce(*, input): - return torch.max(input=input) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def max_dim_reduce(*, input, dim=None, keepdim=False): - return torch.max(input=input, dim=dim, keepdim=keepdim) - - -@register_acc_op_properties(AccOpProperty.pointwise) -@register_acc_op_mapping(op_and_target=("call_function", torch.maximum)) -@register_acc_op_mapping(op_and_target=("call_method", "maximum")) -@register_acc_op -def maximum(*, input, other): - return torch.maximum(input=input, other=other) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def min_full_reduce(*, input): - return torch.min(input=input) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def min_dim_reduce(*, input, dim=None, keepdim=False): - return torch.min(input, dim=dim, keepdim=keepdim) - - -@register_acc_op_properties(AccOpProperty.pointwise) -@register_acc_op_mapping(op_and_target=("call_function", torch.minimum)) -@register_acc_op_mapping(op_and_target=("call_method", "minimum")) -@register_acc_op -def minimum(*, input, other): - return torch.minimum(input=input, other=other) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.sigmoid)) -@register_acc_op_mapping(op_and_target=("call_method", "sigmoid")) -@register_acc_op -def sigmoid(*, input): - return torch.sigmoid(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.sinh)) -@register_acc_op -def sinh(*, input): - return torch.sinh(input=input) - - 
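A minimal sketch of the three call forms the min/max mapper above has to tell apart, and the acc op each one is routed to (the tensor values here are illustrative only):

import torch

t = torch.tensor([[1.0, 4.0],
                  [3.0, 2.0]])

# Full reduce -> acc_ops.max_full_reduce: a single scalar, here tensor(4.)
full = torch.max(t)

# Dimensional reduce -> acc_ops.max_dim_reduce: (values, indices) along a dim,
# here values = [4., 3.] and indices = [1, 0]
values, indices = torch.max(t, dim=1)

# Elementwise -> acc_ops.maximum: compares against a second tensor
elementwise = torch.max(t, torch.full_like(t, 2.0))  # [[2., 4.], [3., 2.]]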
-@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.cosh)) -@register_acc_op -def cosh(*, input): - return torch.cosh(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.tanh)) -@register_acc_op_mapping(op_and_target=("call_method", "tanh")) -@register_acc_op -def tanh(*, input): - return torch.tanh(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.asin)) -@register_acc_op -def asin(*, input): - return torch.asin(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.acos)) -@register_acc_op -def acos(*, input): - return torch.acos(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.atan)) -@register_acc_op -def atan(*, input): - return torch.atan(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.exp)) -@register_acc_op -def exp(*, input): - return torch.exp(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.log)) -@register_acc_op -def log(*, input): - return torch.log(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.sqrt)) -@register_acc_op -def sqrt(*, input): - return torch.sqrt(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.reciprocal)) -@register_acc_op -def reciprocal(*, input): - return torch.reciprocal(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.abs)) -@register_acc_op -def abs(*, input): - return torch.abs(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", operator.neg)) -@register_acc_op_mapping(op_and_target=("call_function", torch.neg)) -@register_acc_op -def neg(*, input): - return torch.neg(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.floor)) -@register_acc_op -def floor(*, input): - return torch.floor(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.ceil)) -@register_acc_op -def ceil(*, input): - return torch.ceil(input=input) - - -@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.pad)) -@register_acc_op -def pad(*, input, pad, mode, value): - return torch.nn.functional.pad(input=input, pad=pad, mode=mode, value=value) - - -@register_acc_op_mapping(op_and_target=("call_function", torch.conv2d)) -@register_acc_op -def conv2d(*, input, weight, bias, stride, padding, dilation, groups): - return nn.functional.conv2d( - input=input, - weight=weight, - bias=bias, - stride=stride, - padding=padding, - dilation=dilation, - 
groups=groups, - ) - - -@register_acc_op_properties(AccOpProperty.quantized) -@register_acc_op -def quantized_conv2d( - *, - input, - weight, - bias, - stride, - padding, - dilation, - groups, - padding_mode, - acc_out_ty, -): - qparams = TensorMetadata(*acc_out_ty).qparams - return torch.nn.quantized.functional.conv2d( - input=input, - weight=weight, - bias=bias, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - padding_mode=padding_mode, - scale=qparams["scale"], - zero_point=qparams["zero_point"], - ) - - -@register_acc_op_mapping(op_and_target=("call_function", nn.functional.batch_norm)) -@register_acc_op -def batch_norm( - *, input, running_mean, running_var, weight, bias, training, momentum, eps -): - return nn.functional.batch_norm( - input=input, - running_mean=running_mean, - running_var=running_var, - weight=weight, - bias=bias, - training=training, - momentum=momentum, - eps=eps, - ) - - -@register_acc_op_mapping(op_and_target=("call_function", nn.functional.layer_norm)) -@register_acc_op -def layer_norm(*, input, normalized_shape, weight, bias, eps): - return nn.functional.layer_norm( - input=input, - normalized_shape=normalized_shape, - weight=weight, - bias=bias, - eps=eps, - ) - - -def argmin_max_mapper_impl(node: torch.fx.Node, largest: bool) -> torch.fx.Node: - """ - Map torch.argmin or torch.argmax to acc_ops.flatten (depend on dim) + acc_ops.topk - + acc_ops.getitem + acc_ops.squeeze (depends on keepdim). - """ - input_node = node.kwargs["input"] - dim = node.kwargs["dim"] - keepdim = node.kwargs["keepdim"] - - if dim is None and keepdim: - raise RuntimeError( - "We currently don't support argmin/argmax with dim=None and keepdim=True" - ) - - with node.graph.inserting_before(node): - if dim is None: - flatten_kwargs = { - "input": node.kwargs["input"], - "start_dim": 0, - "end_dim": -1, - } - flatten_node = node.graph.call_function(flatten, kwargs=flatten_kwargs) - flatten_node.meta["type"] = torch.Tensor - input_node = flatten_node - dim = -1 - - topk_kwargs = { - "input": input_node, - "k": 1, - "dim": dim, - "largest": largest, - "sorted": False, - } - topk_node = node.graph.call_function(topk, kwargs=topk_kwargs) - # It's actually more like NamedTuple but tuple here should be fine. - topk_node.meta["type"] = tuple - - getitem_kwargs = {"input": topk_node, "idx": 1} - getitem_node = node.graph.call_function(getitem, kwargs=getitem_kwargs) - getitem_node.meta["type"] = torch.Tensor - output_node = getitem_node - - if not keepdim: - squeeze_kwargs = {"input": getitem_node, "dim": dim} - output_node = node.graph.call_function(squeeze, kwargs=squeeze_kwargs) - - output_node.meta = node.meta.copy() - return output_node - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.argmin), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim"), - ("keepdim", "keepdim"), - ], -) -def torch_argmin_mapper(node: torch.fx.Node, _: torch.nn.Module) -> torch.fx.Node: - """ - Map torch.argmin to acc_ops.flatten (depend on dim) + acc_ops.topk + acc_ops.getitem - + acc_ops.squeeze (depends on keepdim). 
- """ - return argmin_max_mapper_impl(node, largest=False) - - -@register_acc_op_mapping(op_and_target=("call_function", torch.linalg.norm)) -@register_acc_op -def linalg_norm(*, input, ord, dim, keepdim): - return torch.linalg.norm(input=input, ord=ord, dim=dim, keepdim=keepdim) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "split"), - arg_replacement_tuples=[ - ("tensor", "input"), - ("split_size_or_sections", "split_size_or_sections"), - ("dim", "dim"), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "split_with_sizes"), - arg_replacement_tuples=[ - ("tensor", "input"), - ("split_sizes", "split_size_or_sections"), - ("dim", "dim"), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.split), - arg_replacement_tuples=[ - ("tensor", "input"), - ("split_size_or_sections", "split_size_or_sections"), - ("dim", "dim"), - ], -) -def torch_split_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node: - """ - If split_size_or_sections is sections, map the node to slice_tensors - + tuple_construct. Otherwise, if split_size_or_sections is split_size, - map the node to acc_ops.split. - """ - split_size_or_sections = node.kwargs["split_size_or_sections"] - with node.graph.inserting_before(node): - if isinstance(split_size_or_sections, int): - new_kwargs = { - "input": node.kwargs["input"], - "split_size": split_size_or_sections, - "dim": node.kwargs["dim"], - } - new_node = node.graph.call_function(split, kwargs=new_kwargs) - new_node.meta = node.meta.copy() - return new_node - - assert isinstance(split_size_or_sections, Sequence) - start = 0 - slice_nodes = [] - for i in split_size_or_sections: - assert isinstance(i, int) - new_kwargs = { - "input": node.kwargs["input"], - "dim": node.kwargs["dim"], - "start": start, - "stop": start + i, - "step": 1, - } - new_node = node.graph.call_function(slice_tensor, kwargs=new_kwargs) - new_node.meta["type"] = torch.Tensor - slice_nodes.append(new_node) - start += i - - new_node = node.graph.call_function( - tuple_construct, kwargs={"tensors": tuple(slice_nodes)} - ) - new_node.meta = node.meta.copy() - return new_node - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def split(*, input, split_size, dim): - return torch.split(input, split_size, dim) - - -@register_acc_op -def tuple_construct(*, tensors): - return tuple(tensors) - - -@register_acc_op_properties(AccOpProperty.quantized) -@register_acc_op_mapping( - op_and_target=("call_function", torch.ops.quantized.batch_norm2d), - arg_replacement_tuples=[ - ("input", "input"), - ("weight", "weight"), - ("bias", "bias"), - ("running_mean", "running_mean"), - ("running_var", "running_var"), - ("eps", "eps"), - ("scale", "scale"), - ("zero_point", "zero_point"), - ], - kwargs_to_move_to_acc_out_ty=[ - ("scale", "scale", move_to_qparams), - ("zero_point", "zero_point", move_to_qparams), - ], -) -@register_acc_op -def quantized_batch_norm2d( - *, input, running_mean, running_var, weight, bias, eps, acc_out_ty -): - qparams = TensorMetadata(*acc_out_ty).qparams - return torch.ops.quantized.batch_norm2d( - input, - weight, - bias, - running_mean, - running_var, - eps, - qparams["scale"], - qparams["zero_point"], - ) - - -@register_acc_op_mapping(op_and_target=("call_function", nn.functional.embedding_bag)) -@register_acc_op -def embedding_bag( - *, - input, - weight, - offsets, - max_norm, - norm_type, - scale_grad_by_freq, - mode, - sparse, - per_sample_weights, - include_last_offset, - padding_idx, -): - return 
nn.functional.embedding_bag( - input=input, - weight=weight, - offsets=offsets, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - mode=mode, - sparse=sparse, - per_sample_weights=per_sample_weights, - include_last_offset=include_last_offset, - padding_idx=padding_idx, - ) - - -@register_acc_op_mapping( - op_and_target=( - "call_function", - torch.ops.quantized.embedding_bag_byte_rowwise_offsets, - ) -) -@register_acc_op -def embedding_bag_byte_rowwise_offsets( - *, - weight, - indices, - offsets, - scale_grad_by_freq, - mode, - pruned_weights, - per_sample_weights, - compressed_indices_mapping, - include_last_offset, -): - return torch.ops.quantized.embedding_bag_byte_rowwise_offsets( - weight=weight, - indices=indices, - offsets=offsets, - scale_grad_by_freq=scale_grad_by_freq, - mode=mode, - pruned_weights=pruned_weights, - per_sample_weights=per_sample_weights, - compressed_indices_mapping=compressed_indices_mapping, - include_last_offset=include_last_offset, - ) - - -@register_acc_op_mapping( - op_and_target=( - "call_function", - torch.ops.quantized.embedding_bag_4bit_rowwise_offsets, - ) -) -@register_acc_op -def embedding_bag_4bit_rowwise_offsets( - *, - weight, - indices, - offsets, - scale_grad_by_freq, - mode, - pruned_weights, - per_sample_weights, - compressed_indices_mapping, - include_last_offset, -): - return torch.ops.quantized.embedding_bag_4bit_rowwise_offsets( - weight=weight, - indices=indices, - offsets=offsets, - scale_grad_by_freq=scale_grad_by_freq, - mode=mode, - pruned_weights=pruned_weights, - per_sample_weights=per_sample_weights, - compressed_indices_mapping=compressed_indices_mapping, - include_last_offset=include_last_offset, - ) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.sin)) -@register_acc_op -def sin(*, input): - return torch.sin(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.cos)) -@register_acc_op -def cos(*, input): - return torch.cos(input=input) - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.tan)) -@register_acc_op -def tan(*, input): - return torch.tan(input=input) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.topk)) -@register_acc_op -def topk(*, input, k, dim, largest, sorted): - return torch.topk(input=input, k=k, dim=dim, largest=largest, sorted=sorted) - - -@register_acc_op_mapping(op_and_target=("call_function", operator.getitem)) -@register_acc_op -def getitem(*, input, idx): - return input[idx] - - -@register_acc_op_mapping(op_and_target=("call_function", torch.nan_to_num)) -@register_acc_op_mapping(op_and_target=("call_method", "nan_to_num")) -@register_acc_op -def nan_to_num(*, input, nan=0.0, posinf=None, neginf=None): - return torch.nan_to_num(input, nan=nan, posinf=posinf, neginf=neginf) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op -def slice_tensor(*, input, dim, start, stop, step): - slc = slice(start, stop, step) - if dim >= 0: - slices: List[slice] = [slice(None, None, None) for _ in range(dim)] - slices.append(slc) - else: - slices = [Ellipsis, slc] # type: ignore[list-item] - slices.extend([slice(None, None, None) for _ in range(-dim - 1)]) - - return input[tuple(slices)] - - 
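A minimal sketch of the index tuples acc_ops.slice_tensor (above) builds for a positive and a negative dim; the example input and the shapes in the comments are illustrative only:

import torch

x = torch.arange(24).reshape(2, 3, 4)

# dim=1, start=0, stop=2, step=1 -> (slice(None), slice(0, 2, 1)),
# equivalent to x[:, 0:2]; result shape (2, 2, 4)
pos = x[(slice(None, None, None), slice(0, 2, 1))]

# dim=-1 -> (Ellipsis, slice(0, 2, 1)), equivalent to x[..., 0:2];
# result shape (2, 3, 2)
neg = x[(Ellipsis, slice(0, 2, 1))]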
-@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.narrow), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim"), - ("start", "start"), - ("length", "length"), - ], -) -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "narrow"), - arg_replacement_tuples=[ - ("input", "input"), - ("dim", "dim"), - ("start", "start"), - ("length", "length"), - ], -) -def custom_narrow_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node: - assert isinstance(node.kwargs["start"], int) and isinstance( - node.kwargs["length"], int - ) - kwargs = { - "input": node.kwargs["input"], - "dim": node.kwargs["dim"], - "start": node.kwargs["start"], - "stop": node.kwargs["start"] + node.kwargs["length"], - "step": 1, - } - with node.graph.inserting_before(node): - new_node = node.graph.call_function(slice_tensor, kwargs=kwargs) - new_node.meta = node.meta.copy() - return new_node - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping( - op_and_target=("call_function", torch.reshape), - arg_replacement_tuples=[ - ("input", "input"), - ("shape", "shape"), - ], - kwargs_to_move_to_acc_out_ty=[("shape", "shape")], -) -@register_acc_op_mapping( - op_and_target=("call_method", "view"), - arg_replacement_tuples=[ - ("input", "input"), - ("*", "shape"), - ], - kwargs_to_move_to_acc_out_ty=[("shape", "shape")], -) -@register_acc_op -def reshape(*, input, acc_out_ty=None): - assert acc_out_ty is not None - return input.reshape(TensorMetadata(*acc_out_ty).shape) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "reshape"), - arg_replacement_tuples=[ - ("input", "input"), - ("*", "shape"), - ], -) -def custom_tensor_reshape_mapper(node: torch.fx.Node, _: nn.Module) -> torch.fx.Node: - """ - For Tensor.reshape node, args could be (input, 1, 2, 3) or (input, (1, 2, 3)). - Here we do some special handling with the `shape` arg in order to map it to - acc_ops.reshape. It also handles the case when `shape` is a list instead of - tuple. 
- """ - input_node = node.kwargs["input"] - shape = node.kwargs["shape"] - - assert isinstance(shape, Sequence) - if isinstance(shape[0], (tuple, list)): # type: ignore[index] - shape = shape[0] # type: ignore[index] - - with node.graph.inserting_before(node): - new_node = node.graph.call_function( - reshape, - kwargs={ - "input": input_node, - "acc_out_ty": acc_utils.build_raw_tensor_meta(shape=shape), - }, - ) - new_node.meta = node.meta.copy() - return new_node - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op -def to_dtype(input, acc_out_ty=None): - assert acc_out_ty is not None - return input.to(dtype=TensorMetadata(*acc_out_ty).dtype) - - -@register_custom_acc_mapper_fn( - op_and_target=("call_method", "to"), - arg_replacement_tuples=[ - ("input", "input"), - ("dtype", "dtype"), - ], -) -def custom_tensor_to_mapper(node: torch.fx.Node, _: nn.Module): - dest_dtype = node.kwargs["dtype"] - mem_format = node.kwargs.get("memory_format") - device = node.kwargs.get("device") - assert dest_dtype is not None - assert mem_format is None or mem_format == torch.preserve_format - assert device is None - - new_kwargs = { - "input": node.kwargs["input"], - "acc_out_ty": acc_utils.build_raw_tensor_meta(dtype=dest_dtype), - } - - with node.graph.inserting_before(node): - new_node = node.graph.create_node( - "call_function", to_dtype, kwargs=new_kwargs, name=node.name - ) - new_node.meta = node.meta - return new_node - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.add), - # Note that we may have aliases for inputs here due to issues with deterministically - # knowing the correct target that will be resolved by pytorch. - arg_replacement_tuples=[ - (("input", "a"), "input"), - (("other", "b"), "other"), - ("alpha", "alpha", this_arg_is_optional), - ], -) -def custom_torch_add_mapper(node: torch.fx.Node, mod: nn.Module) -> torch.fx.Node: - """ - Add custom mapping for torch.add because it has an `alpha` parameter which scales - the `other` input, and we want to make that mul a separate node. - """ - with node.graph.inserting_before(node): - # If alpha is in kwargs check if we need to add a mul, and use correct kwargs. - if "alpha" in node.kwargs: - # Add mul node only if it has a numerical impact, i.e. alpha != 1.0. - if node.kwargs["alpha"] != 1.0: - other_node = node.graph.create_node( - "call_function", - mul, - kwargs={ - "input": node.kwargs["other"], - "other": node.kwargs["alpha"], - }, - name=node.name + "_mul_alpha", - ) - other_node.meta = node.meta - else: - other_node = node.kwargs["other"] - add_kwargs = {"input": node.kwargs["input"], "other": other_node} - else: - add_kwargs = node.kwargs - - new_node = node.graph.create_node( - "call_function", add, kwargs=add_kwargs, name=node.name - ) - new_node.meta = node.meta - return new_node - - -@register_custom_acc_mapper_fn( - op_and_target=("call_module", nn.quantized.Linear), - arg_replacement_tuples=[ - ("input", "input"), - ], -) -def packed_quantized_linear_mapper( - node: torch.fx.Node, mod: nn.Module -) -> torch.fx.Node: - """ - Mapping from quantized_linear module to acc_op.linear. We unpack weight and bias - in this mapper and pass them directly to linear node. 
- """ - assert isinstance(node.target, str) - linear_module = dict(mod.named_modules())[node.target] - prefix = node.target.replace(".", "_") - weight_name = f"{prefix}_weight" - bias_name = f"{prefix}_bias" - - # Store weight and bias in the main module - mod.register_buffer(weight_name, linear_module.weight()) - if linear_module.bias() is not None: - mod.register_buffer(bias_name, linear_module.bias()) - - with node.graph.inserting_before(node): - # Insert get_attr nodes for weight and bias - get_weight = node.graph.get_attr(weight_name) - get_weight.meta["tensor_meta"] = _extract_tensor_metadata( - linear_module.weight() - ) - - get_bias = None - if linear_module.bias() is not None: - get_bias = node.graph.get_attr(bias_name) - get_bias.meta["tensor_meta"] = _extract_tensor_metadata( - linear_module.bias() - ) - - qparams = {"scale": linear_module.scale, "zero_point": linear_module.zero_point} - # Create kwargs for acc_op.quantized_linear - kwargs = { - "input": node.kwargs["input"], - "weight": get_weight, - "bias": get_bias, - "acc_out_ty": acc_utils.build_raw_tensor_meta(qparams=qparams), - } - - new_node = node.graph.call_function(quantized_linear, kwargs=kwargs) - new_node.meta = node.meta - return new_node - - -@register_custom_acc_mapper_fn( - op_and_target=("call_module", nn.quantized.Conv2d), - arg_replacement_tuples=[ - ("input", "input"), - ], -) -def packed_quantized_conv2d_mapper( - node: torch.fx.Node, mod: nn.Module -) -> torch.fx.Node: - """ - Mapping from quantzed Conv2d module to acc_op.conv. We unpack all the parameters - in this mapper and pass them directly to conv2d node. - """ - assert isinstance(node.target, str) - conv_module = dict(mod.named_modules())[node.target] - prefix = node.target.replace(".", "_") - weight_name = f"{prefix}_weight" - bias_name = f"{prefix}_bias" - - # Store weight and bias in the main module - mod.register_buffer(weight_name, conv_module.weight()) - if conv_module.bias() is not None: - mod.register_buffer(bias_name, conv_module.bias()) - - with node.graph.inserting_before(node): - # Insert get_attr nodes for weight and bias - get_weight = node.graph.get_attr(weight_name) - get_weight.meta["tensor_meta"] = _extract_tensor_metadata(conv_module.weight()) - - get_bias = None - if conv_module.bias() is not None: - get_bias = node.graph.get_attr(bias_name) - get_bias.meta["tensor_meta"] = _extract_tensor_metadata(conv_module.bias()) - - qparams = {"scale": conv_module.scale, "zero_point": conv_module.zero_point} - - # Create kwargs for acc_op.conv - kwargs = { - "input": node.kwargs["input"], - "weight": get_weight, - "bias": get_bias, - "stride": conv_module.stride, - "padding": conv_module.padding, - "dilation": conv_module.dilation, - "groups": conv_module.groups, - "padding_mode": conv_module.padding_mode, - "acc_out_ty": acc_utils.build_raw_tensor_meta(qparams=qparams), - } - - new_node = node.graph.call_function(quantized_conv2d, kwargs=kwargs) - new_node.meta = node.meta - return new_node - - -@register_custom_acc_mapper_fn( - op_and_target=("call_function", torch.ops.quantized.add_relu), - arg_replacement_tuples=[ - ("input", "input"), - ("other", "other"), - ("scale", "scale"), - ("zero_point", "zero_point"), - ], -) -def add_relu_unfuse_mapper( - node: torch.fx.Node, mod: torch.fx.GraphModule -) -> torch.fx.Node: - with node.graph.inserting_before(node): - qparams = { - "scale": node.kwargs["scale"], - "zero_point": node.kwargs["zero_point"], - } - add_kwargs = { - "input": node.kwargs["input"], - "other": node.kwargs["other"], - 
"acc_out_ty": acc_utils.build_raw_tensor_meta(qparams=qparams), - } - add_node = node.graph.call_function(quantized_add, kwargs=add_kwargs) - add_node.meta = node.meta.copy() - - relu_node = node.graph.call_function( - relu, kwargs={"input": add_node, "inplace": False} - ) - relu_node.meta = node.meta - return relu_node - - -@register_custom_acc_mapper_fn( - op_and_target=("call_module", nn.intrinsic.quantized.ConvReLU2d), - arg_replacement_tuples=[ - ("input", "input"), - ], -) -def packed_quantized_convrelu2d_mapper( - node: torch.fx.Node, mod: nn.Module -) -> torch.fx.Node: - """ - Mapping from quantized ConvReLU2d module to acc_op.relu. We use packed_quantized_conv2d_mapper to unpack all the parameters - in this mapper and pass the returned conv2d node directly to relu node. - """ - - with node.graph.inserting_before(node): - # conv2d op - conv2d_node = packed_quantized_conv2d_mapper(node, mod) - - # relu op - relu_node = node.graph.call_function( - relu, kwargs={"input": conv2d_node, "inplace": False} - ) - relu_node.meta = node.meta - return relu_node - - -@register_acc_op_properties(AccOpProperty.pointwise, AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.nn.functional.gelu)) -@register_acc_op_mapping(op_and_target=("call_method", "gelu")) -@register_acc_op -def gelu(*, input): - return torch.nn.functional.gelu(input=input) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.cumsum)) -@register_acc_op_mapping(op_and_target=("call_method", "cumsum")) -@register_acc_op -def cumsum(*, input, dim, dtype=None): - return torch.cumsum(input=input, dim=dim, dtype=dtype) - - -@register_acc_op_properties(AccOpProperty.unary) -@register_acc_op_mapping(op_and_target=("call_function", torch.chunk)) -@register_acc_op_mapping(op_and_target=("call_method", "chunk")) -@register_acc_op -def chunk(*, input, chunks, dim=0): - return torch.chunk(input=input, chunks=chunks, dim=dim) diff --git a/torch/fx/experimental/fx_acc/acc_tracer.py b/torch/fx/experimental/fx_acc/acc_tracer.py deleted file mode 100644 index 352b7161170..00000000000 --- a/torch/fx/experimental/fx_acc/acc_tracer.py +++ /dev/null @@ -1,462 +0,0 @@ -import ast -import builtins -import copy -import inspect -import logging -import textwrap -import warnings -from types import FunctionType -from typing import Dict, Optional, Any, Type, Tuple, Set, List - -import torch.fx.experimental.fx_acc.acc_normalizer as acc_normalizer -import torch.fx.experimental.fx_acc.acc_ops # noqa: F401 -import torch -import torch.jit as jit -import torch.nn as nn -from torch._sources import normalize_source_lines -from torch.fx import Graph, Tracer -from torch.fx.experimental.normalize import NormalizeArgs -from torch.fx.passes import shape_prop - - -_LOGGER = logging.getLogger(__name__) - - -def _get_exception_wrapper_attr_name(exc_type: Type[Exception]) -> str: - return f"_conditional_exception_wrapper_{exc_type.__name__}" - - -class Acc_Rewriter(ast.NodeTransformer): - """ - Take a FunctionType object representing a `forward` method, then - perform an AST rewrite to swap out nodes that are not symbolically - traceable with a callsite to the FX alternative. - - To support swapping out an AST node, define a new `visit` method on - that node. 
For more details, see: - https://docs.python.org/3/library/ast.html#ast.NodeTransformer - """ - - def __init__(self): - super().__init__() - self.exceptions_rewritten: Set[Type[Exception]] = set() - - def rewrite(self, fn: FunctionType) -> Tuple[FunctionType, Set[Type[Exception]]]: - - # Normalize the source lines - sourcelines, _ = inspect.getsourcelines(fn) - sourcelines = normalize_source_lines(sourcelines) - source = "".join(sourcelines) - normalized_str = textwrap.dedent(source) - - # Rewrite the original AST - source_ast = ast.parse(normalized_str) - dest_ast = ast.fix_missing_locations(self.visit(source_ast)) - - # Pull out the compiled function from the newly-created Module - code = compile(dest_ast, "", "exec") - globals_dict = copy.copy(fn.__globals__) - keys_before = set(globals_dict.keys()) - exec(code, globals_dict) - new_keys = list(set(globals_dict.keys()) - keys_before) - assert len(new_keys) <= 1 - fn_compiled = globals_dict[fn.__name__] - - # Return the correct FunctionType object and the Exceptions that were - # rewritten during visit_If. - return fn_compiled, self.exceptions_rewritten - - def visit_Assert(self, node: ast.Assert): - """ - Swap out the Assert node (Python's `assert`) with a callsite to the - symbolically-traceable torch._assert function - """ - # Create the Call node - n = ast.parse("torch._assert()", mode="eval") - assert isinstance(n, ast.Expression) - call_node = n.body - assert isinstance(call_node, ast.Call) - msg = node.msg if node.msg else ast.Constant(value="", kind=None) - call_node.args = [node.test, msg] - - # Ensure that the new node conforms to the Python AST grammar - expr_wrapper = ast.Expr(value=call_node) - - # Return the new Call node to signify that we want to use it as - # a replacement for the original _assert node - return ast.copy_location(expr_wrapper, node) - - def visit_If(self, if_node: ast.If): - """ - Swap out the pattern `If(x): Raise(y)` with a ConditionalExceptionWrapper - specialized for the specific exception y. The specialized - ConditionalExceptionWrapper module will be added in the RewrittenModule. - Only works with builtin Exceptions, as we assume the signature of the - init for the Exception is a string. - """ - raise_node = if_node.body[0] - if not isinstance(raise_node, ast.Raise): - return if_node - - # Don't handle orelse for now. - # TODO: Move orelse to the body after calling ConditionalExceptionWrapper. - if len(if_node.orelse) != 0: - return if_node - - def _reuse_loc(node): - return ast.copy_location(node, if_node) - - # If the exception has a message then we expect the raise's exc to be a - # Call w/ a msg. Else if it's a exc Name then there's no msg to use. - node_for_exc = raise_node.exc - if isinstance(node_for_exc, ast.Name): - # E.g. `raise AssertionError`, i.e. without an exc_msg. - name_node_of_exc = node_for_exc - exc_msg = _reuse_loc(ast.Constant(None)) - elif isinstance(node_for_exc, ast.Call): - # E.g. `raise AssertionError("error message")` - name_node_of_exc = node_for_exc.func # type: ignore[assignment] - if not isinstance(name_node_of_exc, ast.Name): - return if_node - # Most assertions just take a single string arg, but some may not; skip - # handling such assertions for now. - if len(node_for_exc.args) != 1: - return if_node - exc_msg = node_for_exc.args[0] - else: - return if_node - - # Convert what we expect is the name of the exception into its - # associated python class. 
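-        # e.g. an ast.Name with id "AssertionError" resolves via the eval
-        # below to the builtin AssertionError class.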
- name_of_exc = name_node_of_exc.id - try: - exc_type = eval(name_of_exc) - except Exception: - return if_node - - # Check that we actually have a builtin exception. - if ( - not issubclass(exc_type, Exception) - or getattr(getattr(exc_type, "__class__", None), "__module__", None) - != "builtins" - ): - return if_node - - # We need a ConditionalExceptionWrapper specialized for every kind of - # exception, so add it to exceptions_rewritten to remember for later to - # add a specialized attr with it. - self.exceptions_rewritten.add(exc_type) - - # From here we definitely should be able to do the replacement. Create a - # Call node to the ConditionalExceptionWrapper module we're replacing - # the If with, with args set as the If's condition and the string of the - # exception. The call to the self._conditional_exception_wrapper_*Error - # module is safe because the RewrittenModule will add it as an attr - # based on the returned exceptions_rewritten, and we assume we are - # currently modifying the AST of a method from a RewrittenModule. - exc_wrapper_node = ast.parse( - f"self.{_get_exception_wrapper_attr_name(exc_type)}()", mode="eval" - ) - assert isinstance(exc_wrapper_node, ast.Expression) - exc_wrapper_call_node = exc_wrapper_node.body - assert isinstance(exc_wrapper_call_node, ast.Call) - exc_wrapper_call_node.args = [if_node.test, exc_msg] - - # Ensure that the new node conforms to the Python AST grammar - expr_wrapper = _reuse_loc(ast.Expr(_reuse_loc(exc_wrapper_call_node))) - - # Return the new node to signify that we want to use it as a replacement - # for the original `If x: Raise y` pattern. - return expr_wrapper - - -class ConditionalExceptionWrapper(nn.Module): - """ - This wrapper class is used to wrap conditional raising of exceptions during - rewriting. For example: - - .. code-block:: python - - if self.name != "x": - raise AssertionError(f"Name was not x: {self.name}") - - Is rewritten into - - .. code-block:: python - - self._conditional_exception_wrapper_AssertionError( - self.name != "x", f"Name was not x: {self.name}" - ) - - Note that __init__ takes the Exception class that it is wrapping, while - forward takes the condition to check and the message for the exception. - - """ - - # Mark as impure so that calls to it will not be removed during DCE. - _is_impure = True - - def __init__(self, exc: Type[Exception]): - super().__init__() - self.exc = exc - - def forward(self, cond: bool, msg: str): - if cond: - raise self.exc if msg is None else self.exc(msg) - - -# Custom tracer that traces to the functional level and rewrites asserts and -# exceptions. -class AccRewritingTracer(Tracer): - # Add an explicit check for mutable operations, which break symbolic tracing. - check_mutable_operations = True - - # Note: Treat ConditionalExceptionWrapper as a leaf so that we don't - # trace into it, because it contains control flow and raises an exception. 
- DEFAULT_LEAF_MODULE_LIST = { - ConditionalExceptionWrapper, - torch.nn.quantized.Linear, - torch.nn.quantized.Conv2d, - torch.nn.intrinsic.quantized.ConvReLU2d, - jit.ScriptModule, - jit.RecursiveScriptModule, - } - - def is_leaf_module(self, m: nn.Module, mod_qual_name: str) -> bool: - return getattr(m, "_base_class_origin", type(m)) in self.leaf_module_list - - def trace( - self, - root: nn.Module, - concrete_args: Optional[Dict[str, Any]] = None, - ast_rewriter_allow_list: Optional[Set] = None, - leaf_module_list: Optional[Set] = None, - ) -> Tuple[Graph, nn.Module]: - self.leaf_module_list = self.DEFAULT_LEAF_MODULE_LIST - if leaf_module_list: - self.leaf_module_list.update(leaf_module_list) - rewritten = _rewrite(root, ast_rewriter_allow_list, self.leaf_module_list) - return super().trace(rewritten, concrete_args), rewritten - - -# List of modules that need rewriting to be supported for tracing. -DEFAULT_REWRITE_ALLOW_LIST = { - nn.BatchNorm1d, - nn.BatchNorm2d, - nn.BatchNorm3d, -} - - -def _rewrite(mod_to_rewrite: nn.Module, allow_list: Optional[Set] = None, leaf_module_list: Optional[Set] = None) -> nn.Module: - if allow_list is None: - allow_list = DEFAULT_REWRITE_ALLOW_LIST - else: - allow_list = allow_list.union(DEFAULT_REWRITE_ALLOW_LIST) - - if not leaf_module_list: - leaf_module_list = set() - - # Rewrite this module's functions as well as all recursive modules' - # functions that are attrs of this moodule. Return the new, rewritten module - # hierarchy. - def rewrite_module(m: nn.Module): - if isinstance(m, jit.ScriptModule): - # ScriptModule cannot be rewritten, so bypass it. The issue is it - # requires explicitly calling its `__init__()`, calling - # `nn.Module.__init__()` in the derived `RewrittenModule` is not - # enough. And even if we init it we can't do much with it. - return m - - # If m is an already-rewritten RewrittenModule, then use the original base class. - base_class : Type[nn.Module] = getattr(m, "_base_class_origin", type(m)) - - # Keep track of all the ConditionalExceptionWrappers that the - # Acc_Rewriter calls into in this module so we can add them in init - # below. - all_added_wrappers: Set[Type[Exception]] = set() - - # Note: Make this a subclass of our base class. - class RewrittenModule(base_class): # type: ignore[valid-type, misc] - # Keep track of the base_class so that symbolic tracing can - # determine what kind of module this originally was later on. - _base_class_origin = base_class - # Add suffix to qualname so it's easier to debug the origin of this module. - __qualname__ = f"{base_class.__qualname__}__AccRewrittenModule" - - # Write all of the non-dunder or special methods from base_class - # into RewrittenModule. - for method_name in dir(base_class): - method = getattr(base_class, method_name, None) - if method is None: - _LOGGER.warning(f"{__qualname__} does not have attribute {method_name}") - - if builtins.type(method) is not FunctionType: - continue - - # Always skip rewriting dunder methods, as they haven't (yet) been - # problematic, and modifying them has caused issues previously. - if method_name.startswith("__") and method_name.endswith("__"): - continue - - # Only rewrite those Modules explicitly in the allow_list. 
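-                # e.g. nn.BatchNorm2d is in DEFAULT_REWRITE_ALLOW_LIST, so its
-                # methods go through Acc_Rewriter; methods of classes outside
-                # the allow list are copied over unchanged.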
- assert allow_list is not None - if base_class not in allow_list: - vars()[method_name] = method - else: - vars()[method_name], added_wrappers = Acc_Rewriter().rewrite(method) - all_added_wrappers.update(added_wrappers) - - def __init__(self, orig): - nn.Module.__init__(self) - - # Iterate over all added exception wrappers and add - # ConditionalExceptionWrapper attrs for each. - for exc_type in all_added_wrappers: - wrapper_name = _get_exception_wrapper_attr_name(exc_type) - assert not hasattr(self, wrapper_name) - setattr( - self, - wrapper_name, - ConditionalExceptionWrapper(exc_type), - ) - # Recursively rewrite and copy all module attrs of this module. - for k, v in orig.__dict__.items(): - if k == "_modules": - for mod_k, mod_v in v.items(): - if getattr(mod_v, "_base_class_origin", type(mod_v)) in leaf_module_list: # type: ignore[operator] - print(f"Skip rewriting leaf module {type(mod_v)}") - self._modules[mod_k] = mod_v - else: - self._modules[mod_k] = rewrite_module(mod_v) - else: - self.__dict__[k] = v - - # Add suffix to name so it's easier to debug the origin of this module. - RewrittenModule.__name__ = f"{base_class.__name__}__AccRewrittenModule" - return RewrittenModule(m) - - return rewrite_module(mod_to_rewrite) - - -def _remove_assertions(gm: torch.fx.GraphModule) -> bool: - """ - Unconditionally removes all assertions found in GraphModule gm. - Returns whether the graph is modified. - """ - changed = False - for node in gm.graph.nodes: - if node.op == "call_function" and node.target == torch._assert: - gm.graph.erase_node(node) - changed = True - return changed - - -def _remove_exceptions(gm: torch.fx.GraphModule) -> bool: - """ - Unconditionally removes all call_modules to ConditionalExceptionWrappers - found in GraphModule gm. Returns whether the graph is modified. - """ - changed = False - for node in gm.graph.nodes: - if node.op == "call_module" and isinstance( - gm.get_submodule(node.target), ConditionalExceptionWrapper - ): - gm.graph.erase_node(node) - changed = True - return changed - - -def trace( - mod: nn.Module, - sample_inputs: List[torch.Tensor], - remove_assertions: bool = True, - remove_exceptions: bool = True, - use_acc_normalization: bool = True, - ast_rewriter_allow_list: Optional[Set[Type[nn.Module]]] = None, - leaf_module_list: Optional[Set[Type[nn.Module]]] = None, -) -> torch.fx.GraphModule: - """ - Performs tracing and arg normalization specialized for accelerator lowering. - - It first rewrites the AST of the module's methods (and all attr methods - recursively) to transform un-tracable parts of the module to make them - traceable. - - It then traces to the functional level so that optimizations and backend - accelerator importers have the ability to see and/or change inputs to each - op. - - It then removes assertions and exception wrappers found during symbolic - tracing if requested based on remove_assertions and remove_exceptions - - Dead code is then eliminated, which will e.g. remove any nodes that were - only used by assertions or exceptions if they were removed. - - It then performs normalization on args/kwargs, aligning any arg that can be - moved to kwarg to be so, and then making default values explicit. - - Args: - - mod (Module): The module to transform and trace. - - sample_inputs (Tuple[Union[torch.Tensor, List[torch.Tensor]]]): - Sample inputs with which to run shape prop. - - remove_assertions (bool): Whether to remove assertion nodes from - the graph after symbolic tracing. 
- - remove_exceptions (bool): Whether to remove exception wrapper nodes - from the graph after symbolic tracing. - - use_acc_normalization (bool): Whether to use acc-specific - normalization to all acc_ops. - - ast_rewriter_allow_list (Optional[Set[nn.Module]]): Optional allow list of - modules that need AST rewriting. - - leaf_module_list (Optional[Set[nn.Module]]): Optional leaf module list where - modules will not be traced into. - - """ - if mod.training: - warnings.warn( - "acc_tracer does not support currently support models for training." - " Calling eval on model before tracing." - ) - mod.eval() - - # Rewrite the module to make it symbolic traceable, and then trace it. - rewritten_graph, rewritten_mod = AccRewritingTracer().trace( - mod, - ast_rewriter_allow_list=ast_rewriter_allow_list, - leaf_module_list=leaf_module_list, - ) - - assert isinstance(rewritten_mod, nn.Module) - # Note: use the rewritten_mod here as the root. This is necessary because - # RewrittenModule includes a new module for the ConditionalExceptionWrapper. - traced = torch.fx.GraphModule(rewritten_mod, rewritten_graph) - - # Now remove all assertions and exceptions if requested. - if remove_assertions: - _remove_assertions(traced) - if remove_exceptions: - _remove_exceptions(traced) - - # Cleanup any dead code from the original module as well as resulting dead - # nodes after removing assertions and exceptions. - traced.graph.eliminate_dead_code() - - # Now normalize args/kwargs to make default values visible. Leave args/kwargs as - # they were, since all-kwarg normalization is broken, and we don't need it anyway. - shape_prop.ShapeProp(traced).propagate(*sample_inputs) - traced = NormalizeArgs(traced, normalize_to_only_use_kwargs=False).transform() - - # Normalize to acc-specialized wrappers for consistency across op naming and - # ensuring all kwarg usage. - if use_acc_normalization: - acc_normalizer.normalize(traced) - - traced.recompile() - - return traced diff --git a/torch/fx/experimental/fx_acc/acc_utils.py b/torch/fx/experimental/fx_acc/acc_utils.py deleted file mode 100644 index 31613ebb775..00000000000 --- a/torch/fx/experimental/fx_acc/acc_utils.py +++ /dev/null @@ -1,175 +0,0 @@ -import inspect -import json -import os -from typing import Any, Tuple, Callable, Union, Dict, List, Optional -import re - -import torch -import torch.fx -from torch.fx.passes.graph_manipulation import ( - serialize_module, -) -from torch.fx.graph_module import GraphModule -from torch.fx.node import _get_qualified_name -from torch.fx.passes import graph_drawer -from torch.fx.passes.shape_prop import TensorMetadata - - -def get_target_from_module(mod: torch.nn.Module, target: str): - """ - Gets `target` from `mod` and returns it. If `target` is empty then returns `mod.` - """ - if target == "": - return mod - - target_atoms = target.split(".") - curr_obj = mod - for i, atom in enumerate(target_atoms): - if not hasattr(curr_obj, atom): - raise RuntimeError( - f"Node referenced nonexistent target '{'.'.join(target_atoms[:i])}'; " - f" original whole target: '{target}'" - ) - curr_obj = getattr(curr_obj, atom) - return curr_obj - - -def get_attr(node: torch.fx.Node) -> Any: - """ - Returns the underlying attr for a given node which - must be of type get_attr. - """ - assert node.op == "get_attr", "Expected a get_attr node" - return get_target_from_module(node.graph.owning_module, str(node.target)) - - -def is_acc_op(node_or_target: Union[Callable, torch.fx.Node]) -> bool: - """ - Returns whether `node_or_target` is an acc_op. 
If it's a node, then checks whether - it's a call_function target is from the acc_ops module. Otherwise it's already - the target, which is similarly checked to see if it's from the acc_ops module. - """ - if isinstance(node_or_target, torch.fx.Node): - # All acc_ops are call_functions. - if node_or_target.op != "call_function": - return False - target = node_or_target.target - else: - target = node_or_target - return "acc_ops" in target.__module__ - - -def is_acc_op_with_kwarg( - node_or_target: Union[Callable, torch.fx.Node], kwarg: str -) -> bool: - """ - Helper that inspects `node_or_target` and returns whether it is an acc_op node - (or a target for an acc_op) that has an arg signature that includes `kwarg`. - """ - if not is_acc_op(node_or_target): - return False - - target = ( - node_or_target.target - if isinstance(node_or_target, torch.fx.Node) - else node_or_target - ) - assert not isinstance(target, str) - return kwarg in inspect.signature(inspect.unwrap(target)).parameters - - -def serialize_module_json_to_file(fx_module: GraphModule, fname: str): - weights: Dict = {} - serialized_json = json.dumps(serialize_module(fx_module, weights), indent=2) - with open(fname, "w") as ofile: - ofile.write(serialized_json) - - -def build_raw_tensor_meta( - shape=None, - dtype=None, - requires_grad=None, - stride=None, - memory_format=None, - is_quantized=None, - qparams=None, -): - return TensorMetadata(**locals()) - - -def draw_graph(traced: torch.fx.GraphModule, fname: str, figname: str = "fx_graph"): - base, ext = os.path.splitext(fname) - if not ext: - ext = ".svg" - print(f"Writing FX graph to file: {base}{ext}") - g = graph_drawer.FxGraphDrawer(traced, figname) - x = g.get_main_dot_graph() - try: - getattr(x, "write_" + ext.lstrip("."))(fname) - except OSError as e: - print(f"Failed to write the FX graph due to: {e}") - - -def get_model_info_str(gm: torch.fx.GraphModule, header: Optional[str] = None): - """ - Print out info of the provided `gm`. - If `header` is provided then it's included in the printed string. - """ - ops_and_counts: Dict[Callable, int] = dict() - placeholder_count = get_attr_count = call_method_count = call_module_count = 0 - for node in gm.graph.nodes: - if node.op == "call_function": - ops_and_counts[node.target] = ops_and_counts.get(node.target, 0) + 1 - elif node.op == "placeholder": - placeholder_count += 1 - elif node.op == "get_attr": - get_attr_count += 1 - elif node.op == "call_method": - call_method_count += 1 - elif node.op == "call_module": - call_module_count += 1 - elif node.op == "output": - output_count = len(node.args[0]) if isinstance(node.args[0], tuple) else 1 - else: - raise RuntimeError(f"Unknown node found: {node.format_node()}") - - header = "" if header is None else f" [{header}]" - model_info_str = f"Model Info{header}:\n" - model_info_str += f"> placeholder: {placeholder_count}\n" - model_info_str += f"> get_attr: {get_attr_count}\n" - model_info_str += f"> output: {output_count}\n" - if call_module_count != 0: - model_info_str += f"> WARNING: call_module: {call_module_count}" - if call_method_count != 0: - model_info_str += f"> WARNING: call_method: {call_method_count}" - - # Sort and print all the other ops. Sort so it's deterministic between runs and - # easier to parse. 
- pretty_ops_and_counts: List[Tuple[str, int]] = [] - for op, count in ops_and_counts.items(): - pretty_ops_and_counts.append((_get_qualified_name(op), count)) - pretty_ops_and_counts.sort() - for op_str, count in pretty_ops_and_counts: - model_info_str += f"> {op_str}: {count}\n" - - return model_info_str - - -def get_unique_attr_name_in_module(mod_traced: torch.fx.GraphModule, name: str) -> str: - """ - Make sure the name is unique (in a module) and can represents an attr. - """ - # Delete all characters that are illegal in a Python identifier. - name = re.sub("[^0-9a-zA-Z_]+", "_", name) - if name[0].isdigit(): - name = f"_{name}" - # Now make sure it is in fact unique to the module by incrementing suffix value. - while hasattr(mod_traced, name): - match = re.match(r"(.*)_(\d+)$", name) - if match is None: - name = name + "_1" - else: - base, num = match.group(1, 2) - name = f"{base}_{int(num) + 1}" - - return name diff --git a/torch/fx/passes/graph_manipulation.py b/torch/fx/passes/graph_manipulation.py index a33696e74c5..4c429e42421 100644 --- a/torch/fx/passes/graph_manipulation.py +++ b/torch/fx/passes/graph_manipulation.py @@ -383,7 +383,7 @@ def serialize_module(fx_module: GraphModule, weights: Dict, name_prefix="") -> D # so we check if the users of this get_attr is a quantized EB and this is the weight for the EB. user_targets = { _get_qualified_name(n.target) - .replace("torch.fx.experimental.fx_acc.", "") + .replace("fx2trt_oss.tracer.acc_tracer.", "") .replace("glow.fb.fx.", ""): n for n in node.users.keys() } diff --git a/torch/testing/_internal/common_fx2trt.py b/torch/testing/_internal/common_fx2trt.py index 79eab0d68d1..5d50d78e186 100644 --- a/torch/testing/_internal/common_fx2trt.py +++ b/torch/testing/_internal/common_fx2trt.py @@ -3,7 +3,7 @@ from typing import Callable, List, Tuple import torch import torch.fx -import torch.fx.experimental.fx_acc.acc_tracer as acc_tracer +import fx2trt_oss.tracer.acc_tracer.acc_tracer as acc_tracer from fx2trt_oss.fx import ( TRTInterpreter, InputTensorSpec, From 1b089292df265406946d7394781a79b2f692e512 Mon Sep 17 00:00:00 2001 From: Ilya Persky Date: Tue, 15 Feb 2022 08:33:59 -0800 Subject: [PATCH 046/199] Fix test failure when compiled without LAPACK support (#70671) Summary: Fixes https://github.com/pytorch/pytorch/issues/70670 Pull Request resolved: https://github.com/pytorch/pytorch/pull/70671 Reviewed By: H-Huang Differential Revision: D34242339 Pulled By: janeyx99 fbshipit-source-id: 8cd13c13588007c60e9c3f17dbf707dcfa2e0e04 (cherry picked from commit cf6dbe3e819aef3a3dca8801de5c9bc1e8884e22) --- test/run_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/run_test.py b/test/run_test.py index 2b772bc6f36..de8a4f90775 100644 --- a/test/run_test.py +++ b/test/run_test.py @@ -373,6 +373,11 @@ DISTRIBUTED_TESTS = [ "distributed/_shard/sharded_optim/test_sharded_optim", ] + [test for test in TESTS if test.startswith("distributed/fsdp")] +TESTS_REQUIRING_LAPACK = [ + "distributions/test_constraints", + "distributions/test_distributions", +] + # Dictionary matching test modules (in TESTS) to lists of test cases (within that test_module) that would be run when # options.run_specified_test_cases is enabled. 
# For example: @@ -961,6 +966,11 @@ def get_selected_tests(options): selected_tests = exclude_tests(DISTRIBUTED_TESTS, selected_tests, "PyTorch is built without distributed support.") + # skip tests that require LAPACK when it's not available + if not torch._C.has_lapack: + selected_tests = exclude_tests(TESTS_REQUIRING_LAPACK, selected_tests, + "PyTorch is built without LAPACK support.") + return selected_tests From 5dd07324578f5110a2ec5c213fb559bc49004c7a Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Tue, 15 Feb 2022 08:47:06 -0800 Subject: [PATCH 047/199] [ZeRO] Add ctor support for multiple param groups (#72578) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72578 **Overview** This adds `ZeroRedundancyOptimizer` constructor support for multiple parameter groups (i.e. passing an `iterable` of `dict`s instead of an `iterable` of `torch.Tensor` as the `parameters` argument) to mirror the API for non-sharded optimizers. Fixes https://github.com/pytorch/pytorch/issues/71347 and https://github.com/pytorch/pytorch/issues/59973. This modifies `test_collect_shards()` to skip if ROCm. **Test Plan** I adjusted the existing constructor test, and I added a test for parity between constructing with two parameter groups up front versus constructor with one parameter group and adding the second parameter group after (via `add_param_group()`) versus a non-sharded optimizer. Test Plan: Imported from OSS Reviewed By: rohan-varma Differential Revision: D34106940 Pulled By: awgu fbshipit-source-id: 7e70fc0b3cec891646e0698eaedf02ff4354c128 (cherry picked from commit 40f2d45172ba3286b64000a466e42c055cca8ddc) --- .../optim/test_zero_redundancy_optimizer.py | 146 +++++++++++++++--- .../optim/zero_redundancy_optimizer.py | 72 ++++++--- 2 files changed, 174 insertions(+), 44 deletions(-) diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py index de8ea511b63..bd819cf2a5c 100644 --- a/test/distributed/optim/test_zero_redundancy_optimizer.py +++ b/test/distributed/optim/test_zero_redundancy_optimizer.py @@ -33,7 +33,7 @@ from torch.distributed.algorithms.join import Join, Joinable, JoinHook from torch.distributed.optim import ZeroRedundancyOptimizer from torch.distributed.optim.zero_redundancy_optimizer import _broadcast_object from torch.nn.parallel import DistributedDataParallel as DDP -from torch.optim import SGD +from torch.optim import SGD, AdamW from torch.testing._internal import common_distributed, common_utils from torch.testing._internal.common_utils import ( TEST_WITH_ASAN, @@ -247,30 +247,6 @@ class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer): self.assertFalse(m.weight.grad) self.assertFalse(m.bias.grad) - def test_constructor(self): - """Check the robustness of the ZeroRedundancyOptimizer constructor by - passing different values for `params`""" - self.dist_init(self.rank) - - m = torch.nn.Linear(1, 1) - # (input, expected error) - inputs = [ - ([], ValueError), # empty parameter list - (torch.randn(1), TypeError), # non-iterable: `torch.Tensor` - (1.2, TypeError), # non-iterable: `float` - ([{"params": m.parameters()}], TypeError), # iterable of dict - (list(m.parameters()) + [42], TypeError), # iterable containing non-`torch.Tensor` - (m.parameters(), None), # `params` as a generator - (list(m.parameters()), None) # `params` as a list - ] - - for input, error in inputs: - if (error): - with self.assertRaises(error): - ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) 
- else: - ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) - def test_same_dense_param_type(self): """Check that ZeroRedundancyOptimizer raises an exception if the input parameters include sparse tensors or different dense types. @@ -296,6 +272,58 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer): def world_size(self): return min(4, max(2, torch.cuda.device_count())) + def test_constructor(self): + """Check the robustness of the ZeroRedundancyOptimizer constructor by + passing different values for the ``params`` argument.""" + self.dist_init(self.rank) + + m = torch.nn.Sequential( + torch.nn.Linear(5, 10), + torch.nn.Linear(10, 10), + torch.nn.Linear(10, 10), + ).to(self.device) + + # Test various constructor inputs in the form: (input, expected error) + ctor_inputs = [ + ([], ValueError), # empty parameter list + (torch.randn(1), TypeError), # non-iterable: `torch.Tensor` + (1.2, TypeError), # non-iterable: `float` + ([ + {"params": [l.weight for l in m]}, + {"params": [l.bias for l in m]}, + ], None), # iterable of dict + (list(m.parameters()) + [42], TypeError), # iterable containing invalid type + (m.parameters(), None), # `params` as a generator + (list(m.parameters()), None) # `params` as a list + ] + + for ctor_input, error in ctor_inputs: + if error: + with self.assertRaises(error): + ZeroRedundancyOptimizer(ctor_input, optimizer_class=SGD, lr=0.01) + else: + ZeroRedundancyOptimizer(ctor_input, optimizer_class=SGD, lr=0.01) + + # Test constructing with multiple parameter groups more thoroughly + weight_decay = 0.01 + lr = 0.01 + betas = (0.9, 0.999) + eps = 1e-8 + params = [ + {"params": [l.weight for l in m], "weight_decay": 0.}, + {"params": [l.bias for l in m], "weight_decay": weight_decay}, + ] + o = ZeroRedundancyOptimizer( + params, optimizer_class=AdamW, + lr=lr, betas=betas, eps=eps, + ) + assert len(o.param_groups) == 2, \ + f"Expected 2 ZeRO param groups, but got {len(o.param_groups)}" + assert len(o.optim.param_groups) == 2, \ + "Expected 2 local optimizer param groups, but got " \ + f"{len(o.optim.param_groups)}" + + @common_distributed.skip_if_rocm def test_step(self): """ Check that the ZeroRedundancyOptimizer wrapper properly exposes the `.step()` interface""" @@ -459,7 +487,75 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer): all_trainable() some_trainable() + def test_multiple_param_groups(self): + """ + Tests parity between constructing ZeRO with multiple parameter groups + upfront versus adding parameter groups to ZeRO after construction + versus a non-sharded optimizer. 
+ """ + self.dist_init(self.rank) + + model1 = torch.nn.Sequential( + torch.nn.Linear(5, 10), + torch.nn.Linear(10, 10), + torch.nn.Linear(10, 5), + ) + model2 = copy.deepcopy(model1) + model3 = copy.deepcopy(model1) + model1 = model1.to(self.device) + model2 = model2.to(self.device) + model3 = model3.to(self.device) + + batch_size = 8 + num_iters = 3 + inputs = [ + torch.randn(batch_size, 5).to(self.device) for _ in range(num_iters) + ] + wd = 0.01 + lr = 0.01 + # Construct `optim1` with both parameter groups upfront + optim1 = ZeroRedundancyOptimizer( + [ + {"params": [l.weight for l in model1], "weight_decay": 0.}, + {"params": [l.bias for l in model1], "weight_decay": wd}, + ], + optimizer_class=AdamW, lr=lr, + ) + # Construct `optim2` by adding the second parameter after + optim2 = ZeroRedundancyOptimizer( + [l.weight for l in model2], + optimizer_class=AdamW, lr=lr, weight_decay=0., + ) + optim2.add_param_group( + {"params": [l.bias for l in model2], "weight_decay": wd} + ) + # Construct `optim3` as a non-sharded optimizer + optim3 = AdamW( + [ + {"params": [l.weight for l in model3], "weight_decay": 0.}, + {"params": [l.bias for l in model3], "weight_decay": wd}, + ], lr=lr, + ) + + # Check parity over a few iterations + for iter in range(num_iters): + for model, optim in ( + (model1, optim1), (model2, optim2), (model3, optim3), + ): + optim.zero_grad() + out = model(inputs[iter]) + loss = out.sum() + loss.backward() + optim.step() + + for layer1, layer2, layer3 in zip(model1, model2, model3): + assert torch.allclose(layer1.weight, layer2.weight) + assert torch.allclose(layer1.weight, layer3.weight) + assert torch.allclose(layer1.bias, layer2.bias) + assert torch.allclose(layer1.bias, layer3.bias) + @common_distributed.skip_if_lt_x_gpu(2) + @common_distributed.skip_if_rocm def test_collect_shards(self): """ Check the state consolidation mechanism, and the state dict exposed by ZeroRedundancyOptimizer""" self.dist_init(self.rank) diff --git a/torch/distributed/optim/zero_redundancy_optimizer.py b/torch/distributed/optim/zero_redundancy_optimizer.py index 70779eac3f1..a87bfdaf5fd 100644 --- a/torch/distributed/optim/zero_redundancy_optimizer.py +++ b/torch/distributed/optim/zero_redundancy_optimizer.py @@ -10,7 +10,16 @@ import inspect import io import logging from itertools import chain -from typing import Any, Callable, Dict, List, Optional, Set, Type +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Set, + Type, + Union, +) import torch import torch.distributed as dist @@ -287,7 +296,8 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable): Arguments: params (``Iterable``): an ``Iterable`` of :class:`torch.Tensor` s - giving all parameters, which will be sharded across ranks. + or :class:`dict` s giving all parameters, which will be sharded + across ranks. Keyword Args: optimizer_class (:class:`torch.nn.Optimizer`): the class of the local @@ -364,7 +374,7 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable): **defaults: Any, ): # Perform type and assumption checks on the input parameters - self._verify_and_init_params(params) + params = self._verify_and_init_params(params) self._verify_same_dense_param_type() # NOTE: The parent constructor uses `add_param_group()` which is @@ -373,7 +383,7 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable): # between the parent and child. 
self.initialized = False - Optimizer.__init__(self, self._all_params, defaults) + Optimizer.__init__(self, params, defaults) Joinable.__init__(self) # Now, all parameters are held in both `self._all_params` and # `self.param_groups` @@ -1289,36 +1299,60 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable): offset = offset_next bucket_assignment.tensor = tensor - def _verify_and_init_params(self, params: Any) -> None: + def _verify_and_init_params( + self, params: Any, + ) -> Union[List[torch.Tensor], List[dict]]: r""" Verifies the type of ``params`` and initializes ``self._all_params`` - if ``params`` is valid. + as a :class:`list` of all parameters if ``params`` is valid. - While :class:`optim.Optimizer ` allows - ``params`` to be an iterable of :class:`dict` s, currently - ``ZeroRedundancyOptimizer`` strictly requires ``params`` to be an - iterable of :class:`torch.Tensor` s. + Arguments: + params (Any): Candidate parameter list or parameter groups to + verify. Raises: TypeError: ``params`` has an invalid type. ValueError: ``params`` is empty. + + Returns: + The persistent form of ``params`` to be passed into the parent + :class:`Optimizer` constructor -- i.e. returns ``params`` as a + :class:`list` to ensure that it can be iterated over again. """ if isinstance(params, torch.Tensor): - raise TypeError("params argument should be an iterable of " + raise TypeError("`params` argument should be an iterable of " f"Tensors, but got {torch.typename(params)}") try: - self._all_params = list(params) + all_params = list(params) except TypeError: - raise TypeError("params argument should be an iterable of " + raise TypeError("`params` argument should be an iterable of " f"Tensors, but got {torch.typename(params)}") - if len(self._all_params) == 0: + if len(all_params) == 0: raise ValueError("ZeroRedundancyOptimizer got an empty parameter " "list") - for param in self._all_params: - if not isinstance(param, torch.Tensor): - raise TypeError("params argument should be an iterable of " - "Tensors, but got an iterable containing " - f"{torch.typename(param)}") + all_tensors = True + all_dicts = True + for param in all_params: + all_tensors &= isinstance(param, torch.Tensor) + all_dicts &= isinstance(param, dict) + if not all_tensors and not all_dicts: + raise TypeError("`params` argument should be an iterable of " + "Tensors or dicts") + # Ensure that `self._all_params` contains a list of all parameters + if all_tensors: + self._all_params = all_params + elif all_dicts: + self._all_params = [] + # `all_params` contains parameter groups (not parameters) + for param_group in all_params: + if "params" not in param_group: + raise ValueError( + "Each parameter group passed-in via `params` must " + "have a 'params' key mapping to the parameters in " + "the group" + ) + self._all_params.extend(param_group["params"]) + return all_params def _verify_same_dense_param_type(self) -> None: r""" From 5f9590681ddf7536db4a32695f114d9c2df4f0b1 Mon Sep 17 00:00:00 2001 From: Mikayla Gawarecki Date: Tue, 15 Feb 2022 09:52:30 -0800 Subject: [PATCH 048/199] Optim foreach cleanup for Adam (#70295) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/70295 Test Plan: Imported from OSS Reviewed By: anjali411 Differential Revision: D33767870 Pulled By: mikaylagawarecki fbshipit-source-id: f922f15ecb0307458c8ecee737325c42c4f3ce8b (cherry picked from commit 66233a8a3eaa073acdaeaa16ca83413da8a2d969) --- torch/distributed/optim/functional_adam.py | 8 +- torch/optim/_functional.py | 54 +----- 
torch/optim/_multi_tensor/__init__.py | 3 +- torch/optim/_multi_tensor/__init__.pyi | 2 +- torch/optim/_multi_tensor/adam.py | 154 ---------------- torch/optim/_multi_tensor/adam.pyi | 5 - torch/optim/adam.py | 196 +++++++++++++++++++-- 7 files changed, 188 insertions(+), 234 deletions(-) delete mode 100644 torch/optim/_multi_tensor/adam.py delete mode 100644 torch/optim/_multi_tensor/adam.pyi diff --git a/torch/distributed/optim/functional_adam.py b/torch/distributed/optim/functional_adam.py index 690b167967f..d0d2a7df06b 100644 --- a/torch/distributed/optim/functional_adam.py +++ b/torch/distributed/optim/functional_adam.py @@ -24,6 +24,7 @@ class _FunctionalAdam(object): weight_decay: float = 0.0, amsgrad: bool = False, maximize: bool = False, + foreach: bool = False, _allow_empty_param_list: bool = False, ): if not 0.0 <= lr: @@ -46,6 +47,7 @@ class _FunctionalAdam(object): } self.amsgrad = amsgrad self.maximize = maximize + self.foreach = foreach self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {}) if len(params) == 0 and not _allow_empty_param_list: @@ -100,7 +102,8 @@ class _FunctionalAdam(object): beta2=self.defaults['beta2'], lr=self.defaults['lr'], weight_decay=self.defaults['weight_decay'], - eps=self.defaults['eps']) + eps=self.defaults['eps'], + foreach=self.foreach) def step(self, gradients: List[Optional[Tensor]]): params = self.param_group['params'] @@ -158,4 +161,5 @@ class _FunctionalAdam(object): beta2=self.defaults['beta2'], lr=self.defaults['lr'], weight_decay=self.defaults['weight_decay'], - eps=self.defaults['eps']) + eps=self.defaults['eps'], + foreach=self.foreach) diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py index b7637464761..4dc6ba22dca 100644 --- a/torch/optim/_functional.py +++ b/torch/optim/_functional.py @@ -6,6 +6,7 @@ from typing import List, Optional from .adadelta import adadelta # type: ignore[attr-defined] # noqa: F401 from .adagrad import adagrad, _make_sparse # type: ignore[attr-defined] # noqa: F401 +from .adam import adam # type: ignore[attr-defined] # noqa: F401 from .adamax import adamax # type: ignore[attr-defined] # noqa: F401 from .asgd import asgd # type: ignore[attr-defined] # noqa: F401 from .nadam import nadam # type: ignore[attr-defined] # noqa: F401 @@ -14,59 +15,6 @@ from .radam import radam # type: ignore[attr-defined] # noqa: F401 # TODO: use foreach API in optim._functional to do all the computation -def adam(params: List[Tensor], - grads: List[Tensor], - exp_avgs: List[Tensor], - exp_avg_sqs: List[Tensor], - max_exp_avg_sqs: List[Tensor], - state_steps: List[Tensor], - *, - amsgrad: bool, - beta1: float, - beta2: float, - lr: float, - weight_decay: float, - eps: float, - maximize: bool): - r"""Functional API that performs Adam algorithm computation. - See :class:`~torch.optim.Adam` for details. 
- """ - - if not all([isinstance(t, torch.Tensor) for t in state_steps]): - raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors") - - for i, param in enumerate(params): - - grad = grads[i] if not maximize else -grads[i] - exp_avg = exp_avgs[i] - exp_avg_sq = exp_avg_sqs[i] - step_t = state_steps[i] - # update step - step_t += 1 - step = step_t.item() - - bias_correction1 = 1 - beta1 ** step - bias_correction2 = 1 - beta2 ** step - - if weight_decay != 0: - grad = grad.add(param, alpha=weight_decay) - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2) - if amsgrad: - # Maintains the maximum of all 2nd moment running avg. till now - torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i]) - # Use the max. for normalizing running avg. of gradient - denom = (max_exp_avg_sqs[i].sqrt() / math.sqrt(bias_correction2)).add_(eps) - else: - denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) - - - - step_size = lr / bias_correction1 - param.addcdiv_(exp_avg, denom, value=-step_size) - def adamw(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], diff --git a/torch/optim/_multi_tensor/__init__.py b/torch/optim/_multi_tensor/__init__.py index 75661046b97..fb065119d48 100644 --- a/torch/optim/_multi_tensor/__init__.py +++ b/torch/optim/_multi_tensor/__init__.py @@ -7,7 +7,7 @@ future. from functools import partial from torch import optim -from .adam import Adam +Adam = partial(optim.Adam, foreach=True) from .adamw import AdamW NAdam = partial(optim.NAdam, foreach=True) from .sgd import SGD @@ -19,7 +19,6 @@ Adamax = partial(optim.Adamax, foreach=True) Adadelta = partial(optim.Adadelta, foreach=True) Adagrad = partial(optim.Adagrad, foreach=True) -del adam del adamw del sgd del rmsprop diff --git a/torch/optim/_multi_tensor/__init__.pyi b/torch/optim/_multi_tensor/__init__.pyi index 516c334e445..d42abbcea2f 100644 --- a/torch/optim/_multi_tensor/__init__.pyi +++ b/torch/optim/_multi_tensor/__init__.pyi @@ -1,7 +1,7 @@ from functools import partial from torch import optim -from .adam import Adam as Adam +Adam = partial(optim.Adam, foreach=True) from .adamw import AdamW as AdamW NAdam = partial(optim.NAdam, foreach=True) from .sgd import SGD as SGD diff --git a/torch/optim/_multi_tensor/adam.py b/torch/optim/_multi_tensor/adam.py deleted file mode 100644 index 30f17baf482..00000000000 --- a/torch/optim/_multi_tensor/adam.py +++ /dev/null @@ -1,154 +0,0 @@ -import math -import torch -from ..optimizer import Optimizer - -class Adam(Optimizer): - r"""Implements Adam algorithm with multi tensor APIs. - - It has been proposed in `Adam: A Method for Stochastic Optimization`_. - The implementation of the L2 penalty follows changes proposed in - `Decoupled Weight Decay Regularization`_. 
- - Args: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _Decoupled Weight Decay Regularization: - https://arxiv.org/abs/1711.05101 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0, amsgrad=False, *, maximize: bool = False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - if not 0.0 <= weight_decay: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad, - maximize=maximize, foreach=True) - super(Adam, self).__init__(params, defaults) - - def __setstate__(self, state): - super(Adam, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('amsgrad', False) - group.setdefault('maximize', False) - state_values = list(self.state.values()) - step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step']) - if not step_is_tensor: - for s in state_values: - s['step'] = torch.tensor(float(s['step'])) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - - Args: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - amsgrad = group['amsgrad'] - - grads = [] - state_steps = [] - exp_avg = [] - exp_avg_sq = [] - max_exp_avg_sq = [] - params_with_grad = [] - - - for p in group['params']: - if p.grad is not None: - if p.grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - params_with_grad.append(p) - grads.append(p.grad) - - if group['maximize']: - grads = torch._foreach_neg(tuple(grads)) - - for p in params_with_grad: - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = torch.tensor(0.) - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. 
values - state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) - - exp_avg.append(state['exp_avg']) - exp_avg_sq.append(state['exp_avg_sq']) - - if amsgrad: - max_exp_avg_sq.append(state['max_exp_avg_sq']) - - state_steps.append(state['step']) - - beta1, beta2 = group['betas'] - - # update steps - torch._foreach_add_(state_steps, 1) - - bias_correction1 = [1 - beta1 ** step.item() for step in state_steps] - bias_correction2 = [1 - beta2 ** step.item() for step in state_steps] - if group['weight_decay'] != 0: - grads = torch._foreach_add(grads, params_with_grad, alpha=group['weight_decay']) - - # - # Decay the first and second moment running average coefficient - # - torch._foreach_mul_(exp_avg, beta1) - torch._foreach_add_(exp_avg, grads, alpha=1 - beta1) - - torch._foreach_mul_(exp_avg_sq, beta2) - torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2) - - if amsgrad: - # Maintains the maximum of all 2nd moment running avg. till now - max_exp_avg_sq = torch._foreach_maximum(max_exp_avg_sq, exp_avg_sq) - - # Use the max. for normalizing running avg. of gradient - max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sq) - bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] - torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt) - denom = torch._foreach_add(max_exp_avg_sq_sqrt, group['eps']) - else: - exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq) - bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] - torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt) - denom = torch._foreach_add(exp_avg_sq_sqrt, group['eps']) - - step_size = [(group['lr'] / bc) * -1 for bc in bias_correction1] - torch._foreach_addcdiv_(params_with_grad, exp_avg, denom, step_size) - - return loss diff --git a/torch/optim/_multi_tensor/adam.pyi b/torch/optim/_multi_tensor/adam.pyi deleted file mode 100644 index 09f29597fd1..00000000000 --- a/torch/optim/_multi_tensor/adam.pyi +++ /dev/null @@ -1,5 +0,0 @@ -from typing import Tuple -from ..optimizer import _params_t, Optimizer - -class Adam(Optimizer): - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ...) -> None: ... diff --git a/torch/optim/adam.py b/torch/optim/adam.py index f69eef21ad2..09c5c6b56c9 100644 --- a/torch/optim/adam.py +++ b/torch/optim/adam.py @@ -1,6 +1,8 @@ +import math import torch -from . 
import _functional as F +from torch import Tensor from .optimizer import Optimizer +from typing import List, Optional class Adam(Optimizer): @@ -55,6 +57,8 @@ class Adam(Optimizer): amsgrad (boolean, optional): whether to use the AMSGrad variant of this algorithm from the paper `On the Convergence of Adam and Beyond`_ (default: False) + foreach (bool, optional): whether foreach implementation of optimizer + is used (default: None) maximize (bool, optional): maximize the params based on the objective, instead of minimizing (default: False) @@ -65,7 +69,8 @@ class Adam(Optimizer): """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0, amsgrad=False, *, maximize: bool = False): + weight_decay=0, amsgrad=False, *, foreach: Optional[bool] = None, + maximize: bool = False): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: @@ -77,14 +82,16 @@ class Adam(Optimizer): if not 0.0 <= weight_decay: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad, maximize=maximize) + weight_decay=weight_decay, amsgrad=amsgrad, + maximize=maximize, foreach=foreach) super(Adam, self).__init__(params, defaults) def __setstate__(self, state): - super(Adam, self).__setstate__(state) + super().__setstate__(state) for group in self.param_groups: group.setdefault('amsgrad', False) group.setdefault('maximize', False) + group.setdefault('foreach', None) state_values = list(self.state.values()) step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step']) if not step_is_tensor: @@ -140,17 +147,172 @@ class Adam(Optimizer): state_steps.append(state['step']) - F.adam(params_with_grad, - grads, - exp_avgs, - exp_avg_sqs, - max_exp_avg_sqs, - state_steps, - amsgrad=group['amsgrad'], - beta1=beta1, - beta2=beta2, - lr=group['lr'], - weight_decay=group['weight_decay'], - eps=group['eps'], - maximize=group['maximize']) + adam(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=group['amsgrad'], + beta1=beta1, + beta2=beta2, + lr=group['lr'], + weight_decay=group['weight_decay'], + eps=group['eps'], + maximize=group['maximize'], + foreach=group['foreach']) + return loss + + +def adam(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + max_exp_avg_sqs: List[Tensor], + state_steps: List[Tensor], + # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 + # setting this as kwarg for now as functional API is compiled by torch/distributed/optim + foreach: bool = None, + *, + amsgrad: bool, + beta1: float, + beta2: float, + lr: float, + weight_decay: float, + eps: float, + maximize: bool): + r"""Functional API that performs Adam algorithm computation. + See :class:`~torch.optim.Adam` for details. 
+ """ + + if not all([isinstance(t, torch.Tensor) for t in state_steps]): + raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors") + + if foreach is None: + # Placeholder for more complex foreach logic to be added when value is not set + foreach = False + + if foreach and torch.jit.is_scripting(): + raise RuntimeError('torch.jit.script not supported with foreach optimizers') + + if foreach and not torch.jit.is_scripting(): + func = _multi_tensor_adam + else: + func = _single_tensor_adam + + func(params, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=amsgrad, + beta1=beta1, + beta2=beta2, + lr=lr, + weight_decay=weight_decay, + eps=eps, + maximize=maximize) + + +def _single_tensor_adam(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + max_exp_avg_sqs: List[Tensor], + state_steps: List[Tensor], + *, + amsgrad: bool, + beta1: float, + beta2: float, + lr: float, + weight_decay: float, + eps: float, + maximize: bool): + + for i, param in enumerate(params): + + grad = grads[i] if not maximize else -grads[i] + exp_avg = exp_avgs[i] + exp_avg_sq = exp_avg_sqs[i] + step_t = state_steps[i] + # update step + step_t += 1 + step = step_t.item() + + bias_correction1 = 1 - beta1 ** step + bias_correction2 = 1 - beta2 ** step + + if weight_decay != 0: + grad = grad.add(param, alpha=weight_decay) + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i]) + # Use the max. for normalizing running avg. of gradient + denom = (max_exp_avg_sqs[i].sqrt() / math.sqrt(bias_correction2)).add_(eps) + else: + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) + + + + step_size = lr / bias_correction1 + param.addcdiv_(exp_avg, denom, value=-step_size) + + +def _multi_tensor_adam(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + max_exp_avg_sqs: List[Tensor], + state_steps: List[Tensor], + *, + amsgrad: bool, + beta1: float, + beta2: float, + lr: float, + weight_decay: float, + eps: float, + maximize: bool): + + if len(params) == 0: + return + + # update steps + torch._foreach_add_(state_steps, 1) + + if maximize: + grads = torch._foreach_neg(tuple(grads)) # type: ignore[assignment] + + bias_correction1 = [1 - beta1 ** step.item() for step in state_steps] + bias_correction2 = [1 - beta2 ** step.item() for step in state_steps] + if weight_decay != 0: + torch._foreach_add_(grads, params, alpha=weight_decay) + + torch._foreach_mul_(exp_avgs, beta1) + torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1) + + torch._foreach_mul_(exp_avg_sqs, beta2) + torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2) + + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs) # type: ignore[assignment] + + # Use the max. for normalizing running avg. 
of gradient + max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs) + bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] + torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt) + denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps) + else: + exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs) + bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] + torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt) + denom = torch._foreach_add(exp_avg_sq_sqrt, eps) + + step_size = [(lr / bc) * -1 for bc in bias_correction1] + torch._foreach_addcdiv_(params, exp_avgs, denom, step_size) From 2cb03e926f013493cb3986bb1c9446594b602385 Mon Sep 17 00:00:00 2001 From: Mikayla Gawarecki Date: Tue, 15 Feb 2022 09:52:30 -0800 Subject: [PATCH 049/199] Optim foreach cleanup for SGD (#70481) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/70481 Test Plan: Imported from OSS Reviewed By: anjali411 Differential Revision: D33767868 Pulled By: mikaylagawarecki fbshipit-source-id: 89b9227a4ddf99602855973cbc343c58ae3d5328 (cherry picked from commit ffea8ddcfd39f3f33e18d1c7b2b903d5464d5eb9) --- torch/distributed/optim/functional_sgd.py | 13 ++ torch/optim/_functional.py | 42 +---- torch/optim/_multi_tensor/__init__.py | 3 +- torch/optim/_multi_tensor/__init__.pyi | 2 +- torch/optim/_multi_tensor/sgd.py | 156 ------------------- torch/optim/_multi_tensor/sgd.pyi | 4 - torch/optim/sgd.py | 180 +++++++++++++++++++--- 7 files changed, 178 insertions(+), 222 deletions(-) delete mode 100644 torch/optim/_multi_tensor/sgd.py delete mode 100644 torch/optim/_multi_tensor/sgd.pyi diff --git a/torch/distributed/optim/functional_sgd.py b/torch/distributed/optim/functional_sgd.py index 73d72febcce..57cf724ad07 100644 --- a/torch/distributed/optim/functional_sgd.py +++ b/torch/distributed/optim/functional_sgd.py @@ -24,6 +24,7 @@ class _FunctionalSGD(object): weight_decay: float = 0.0, nesterov: bool = False, maximize: bool = False, + foreach: bool = False, _allow_empty_param_list: bool = False, ): self.defaults = { @@ -34,6 +35,7 @@ class _FunctionalSGD(object): } self.nesterov = nesterov self.maximize = maximize + self.foreach = foreach self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {}) if len(params) == 0 and not _allow_empty_param_list: @@ -56,8 +58,12 @@ class _FunctionalSGD(object): params = [param] momentum_buffer_list: List[Optional[Tensor]] = [] grads = [] + + has_sparse_grad = False if grad is not None: grads.append(grad) + if grad.is_sparse: + has_sparse_grad = True if param not in self.state: self.state[param] = {} state = self.state[param] @@ -77,6 +83,8 @@ class _FunctionalSGD(object): dampening=dampening, nesterov=self.nesterov, maximize=self.maximize, + has_sparse_grad=has_sparse_grad, + foreach=self.foreach, ) # update momentum_buffer in state state = self.state[param] @@ -101,10 +109,13 @@ class _FunctionalSGD(object): + f"Gradients length: {len(gradients)}" ) + has_sparse_grad = False for param, gradient in zip(params, gradients): if gradient is not None: params_with_grad.append(param) grads.append(gradient) + if gradient.is_sparse: + has_sparse_grad = True if param not in self.state: self.state[param] = {} @@ -125,6 +136,8 @@ class _FunctionalSGD(object): dampening=dampening, nesterov=self.nesterov, maximize=self.maximize, + has_sparse_grad=has_sparse_grad, + foreach=self.foreach, ) # update momentum_buffers in state diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py index 4dc6ba22dca..8bf6a46b90f 100644 
--- a/torch/optim/_functional.py +++ b/torch/optim/_functional.py @@ -2,7 +2,7 @@ r"""Functional interface""" import math import torch from torch import Tensor -from typing import List, Optional +from typing import List from .adadelta import adadelta # type: ignore[attr-defined] # noqa: F401 from .adagrad import adagrad, _make_sparse # type: ignore[attr-defined] # noqa: F401 @@ -11,6 +11,7 @@ from .adamax import adamax # type: ignore[attr-defined] # noqa: F401 from .asgd import asgd # type: ignore[attr-defined] # noqa: F401 from .nadam import nadam # type: ignore[attr-defined] # noqa: F401 from .radam import radam # type: ignore[attr-defined] # noqa: F401 +from .sgd import sgd # type: ignore[attr-defined] # noqa: F401 # TODO: use foreach API in optim._functional to do all the computation @@ -68,45 +69,6 @@ def adamw(params: List[Tensor], param.addcdiv_(exp_avg, denom, value=-step_size) -def sgd(params: List[Tensor], - d_p_list: List[Tensor], - momentum_buffer_list: List[Optional[Tensor]], - *, - weight_decay: float, - momentum: float, - lr: float, - dampening: float, - nesterov: bool, - maximize: bool): - r"""Functional API that performs SGD algorithm computation. - - See :class:`~torch.optim.SGD` for details. - """ - - for i, param in enumerate(params): - - d_p = d_p_list[i] - if weight_decay != 0: - d_p = d_p.add(param, alpha=weight_decay) - - if momentum != 0: - buf = momentum_buffer_list[i] - - if buf is None: - buf = torch.clone(d_p).detach() - momentum_buffer_list[i] = buf - else: - buf.mul_(momentum).add_(d_p, alpha=1 - dampening) - - if nesterov: - d_p = d_p.add(buf, alpha=momentum) - else: - d_p = buf - - alpha = lr if maximize else -lr - param.add_(d_p, alpha=alpha) - - def rmsprop(params: List[Tensor], grads: List[Tensor], square_avgs: List[Tensor], diff --git a/torch/optim/_multi_tensor/__init__.py b/torch/optim/_multi_tensor/__init__.py index fb065119d48..f4486bc072c 100644 --- a/torch/optim/_multi_tensor/__init__.py +++ b/torch/optim/_multi_tensor/__init__.py @@ -10,7 +10,7 @@ from torch import optim Adam = partial(optim.Adam, foreach=True) from .adamw import AdamW NAdam = partial(optim.NAdam, foreach=True) -from .sgd import SGD +SGD = partial(optim.SGD, foreach=True) RAdam = partial(optim.RAdam, foreach=True) from .rmsprop import RMSprop from .rprop import Rprop @@ -20,6 +20,5 @@ Adadelta = partial(optim.Adadelta, foreach=True) Adagrad = partial(optim.Adagrad, foreach=True) del adamw -del sgd del rmsprop del rprop diff --git a/torch/optim/_multi_tensor/__init__.pyi b/torch/optim/_multi_tensor/__init__.pyi index d42abbcea2f..354087ef959 100644 --- a/torch/optim/_multi_tensor/__init__.pyi +++ b/torch/optim/_multi_tensor/__init__.pyi @@ -4,7 +4,7 @@ from torch import optim Adam = partial(optim.Adam, foreach=True) from .adamw import AdamW as AdamW NAdam = partial(optim.NAdam, foreach=True) -from .sgd import SGD as SGD +SGD = partial(optim.SGD, foreach=True) RAdam = partial(optim.RAdam, foreach=True) from .rmsprop import RMSprop as RMSprop from .rprop import Rprop as Rprop diff --git a/torch/optim/_multi_tensor/sgd.py b/torch/optim/_multi_tensor/sgd.py deleted file mode 100644 index 5f5697560d0..00000000000 --- a/torch/optim/_multi_tensor/sgd.py +++ /dev/null @@ -1,156 +0,0 @@ -import torch -from ..optimizer import Optimizer, required - -class SGD(Optimizer): - r"""Implements stochastic gradient descent (optionally with momentum). - - Nesterov momentum is based on the formula from - `On the importance of initialization and momentum in deep learning`__. 
- - Args: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float): learning rate - momentum (float, optional): momentum factor (default: 0) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - dampening (float, optional): dampening for momentum (default: 0) - nesterov (bool, optional): enables Nesterov momentum (default: False) - - Example: - >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) - >>> optimizer.zero_grad() - >>> loss_fn(model(input), target).backward() - >>> optimizer.step() - - __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf - - .. note:: - The implementation of SGD with Momentum/Nesterov subtly differs from - Sutskever et. al. and implementations in some other frameworks. - - Considering the specific case of Momentum, the update can be written as - - .. math:: - \begin{aligned} - v_{t+1} & = \mu * v_{t} + g_{t+1}, \\ - p_{t+1} & = p_{t} - \text{lr} * v_{t+1}, - \end{aligned} - - where :math:`p`, :math:`g`, :math:`v` and :math:`\mu` denote the - parameters, gradient, velocity, and momentum respectively. - - This is in contrast to Sutskever et. al. and - other frameworks which employ an update of the form - - .. math:: - \begin{aligned} - v_{t+1} & = \mu * v_{t} + \text{lr} * g_{t+1}, \\ - p_{t+1} & = p_{t} - v_{t+1}. - \end{aligned} - - The Nesterov version is analogously modified. - """ - - def __init__(self, params, lr=required, momentum=0, dampening=0, - weight_decay=0, nesterov=False, *, maximize=False): - if lr is not required and lr < 0.0: - raise ValueError("Invalid learning rate: {}".format(lr)) - if momentum < 0.0: - raise ValueError("Invalid momentum value: {}".format(momentum)) - if weight_decay < 0.0: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - - defaults = dict(lr=lr, momentum=momentum, dampening=dampening, - weight_decay=weight_decay, nesterov=nesterov, maximize=maximize, foreach=True) - if nesterov and (momentum <= 0 or dampening != 0): - raise ValueError("Nesterov momentum requires a momentum and zero dampening") - super(SGD, self).__init__(params, defaults) - - def __setstate__(self, state): - super(SGD, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('nesterov', False) - group.setdefault('maximize', False) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - - Args: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - weight_decay = group['weight_decay'] - momentum = group['momentum'] - dampening = group['dampening'] - nesterov = group['nesterov'] - maximize = group['maximize'] - - grads = [] - params_with_grad = [] - states = [] - has_sparse_grad = False - - for p in group['params']: - if p.grad is not None: - grads.append(p.grad) - params_with_grad.append(p) - states.append(self.state[p]) - - if p.grad.is_sparse: - has_sparse_grad = True - - if momentum != 0: - raise RuntimeError('SGD does not support momentum for sparse gradients') - - if grads == []: - return loss - - if weight_decay != 0: - grads = torch._foreach_add(grads, params_with_grad, alpha=weight_decay) - - if momentum != 0: - bufs = [] - - all_states_with_momentum_buffer = True - for i in range(len(states)): - if 'momentum_buffer' not in states[i]: - all_states_with_momentum_buffer = False - break - else: - bufs.append(states[i]['momentum_buffer']) - - if all_states_with_momentum_buffer: - torch._foreach_mul_(bufs, momentum) - torch._foreach_add_(bufs, grads, alpha=1 - dampening) - else: - bufs = [] - for i in range(len(states)): - if 'momentum_buffer' not in states[i]: - buf = states[i]['momentum_buffer'] = torch.clone(grads[i]).detach() - else: - buf = states[i]['momentum_buffer'] - buf.mul_(momentum).add_(grads[i], alpha=1 - dampening) - - bufs.append(buf) - - if nesterov: - torch._foreach_add_(grads, bufs, alpha=momentum) - else: - grads = bufs - - alpha = group['lr'] if maximize else -group['lr'] - if not has_sparse_grad: - torch._foreach_add_(params_with_grad, grads, alpha=alpha) - else: - # foreach APIs dont support sparse - for i in range(len(params_with_grad)): - params_with_grad[i].add_(grads[i], alpha=alpha) - - return loss diff --git a/torch/optim/_multi_tensor/sgd.pyi b/torch/optim/_multi_tensor/sgd.pyi deleted file mode 100644 index 6082e230cd7..00000000000 --- a/torch/optim/_multi_tensor/sgd.pyi +++ /dev/null @@ -1,4 +0,0 @@ -from ..optimizer import _params_t, Optimizer - -class SGD(Optimizer): - def __init__(self, params: _params_t, lr: float, momentum: float=..., dampening: float=..., weight_decay:float=..., nesterov:bool=...) -> None: ... diff --git a/torch/optim/sgd.py b/torch/optim/sgd.py index 8109b9a3747..e3fe05344fe 100644 --- a/torch/optim/sgd.py +++ b/torch/optim/sgd.py @@ -1,6 +1,7 @@ import torch -from . 
import _functional as F +from torch import Tensor from .optimizer import Optimizer, required +from typing import List, Optional class SGD(Optimizer): @@ -49,6 +50,8 @@ class SGD(Optimizer): nesterov (bool, optional): enables Nesterov momentum (default: False) maximize (bool, optional): maximize the params based on the objective, instead of minimizing (default: False) + foreach (bool, optional): whether foreach implementation of optimizer + is used (default: None) Example: >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) @@ -86,7 +89,7 @@ class SGD(Optimizer): """ def __init__(self, params, lr=required, momentum=0, dampening=0, - weight_decay=0, nesterov=False, *, maximize=False): + weight_decay=0, nesterov=False, *, maximize=False, foreach: Optional[bool] = None): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if momentum < 0.0: @@ -95,16 +98,18 @@ class SGD(Optimizer): raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, momentum=momentum, dampening=dampening, - weight_decay=weight_decay, nesterov=nesterov, maximize=maximize) + weight_decay=weight_decay, nesterov=nesterov, + maximize=maximize, foreach=foreach) if nesterov and (momentum <= 0 or dampening != 0): raise ValueError("Nesterov momentum requires a momentum and zero dampening") super(SGD, self).__init__(params, defaults) def __setstate__(self, state): - super(SGD, self).__setstate__(state) + super().__setstate__(state) for group in self.param_groups: group.setdefault('nesterov', False) group.setdefault('maximize', False) + group.setdefault('foreach', None) @torch.no_grad() def step(self, closure=None): @@ -123,17 +128,14 @@ class SGD(Optimizer): params_with_grad = [] d_p_list = [] momentum_buffer_list = [] - weight_decay = group['weight_decay'] - momentum = group['momentum'] - dampening = group['dampening'] - nesterov = group['nesterov'] - maximize = group['maximize'] - lr = group['lr'] + has_sparse_grad = False for p in group['params']: if p.grad is not None: params_with_grad.append(p) d_p_list.append(p.grad) + if p.grad.is_sparse: + has_sparse_grad = True state = self.state[p] if 'momentum_buffer' not in state: @@ -141,15 +143,17 @@ class SGD(Optimizer): else: momentum_buffer_list.append(state['momentum_buffer']) - F.sgd(params_with_grad, - d_p_list, - momentum_buffer_list, - weight_decay=weight_decay, - momentum=momentum, - lr=lr, - dampening=dampening, - nesterov=nesterov, - maximize=maximize,) + sgd(params_with_grad, + d_p_list, + momentum_buffer_list, + weight_decay=group['weight_decay'], + momentum=group['momentum'], + lr=group['lr'], + dampening=group['dampening'], + nesterov=group['nesterov'], + maximize=group['maximize'], + has_sparse_grad=has_sparse_grad, + foreach=group['foreach']) # update momentum_buffers in state for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): @@ -157,3 +161,141 @@ class SGD(Optimizer): state['momentum_buffer'] = momentum_buffer return loss + + +def sgd(params: List[Tensor], + d_p_list: List[Tensor], + momentum_buffer_list: List[Optional[Tensor]], + # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 + # setting this as kwarg for now as functional API is compiled by torch/distributed/optim + has_sparse_grad: bool = None, + foreach: bool = None, + *, + weight_decay: float, + momentum: float, + lr: float, + dampening: float, + nesterov: bool, + maximize: bool): + r"""Functional API that performs SGD algorithm computation. 
+ + See :class:`~torch.optim.SGD` for details. + """ + + if foreach is None: + # Placeholder for more complex foreach logic to be added when value is not set + foreach = False + + if foreach and torch.jit.is_scripting(): + raise RuntimeError('torch.jit.script not supported with foreach optimizers') + + if foreach and not torch.jit.is_scripting(): + func = _multi_tensor_sgd + else: + func = _single_tensor_sgd + + func(params, + d_p_list, + momentum_buffer_list, + weight_decay=weight_decay, + momentum=momentum, + lr=lr, + dampening=dampening, + nesterov=nesterov, + has_sparse_grad=has_sparse_grad, + maximize=maximize) + +def _single_tensor_sgd(params: List[Tensor], + d_p_list: List[Tensor], + momentum_buffer_list: List[Optional[Tensor]], + *, + weight_decay: float, + momentum: float, + lr: float, + dampening: float, + nesterov: bool, + maximize: bool, + has_sparse_grad: bool): + + for i, param in enumerate(params): + + d_p = d_p_list[i] + if weight_decay != 0: + d_p = d_p.add(param, alpha=weight_decay) + + if momentum != 0: + buf = momentum_buffer_list[i] + + if buf is None: + buf = torch.clone(d_p).detach() + momentum_buffer_list[i] = buf + else: + buf.mul_(momentum).add_(d_p, alpha=1 - dampening) + + if nesterov: + d_p = d_p.add(buf, alpha=momentum) + else: + d_p = buf + + alpha = lr if maximize else -lr + param.add_(d_p, alpha=alpha) + + +def _multi_tensor_sgd(params: List[Tensor], + grads: List[Tensor], + momentum_buffer_list: List[Optional[Tensor]], + *, + weight_decay: float, + momentum: float, + lr: float, + dampening: float, + nesterov: bool, + maximize: bool, + has_sparse_grad: bool): + + if len(params) == 0: + return + + if has_sparse_grad is None: + has_sparse_grad = any([grad.is_sparse for grad in grads]) + + if weight_decay != 0: + grads = torch._foreach_add(grads, params, alpha=weight_decay) + + if momentum != 0: + bufs = [] + + all_states_with_momentum_buffer = True + for i in range(len(momentum_buffer_list)): + if momentum_buffer_list[i] is None: + all_states_with_momentum_buffer = False + break + else: + bufs.append(momentum_buffer_list[i]) + + if all_states_with_momentum_buffer: + torch._foreach_mul_(bufs, momentum) + torch._foreach_add_(bufs, grads, alpha=1 - dampening) + else: + bufs = [] + for i in range(len(momentum_buffer_list)): + if momentum_buffer_list[i] is None: + buf = momentum_buffer_list[i] = torch.clone(grads[i]).detach() + else: + buf = momentum_buffer_list[i] + buf.mul_(momentum).add_(grads[i], alpha=1 - dampening) + + bufs.append(buf) + + if nesterov: + torch._foreach_add_(grads, bufs, alpha=momentum) + else: + grads = bufs + + alpha = lr if maximize else -lr + if not has_sparse_grad: + torch._foreach_add_(params, grads, alpha=alpha) + else: + # foreach APIs dont support sparse + for i in range(len(params)): + params[i].add_(grads[i], alpha=alpha) From ce3094f5f61a13191dd01b7a3856c4eab0814027 Mon Sep 17 00:00:00 2001 From: Mikayla Gawarecki Date: Tue, 15 Feb 2022 09:52:30 -0800 Subject: [PATCH 050/199] Optim foreach cleanup for Rmsprop (#70482) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/70482 Test Plan: Imported from OSS Reviewed By: anjali411 Differential Revision: D33767862 Pulled By: mikaylagawarecki fbshipit-source-id: 8e2e9c986d5a3774093a79755940372945f1b3a9 (cherry picked from commit baea53727711fcc083e1c18641afd1e617c24495) --- torch/distributed/optim/functional_rmsprop.py | 5 +- torch/optim/_functional.py | 42 +---- torch/optim/_multi_tensor/__init__.py | 3 +- torch/optim/_multi_tensor/__init__.pyi | 2 +- 
torch/optim/_multi_tensor/rmsprop.py | 123 -------------- torch/optim/_multi_tensor/rmsprop.pyi | 5 - torch/optim/rmsprop.py | 158 ++++++++++++++++-- 7 files changed, 150 insertions(+), 188 deletions(-) delete mode 100644 torch/optim/_multi_tensor/rmsprop.py delete mode 100644 torch/optim/_multi_tensor/rmsprop.pyi diff --git a/torch/distributed/optim/functional_rmsprop.py b/torch/distributed/optim/functional_rmsprop.py index 7c02338ceb5..e628e4855a8 100644 --- a/torch/distributed/optim/functional_rmsprop.py +++ b/torch/distributed/optim/functional_rmsprop.py @@ -24,6 +24,7 @@ class _FunctionalRMSprop(object): weight_decay: float = 0.0, momentum: float = 0.0, centered: bool = False, + foreach: bool = False, _allow_empty_param_list: bool = False, ): self.defaults = { @@ -34,6 +35,7 @@ class _FunctionalRMSprop(object): "momentum": momentum, } self.centered = centered + self.foreach = foreach if len(params) == 0 and not _allow_empty_param_list: raise ValueError("optimizer got an empty parameter list") @@ -99,4 +101,5 @@ class _FunctionalRMSprop(object): eps=eps, weight_decay=weight_decay, momentum=momentum, - centered=self.centered) + centered=self.centered, + foreach=self.foreach) diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py index 8bf6a46b90f..64fd7a44674 100644 --- a/torch/optim/_functional.py +++ b/torch/optim/_functional.py @@ -11,6 +11,7 @@ from .adamax import adamax # type: ignore[attr-defined] # noqa: F401 from .asgd import asgd # type: ignore[attr-defined] # noqa: F401 from .nadam import nadam # type: ignore[attr-defined] # noqa: F401 from .radam import radam # type: ignore[attr-defined] # noqa: F401 +from .rmsprop import rmsprop # type: ignore[attr-defined] # noqa: F401 from .sgd import sgd # type: ignore[attr-defined] # noqa: F401 @@ -69,47 +70,6 @@ def adamw(params: List[Tensor], param.addcdiv_(exp_avg, denom, value=-step_size) -def rmsprop(params: List[Tensor], - grads: List[Tensor], - square_avgs: List[Tensor], - grad_avgs: List[Tensor], - momentum_buffer_list: List[Tensor], - *, - lr: float, - alpha: float, - eps: float, - weight_decay: float, - momentum: float, - centered: bool): - r"""Functional API that performs rmsprop algorithm computation. - - See :class:`~torch.optim.RMSProp` for details. 
- """ - - for i, param in enumerate(params): - grad = grads[i] - square_avg = square_avgs[i] - - if weight_decay != 0: - grad = grad.add(param, alpha=weight_decay) - - square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha) - - if centered: - grad_avg = grad_avgs[i] - grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha) - avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).sqrt_().add_(eps) - else: - avg = square_avg.sqrt().add_(eps) - - if momentum > 0: - buf = momentum_buffer_list[i] - buf.mul_(momentum).addcdiv_(grad, avg) - param.add_(buf, alpha=-lr) - else: - param.addcdiv_(grad, avg, value=-lr) - - def rprop(params: List[Tensor], grads: List[Tensor], prevs: List[Tensor], diff --git a/torch/optim/_multi_tensor/__init__.py b/torch/optim/_multi_tensor/__init__.py index f4486bc072c..16300a91527 100644 --- a/torch/optim/_multi_tensor/__init__.py +++ b/torch/optim/_multi_tensor/__init__.py @@ -12,7 +12,7 @@ from .adamw import AdamW NAdam = partial(optim.NAdam, foreach=True) SGD = partial(optim.SGD, foreach=True) RAdam = partial(optim.RAdam, foreach=True) -from .rmsprop import RMSprop +RMSprop = partial(optim.RMSprop, foreach=True) from .rprop import Rprop ASGD = partial(optim.ASGD, foreach=True) Adamax = partial(optim.Adamax, foreach=True) @@ -20,5 +20,4 @@ Adadelta = partial(optim.Adadelta, foreach=True) Adagrad = partial(optim.Adagrad, foreach=True) del adamw -del rmsprop del rprop diff --git a/torch/optim/_multi_tensor/__init__.pyi b/torch/optim/_multi_tensor/__init__.pyi index 354087ef959..3d3c6028377 100644 --- a/torch/optim/_multi_tensor/__init__.pyi +++ b/torch/optim/_multi_tensor/__init__.pyi @@ -6,7 +6,7 @@ from .adamw import AdamW as AdamW NAdam = partial(optim.NAdam, foreach=True) SGD = partial(optim.SGD, foreach=True) RAdam = partial(optim.RAdam, foreach=True) -from .rmsprop import RMSprop as RMSprop +RMSprop = partial(optim.RMSprop, foreach=True) from .rprop import Rprop as Rprop ASGD = partial(optim.ASGD, foreach=True) Adamax = partial(optim.Adamax, foreach=True) diff --git a/torch/optim/_multi_tensor/rmsprop.py b/torch/optim/_multi_tensor/rmsprop.py deleted file mode 100644 index b15491907b8..00000000000 --- a/torch/optim/_multi_tensor/rmsprop.py +++ /dev/null @@ -1,123 +0,0 @@ -import torch -from ..optimizer import Optimizer - -class RMSprop(Optimizer): - r"""Implements RMSprop algorithm. - - Proposed by G. Hinton in his - `course `_. - - The centered version first appears in `Generating Sequences - With Recurrent Neural Networks `_. - - The implementation here takes the square root of the gradient average before - adding epsilon (note that TensorFlow interchanges these two operations). The effective - learning rate is thus :math:`\alpha/(\sqrt{v} + \epsilon)` where :math:`\alpha` - is the scheduled learning rate and :math:`v` is the weighted moving average - of the squared gradient. 
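# (Illustrative aside, not part of the patch: a quick numeric check of the
# epsilon ordering described above, under the assumption that lr = 1e-2,
# eps = 1e-8 and the squared-gradient average v is still ~0 early in training.)
#
#   PyTorch order:    denom = sqrt(v) + eps = 1e-8  ->  effective lr = lr / denom = 1e6
#   TensorFlow order: denom = sqrt(v + eps) = 1e-4  ->  effective lr = lr / denom = 1e2
#
# Taking the square root before adding eps therefore yields a much larger step
# while the running average is still near zero.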
- - Args: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-2) - momentum (float, optional): momentum factor (default: 0) - alpha (float, optional): smoothing constant (default: 0.99) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - centered (bool, optional) : if ``True``, compute the centered RMSProp, - the gradient is normalized by an estimation of its variance - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - - """ - - def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= momentum: - raise ValueError("Invalid momentum value: {}".format(momentum)) - if not 0.0 <= weight_decay: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - if not 0.0 <= alpha: - raise ValueError("Invalid alpha value: {}".format(alpha)) - - defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, - weight_decay=weight_decay, foreach=True) - super(RMSprop, self).__init__(params, defaults) - - def __setstate__(self, state): - super(RMSprop, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('momentum', 0) - group.setdefault('centered', False) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - - Args: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - grads = [] - params_with_grad = [] - states = [] - alpha = group['alpha'] - square_avg = [] - - for p in group['params']: - if p.grad is not None: - if p.grad.is_sparse: - raise RuntimeError('RMSprop does not support sparse gradients') - - grads.append(p.grad) - params_with_grad.append(p) - - state = self.state[p] - # State initialization - if len(state) == 0: - state['step'] = 0 - state['square_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) - if group['momentum'] > 0: - state['momentum_buffer'] = torch.zeros_like(p, memory_format=torch.preserve_format) - if group['centered']: - state['grad_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) - - state['step'] += 1 - - states.append(state) - square_avg.append(state['square_avg']) - - if group['weight_decay'] != 0: - torch._foreach_add_(grads, params_with_grad, alpha=group['weight_decay']) - - torch._foreach_mul_(square_avg, alpha) - torch._foreach_addcmul_(square_avg, grads, grads, value=1 - alpha) - - if group['centered']: - grad_avgs = [s['grad_avg'] for s in states] - torch._foreach_mul_(grad_avgs, alpha) - torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha) - avg = torch._foreach_addcmul(square_avg, grad_avgs, grad_avgs, value=-1) - torch._foreach_sqrt_(avg) - torch._foreach_add_(avg, group['eps']) - else: - avg = torch._foreach_sqrt(square_avg) - torch._foreach_add_(avg, group['eps']) - - if group['momentum'] > 0: - buf = [s['momentum_buffer'] for s in states] - torch._foreach_mul_(buf, group['momentum']) - torch._foreach_addcdiv_(buf, grads, avg) - torch._foreach_add_(params_with_grad, buf, alpha=-group['lr']) - else: - torch._foreach_addcdiv_(params_with_grad, grads, avg, value=-group['lr']) - - 
return loss diff --git a/torch/optim/_multi_tensor/rmsprop.pyi b/torch/optim/_multi_tensor/rmsprop.pyi deleted file mode 100644 index 691f2188ebb..00000000000 --- a/torch/optim/_multi_tensor/rmsprop.pyi +++ /dev/null @@ -1,5 +0,0 @@ -from typing import Tuple -from ..optimizer import _params_t, Optimizer - -class RMSprop(Optimizer): - def __init__(self, params: _params_t, lr: float=..., alpha: float=..., eps: float=..., weight_decay: float=..., momentum: float=..., centered: bool=...) -> None: ... diff --git a/torch/optim/rmsprop.py b/torch/optim/rmsprop.py index dc72181b351..313c4e92295 100644 --- a/torch/optim/rmsprop.py +++ b/torch/optim/rmsprop.py @@ -1,6 +1,7 @@ import torch -from . import _functional as F +from torch import Tensor from .optimizer import Optimizer +from typing import List, Optional class RMSprop(Optimizer): @@ -58,10 +59,13 @@ class RMSprop(Optimizer): centered (bool, optional) : if ``True``, compute the centered RMSProp, the gradient is normalized by an estimation of its variance weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + foreach (bool, optional): whether foreach implementation of optimizer + is used (default: None) """ - def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False): + def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, + centered=False, foreach: Optional[bool] = None): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: @@ -73,14 +77,16 @@ class RMSprop(Optimizer): if not 0.0 <= alpha: raise ValueError("Invalid alpha value: {}".format(alpha)) - defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay) + defaults = dict(lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, + weight_decay=weight_decay, foreach=foreach) super(RMSprop, self).__init__(params, defaults) def __setstate__(self, state): - super(RMSprop, self).__setstate__(state) + super().__setstate__(state) for group in self.param_groups: group.setdefault('momentum', 0) group.setdefault('centered', False) + group.setdefault('foreach', None) @torch.no_grad() def step(self, closure=None): @@ -132,16 +138,138 @@ class RMSprop(Optimizer): state['step'] += 1 - F.rmsprop(params_with_grad, - grads, - square_avgs, - grad_avgs, - momentum_buffer_list, - lr=group['lr'], - alpha=group['alpha'], - eps=group['eps'], - weight_decay=group['weight_decay'], - momentum=group['momentum'], - centered=group['centered']) + rmsprop(params_with_grad, + grads, + square_avgs, + grad_avgs, + momentum_buffer_list, + lr=group['lr'], + alpha=group['alpha'], + eps=group['eps'], + weight_decay=group['weight_decay'], + momentum=group['momentum'], + centered=group['centered'], + foreach=group['foreach']) return loss + + +def rmsprop(params: List[Tensor], + grads: List[Tensor], + square_avgs: List[Tensor], + grad_avgs: List[Tensor], + momentum_buffer_list: List[Tensor], + # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 + # setting this as kwarg for now as functional API is compiled by torch/distributed/optim + foreach: bool = None, + *, + lr: float, + alpha: float, + eps: float, + weight_decay: float, + momentum: float, + centered: bool): + r"""Functional API that performs rmsprop algorithm computation. + See :class:`~torch.optim.RMSProp` for details. 
+ """ + + if foreach is None: + # Placeholder for more complex foreach logic to be added when value is not set + foreach = False + + if foreach and torch.jit.is_scripting(): + raise RuntimeError('torch.jit.script not supported with foreach optimizers') + + if foreach and not torch.jit.is_scripting(): + func = _multi_tensor_rmsprop + else: + func = _single_tensor_rmsprop + + func(params, + grads, + square_avgs, + grad_avgs, + momentum_buffer_list, + lr=lr, + alpha=alpha, + eps=eps, + weight_decay=weight_decay, + momentum=momentum, + centered=centered) + + +def _single_tensor_rmsprop(params: List[Tensor], + grads: List[Tensor], + square_avgs: List[Tensor], + grad_avgs: List[Tensor], + momentum_buffer_list: List[Tensor], + *, + lr: float, + alpha: float, + eps: float, + weight_decay: float, + momentum: float, + centered: bool): + + for i, param in enumerate(params): + grad = grads[i] + square_avg = square_avgs[i] + + if weight_decay != 0: + grad = grad.add(param, alpha=weight_decay) + + square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha) + + if centered: + grad_avg = grad_avgs[i] + grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha) + avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).sqrt_().add_(eps) + else: + avg = square_avg.sqrt().add_(eps) + + if momentum > 0: + buf = momentum_buffer_list[i] + buf.mul_(momentum).addcdiv_(grad, avg) + param.add_(buf, alpha=-lr) + else: + param.addcdiv_(grad, avg, value=-lr) + + +def _multi_tensor_rmsprop(params: List[Tensor], + grads: List[Tensor], + square_avgs: List[Tensor], + grad_avgs: List[Tensor], + momentum_buffer_list: List[Tensor], + *, + lr: float, + alpha: float, + eps: float, + weight_decay: float, + momentum: float, + centered: bool): + + if len(params) == 0: + return + + if weight_decay != 0: + torch._foreach_add_(grads, params, alpha=weight_decay) + + torch._foreach_mul_(square_avgs, alpha) + torch._foreach_addcmul_(square_avgs, grads, grads, value=1 - alpha) + + if centered: + torch._foreach_mul_(grad_avgs, alpha) + torch._foreach_add_(grad_avgs, grads, alpha=1 - alpha) + avg = torch._foreach_addcmul(square_avgs, grad_avgs, grad_avgs, value=-1) + torch._foreach_sqrt_(avg) + torch._foreach_add_(avg, eps) + else: + avg = torch._foreach_sqrt(square_avgs) + torch._foreach_add_(avg, eps) + + if momentum > 0: + torch._foreach_mul_(momentum_buffer_list, momentum) + torch._foreach_addcdiv_(momentum_buffer_list, grads, avg) + torch._foreach_add_(params, momentum_buffer_list, alpha=-lr) + else: + torch._foreach_addcdiv_(params, grads, avg, value=-lr) From dff58d519f92d560f71a47577c885858b8d2cea3 Mon Sep 17 00:00:00 2001 From: Mikayla Gawarecki Date: Tue, 15 Feb 2022 09:52:30 -0800 Subject: [PATCH 051/199] Optim foreach cleanup for Rprop (#70483) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/70483 Test Plan: Imported from OSS Reviewed By: anjali411 Differential Revision: D33767866 Pulled By: mikaylagawarecki fbshipit-source-id: ffc5ae68eeea8fa09385862b853b731554b77bcb (cherry picked from commit 3a0fe295807bb4519884a1838edeea1a9d222e41) --- torch/distributed/optim/functional_rprop.py | 5 +- torch/optim/_functional.py | 39 +---- torch/optim/_multi_tensor/__init__.py | 3 +- torch/optim/_multi_tensor/__init__.pyi | 2 +- torch/optim/_multi_tensor/rprop.py | 94 ------------ torch/optim/_multi_tensor/rprop.pyi | 5 - torch/optim/rprop.py | 150 ++++++++++++++++++-- 7 files changed, 143 insertions(+), 155 deletions(-) delete mode 100644 torch/optim/_multi_tensor/rprop.py delete mode 100644 
torch/optim/_multi_tensor/rprop.pyi diff --git a/torch/distributed/optim/functional_rprop.py b/torch/distributed/optim/functional_rprop.py index 3302822688c..ed6ebddc3d2 100644 --- a/torch/distributed/optim/functional_rprop.py +++ b/torch/distributed/optim/functional_rprop.py @@ -21,6 +21,7 @@ class _FunctionalRprop(object): lr: float = 1e-2, etas: Tuple[float, float] = (0.5, 1.2), step_sizes: Tuple[float, float] = (1e-6, 50), + foreach: bool = False, _allow_empty_param_list: bool = False, ): self.defaults = { @@ -28,6 +29,7 @@ class _FunctionalRprop(object): } self.etas = etas self.step_sizes = step_sizes + self.foreach = foreach if len(params) == 0 and not _allow_empty_param_list: raise ValueError("optimizer got an empty parameter list") @@ -81,4 +83,5 @@ class _FunctionalRprop(object): step_size_min=step_size_min, step_size_max=step_size_max, etaminus=etaminus, - etaplus=etaplus) + etaplus=etaplus, + foreach=self.foreach) diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py index 64fd7a44674..da137165754 100644 --- a/torch/optim/_functional.py +++ b/torch/optim/_functional.py @@ -12,6 +12,7 @@ from .asgd import asgd # type: ignore[attr-defined] # noqa: F401 from .nadam import nadam # type: ignore[attr-defined] # noqa: F401 from .radam import radam # type: ignore[attr-defined] # noqa: F401 from .rmsprop import rmsprop # type: ignore[attr-defined] # noqa: F401 +from .rprop import rprop # type: ignore[attr-defined] # noqa: F401 from .sgd import sgd # type: ignore[attr-defined] # noqa: F401 @@ -70,44 +71,6 @@ def adamw(params: List[Tensor], param.addcdiv_(exp_avg, denom, value=-step_size) -def rprop(params: List[Tensor], - grads: List[Tensor], - prevs: List[Tensor], - step_sizes: List[Tensor], - *, - step_size_min: float, - step_size_max: float, - etaminus: float, - etaplus: float): - r"""Functional API that performs rprop algorithm computation. - - See :class:`~torch.optim.Rprop` for details. 
- """ - - for i, param in enumerate(params): - grad = grads[i] - prev = prevs[i] - step_size = step_sizes[i] - - sign = grad.mul(prev).sign() - sign[sign.gt(0)] = etaplus - sign[sign.lt(0)] = etaminus - sign[sign.eq(0)] = 1 - - # update stepsizes with step size updates - step_size.mul_(sign).clamp_(step_size_min, step_size_max) - - # for dir<0, dfdx=0 - # for dir>=0 dfdx=dfdx - grad = grad.clone(memory_format=torch.preserve_format) - grad[sign.eq(etaminus)] = 0 - - # update parameters - param.addcmul_(grad.sign(), step_size, value=-1) - - prev.copy_(grad) - - def sparse_adam(params: List[Tensor], grads: List[Tensor], exp_avgs: List[Tensor], diff --git a/torch/optim/_multi_tensor/__init__.py b/torch/optim/_multi_tensor/__init__.py index 16300a91527..e9e6b13b8da 100644 --- a/torch/optim/_multi_tensor/__init__.py +++ b/torch/optim/_multi_tensor/__init__.py @@ -13,11 +13,10 @@ NAdam = partial(optim.NAdam, foreach=True) SGD = partial(optim.SGD, foreach=True) RAdam = partial(optim.RAdam, foreach=True) RMSprop = partial(optim.RMSprop, foreach=True) -from .rprop import Rprop +Rprop = partial(optim.Rprop, foreach=True) ASGD = partial(optim.ASGD, foreach=True) Adamax = partial(optim.Adamax, foreach=True) Adadelta = partial(optim.Adadelta, foreach=True) Adagrad = partial(optim.Adagrad, foreach=True) del adamw -del rprop diff --git a/torch/optim/_multi_tensor/__init__.pyi b/torch/optim/_multi_tensor/__init__.pyi index 3d3c6028377..812d9fc3416 100644 --- a/torch/optim/_multi_tensor/__init__.pyi +++ b/torch/optim/_multi_tensor/__init__.pyi @@ -7,7 +7,7 @@ NAdam = partial(optim.NAdam, foreach=True) SGD = partial(optim.SGD, foreach=True) RAdam = partial(optim.RAdam, foreach=True) RMSprop = partial(optim.RMSprop, foreach=True) -from .rprop import Rprop as Rprop +Rprop = partial(optim.Rprop, foreach=True) ASGD = partial(optim.ASGD, foreach=True) Adamax = partial(optim.Adamax, foreach=True) Adadelta = partial(optim.Adadelta, foreach=True) diff --git a/torch/optim/_multi_tensor/rprop.py b/torch/optim/_multi_tensor/rprop.py deleted file mode 100644 index 67baf1e3b34..00000000000 --- a/torch/optim/_multi_tensor/rprop.py +++ /dev/null @@ -1,94 +0,0 @@ -import torch -from ..optimizer import Optimizer - -class Rprop(Optimizer): - """Implements the resilient backpropagation algorithm. - - Args: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-2) - etas (Tuple[float, float], optional): pair of (etaminus, etaplis), that - are multiplicative increase and decrease factors - (default: (0.5, 1.2)) - step_sizes (Tuple[float, float], optional): a pair of minimal and - maximal allowed step sizes (default: (1e-6, 50)) - """ - - def __init__(self, params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50)): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 < etas[0] < 1.0 < etas[1]: - raise ValueError("Invalid eta values: {}, {}".format(etas[0], etas[1])) - - defaults = dict(lr=lr, etas=etas, step_sizes=step_sizes, foreach=True) - super(Rprop, self).__init__(params, defaults) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - - Args: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - grads = [] - states = [] - params_with_grad = [] - step_sizes = [] - - for group in self.param_groups: - for p in group['params']: - etaminus, etaplus = group['etas'] - step_size_min, step_size_max = group['step_sizes'] - - if p.grad is not None: - if p.grad.is_sparse: - raise RuntimeError('RMSprop does not support sparse gradients') - - grads.append(p.grad) - params_with_grad.append(p) - - state = self.state[p] - # State initialization - if len(state) == 0: - state['step'] = 0 - state['prev'] = torch.zeros_like(p, memory_format=torch.preserve_format) - state['step_size'] = p.grad.new().resize_as_(p.grad).fill_(group['lr']) - - state['step'] += 1 - - states.append(state) - step_sizes.append(state['step_size']) - - signs = torch._foreach_mul(grads, [s['prev'] for s in states]) - signs = [s.sign() for s in signs] - for sign in signs: - sign[sign.gt(0)] = etaplus - sign[sign.lt(0)] = etaminus - sign[sign.eq(0)] = 1 - - # update stepsizes with step size updates - torch._foreach_mul_(step_sizes, signs) - for step_size in step_sizes: - step_size.clamp_(step_size_min, step_size_max) - - # for dir<0, dfdx=0 - # for dir>=0 dfdx=dfdx - for i in range(len(grads)): - grads[i] = grads[i].clone(memory_format=torch.preserve_format) - grads[i][signs[i].eq(etaminus)] = 0 - - # update parameters - grad_signs = [grad.sign() for grad in grads] - torch._foreach_addcmul_(params_with_grad, grad_signs, step_sizes, value=-1) - - for i in range(len(states)): - states[i]['prev'].copy_(grads[i]) - - return loss diff --git a/torch/optim/_multi_tensor/rprop.pyi b/torch/optim/_multi_tensor/rprop.pyi deleted file mode 100644 index 0ea64c63d25..00000000000 --- a/torch/optim/_multi_tensor/rprop.pyi +++ /dev/null @@ -1,5 +0,0 @@ -from typing import Tuple -from ..optimizer import _params_t, Optimizer - -class Rprop(Optimizer): - def __init__(self, params: _params_t, lr: float=..., etas: Tuple[float, float]=..., step_sizes: Tuple[float, float]=...) -> None: ... diff --git a/torch/optim/rprop.py b/torch/optim/rprop.py index 741f6de6943..f580e35319b 100644 --- a/torch/optim/rprop.py +++ b/torch/optim/rprop.py @@ -1,6 +1,7 @@ import torch -from . import _functional as F +from torch import Tensor from .optimizer import Optimizer +from typing import List, Optional class Rprop(Optimizer): @@ -47,17 +48,25 @@ class Rprop(Optimizer): (default: (0.5, 1.2)) step_sizes (Tuple[float, float], optional): a pair of minimal and maximal allowed step sizes (default: (1e-6, 50)) + foreach (bool, optional): whether foreach implementation of optimizer + is used (default: None) """ - def __init__(self, params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50)): + def __init__(self, params, lr=1e-2, etas=(0.5, 1.2), step_sizes=(1e-6, 50), + foreach: Optional[bool] = None): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 < etas[0] < 1.0 < etas[1]: raise ValueError("Invalid eta values: {}, {}".format(etas[0], etas[1])) - defaults = dict(lr=lr, etas=etas, step_sizes=step_sizes) + defaults = dict(lr=lr, etas=etas, step_sizes=step_sizes, foreach=foreach) super(Rprop, self).__init__(params, defaults) + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault('foreach', None) + @torch.no_grad() def step(self, closure=None): """Performs a single optimization step. 
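# (Illustrative usage sketch, not part of the diff; it assumes a build of the
# tree after this patch series has landed.)  Each cleaned-up optimizer threads
# `foreach` the same way: constructor kwarg -> param-group default ->
# functional entry point, which dispatches to the _multi_tensor_* variant when
# foreach=True and we are not scripting.
import torch

p = [torch.randn(3, requires_grad=True)]
opt = torch.optim.Rprop(p, lr=1e-2, foreach=True)  # opt into the multi-tensor path
p[0].sum().backward()
opt.step()  # routes through rprop(..., foreach=True) -> _multi_tensor_rprop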
@@ -76,6 +85,9 @@ class Rprop(Optimizer): grads = [] prevs = [] step_sizes = [] + etaminus, etaplus = group['etas'] + step_size_min, step_size_max = group['step_sizes'] + foreach = group['foreach'] for p in group['params']: if p.grad is None: @@ -97,18 +109,128 @@ class Rprop(Optimizer): prevs.append(state['prev']) step_sizes.append(state['step_size']) - etaminus, etaplus = group['etas'] - step_size_min, step_size_max = group['step_sizes'] - state['step'] += 1 - F.rprop(params, - grads, - prevs, - step_sizes, - step_size_min=step_size_min, - step_size_max=step_size_max, - etaminus=etaminus, - etaplus=etaplus) + rprop(params, + grads, + prevs, + step_sizes, + step_size_min=step_size_min, + step_size_max=step_size_max, + etaminus=etaminus, + etaplus=etaplus, + foreach=foreach) return loss + + +def rprop(params: List[Tensor], + grads: List[Tensor], + prevs: List[Tensor], + step_sizes: List[Tensor], + # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 + # setting this as kwarg for now as functional API is compiled by torch/distributed/optim + foreach: bool = None, + *, + step_size_min: float, + step_size_max: float, + etaminus: float, + etaplus: float): + r"""Functional API that performs rprop algorithm computation. + + See :class:`~torch.optim.Rprop` for details. + """ + + if foreach is None: + # Placeholder for more complex foreach logic to be added when value is not set + foreach = False + + if foreach and torch.jit.is_scripting(): + raise RuntimeError('torch.jit.script not supported with foreach optimizers') + + if foreach and not torch.jit.is_scripting(): + func = _multi_tensor_rprop + else: + func = _single_tensor_rprop + + func(params, + grads, + prevs, + step_sizes, + step_size_min=step_size_min, + step_size_max=step_size_max, + etaminus=etaminus, + etaplus=etaplus) + + +def _single_tensor_rprop(params: List[Tensor], + grads: List[Tensor], + prevs: List[Tensor], + step_sizes: List[Tensor], + *, + step_size_min: float, + step_size_max: float, + etaminus: float, + etaplus: float): + + for i, param in enumerate(params): + grad = grads[i] + prev = prevs[i] + step_size = step_sizes[i] + + sign = grad.mul(prev).sign() + sign[sign.gt(0)] = etaplus + sign[sign.lt(0)] = etaminus + sign[sign.eq(0)] = 1 + + # update stepsizes with step size updates + step_size.mul_(sign).clamp_(step_size_min, step_size_max) + + # for dir<0, dfdx=0 + # for dir>=0 dfdx=dfdx + grad = grad.clone(memory_format=torch.preserve_format) + grad[sign.eq(etaminus)] = 0 + + # update parameters + param.addcmul_(grad.sign(), step_size, value=-1) + + prev.copy_(grad) + + +def _multi_tensor_rprop(params: List[Tensor], + grads: List[Tensor], + prevs: List[Tensor], + step_sizes: List[Tensor], + *, + step_size_min: float, + step_size_max: float, + etaminus: float, + etaplus: float): + + if len(params) == 0: + return + + signs = torch._foreach_mul(grads, prevs) + signs = [s.sign() for s in signs] + for sign in signs: + sign[sign.gt(0)] = etaplus + sign[sign.lt(0)] = etaminus + sign[sign.eq(0)] = 1 + + # update stepsizes with step size updates + torch._foreach_mul_(step_sizes, signs) + for step_size in step_sizes: + step_size.clamp_(step_size_min, step_size_max) + + # for dir<0, dfdx=0 + # for dir>=0 dfdx=dfdx + for i in range(len(grads)): + grads[i] = grads[i].clone(memory_format=torch.preserve_format) + grads[i][signs[i].eq(etaminus)] = 0 + + # update parameters + grad_signs = [grad.sign() for grad in grads] + torch._foreach_addcmul_(params, grad_signs, step_sizes, value=-1) + + for i 
in range(len(prevs)): + prevs[i].copy_(grads[i]) From 2a5aaf1c49f18e91c911d6020b71a4201c734eb4 Mon Sep 17 00:00:00 2001 From: Mikayla Gawarecki Date: Tue, 15 Feb 2022 09:52:30 -0800 Subject: [PATCH 052/199] Optim foreach cleanup for AdamW (#70484) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/70484 Test Plan: Imported from OSS Reviewed By: anjali411 Differential Revision: D33767869 Pulled By: mikaylagawarecki fbshipit-source-id: 2f5273bbfeea3ed502c5d77da4bebe1674243e86 (cherry picked from commit 2dd9b77917d67223012cfe1719d0919a422c5428) --- torch/distributed/optim/functional_adamw.py | 8 +- torch/optim/_functional.py | 54 +----- torch/optim/_multi_tensor/__init__.py | 4 +- torch/optim/_multi_tensor/__init__.pyi | 2 +- torch/optim/_multi_tensor/adamw.py | 154 --------------- torch/optim/_multi_tensor/adamw.pyi | 5 - torch/optim/adamw.py | 197 ++++++++++++++++++-- 7 files changed, 188 insertions(+), 236 deletions(-) delete mode 100644 torch/optim/_multi_tensor/adamw.py delete mode 100644 torch/optim/_multi_tensor/adamw.pyi diff --git a/torch/distributed/optim/functional_adamw.py b/torch/distributed/optim/functional_adamw.py index ae036538f99..3114d069113 100644 --- a/torch/distributed/optim/functional_adamw.py +++ b/torch/distributed/optim/functional_adamw.py @@ -24,6 +24,7 @@ class _FunctionalAdamW(object): weight_decay: float = 1e-2, amsgrad: bool = False, maximize: bool = False, + foreach: bool = False, _allow_empty_param_list: bool = False, ): if not 0.0 <= lr: @@ -46,6 +47,7 @@ class _FunctionalAdamW(object): } self.amsgrad = amsgrad self.maximize = maximize + self.foreach = foreach self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {}) if len(params) == 0 and not _allow_empty_param_list: @@ -100,7 +102,8 @@ class _FunctionalAdamW(object): beta2=self.defaults['beta2'], lr=self.defaults['lr'], weight_decay=self.defaults['weight_decay'], - eps=self.defaults['eps']) + eps=self.defaults['eps'], + foreach=self.foreach) def step(self, gradients: List[Optional[Tensor]]): params = self.param_group['params'] @@ -158,4 +161,5 @@ class _FunctionalAdamW(object): beta2=self.defaults['beta2'], lr=self.defaults['lr'], weight_decay=self.defaults['weight_decay'], - eps=self.defaults['eps']) + eps=self.defaults['eps'], + foreach=self.foreach) diff --git a/torch/optim/_functional.py b/torch/optim/_functional.py index da137165754..2fa7b3dddd0 100644 --- a/torch/optim/_functional.py +++ b/torch/optim/_functional.py @@ -1,12 +1,12 @@ r"""Functional interface""" import math -import torch from torch import Tensor from typing import List from .adadelta import adadelta # type: ignore[attr-defined] # noqa: F401 from .adagrad import adagrad, _make_sparse # type: ignore[attr-defined] # noqa: F401 from .adam import adam # type: ignore[attr-defined] # noqa: F401 +from .adamw import adamw # type: ignore[attr-defined] # noqa: F401 from .adamax import adamax # type: ignore[attr-defined] # noqa: F401 from .asgd import asgd # type: ignore[attr-defined] # noqa: F401 from .nadam import nadam # type: ignore[attr-defined] # noqa: F401 @@ -18,58 +18,6 @@ from .sgd import sgd # type: ignore[attr-defined] # noqa: F401 # TODO: use foreach API in optim._functional to do all the computation -def adamw(params: List[Tensor], - grads: List[Tensor], - exp_avgs: List[Tensor], - exp_avg_sqs: List[Tensor], - max_exp_avg_sqs: List[Tensor], - state_steps: List[Tensor], - *, - amsgrad: bool, - beta1: float, - beta2: float, - lr: float, - weight_decay: float, - eps: float, - maximize: bool): 
- r"""Functional API that performs AdamW algorithm computation. - - See :class:`~torch.optim.AdamW` for details. - """ - - if not all([isinstance(t, torch.Tensor) for t in state_steps]): - raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors") - - for i, param in enumerate(params): - grad = grads[i] if not maximize else -grads[i] - exp_avg = exp_avgs[i] - exp_avg_sq = exp_avg_sqs[i] - step_t = state_steps[i] - # update step - step_t += 1 - step = step_t.item() - - # Perform stepweight decay - param.mul_(1 - lr * weight_decay) - - bias_correction1 = 1 - beta1 ** step - bias_correction2 = 1 - beta2 ** step - - # Decay the first and second moment running average coefficient - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - if amsgrad: - # Maintains the maximum of all 2nd moment running avg. till now - torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i]) - # Use the max. for normalizing running avg. of gradient - denom = (max_exp_avg_sqs[i].sqrt() / math.sqrt(bias_correction2)).add_(eps) - else: - denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) - - step_size = lr / bias_correction1 - - param.addcdiv_(exp_avg, denom, value=-step_size) - def sparse_adam(params: List[Tensor], grads: List[Tensor], diff --git a/torch/optim/_multi_tensor/__init__.py b/torch/optim/_multi_tensor/__init__.py index e9e6b13b8da..ed0b6a7d178 100644 --- a/torch/optim/_multi_tensor/__init__.py +++ b/torch/optim/_multi_tensor/__init__.py @@ -8,7 +8,7 @@ from functools import partial from torch import optim Adam = partial(optim.Adam, foreach=True) -from .adamw import AdamW +AdamW = partial(optim.AdamW, foreach=True) NAdam = partial(optim.NAdam, foreach=True) SGD = partial(optim.SGD, foreach=True) RAdam = partial(optim.RAdam, foreach=True) @@ -18,5 +18,3 @@ ASGD = partial(optim.ASGD, foreach=True) Adamax = partial(optim.Adamax, foreach=True) Adadelta = partial(optim.Adadelta, foreach=True) Adagrad = partial(optim.Adagrad, foreach=True) - -del adamw diff --git a/torch/optim/_multi_tensor/__init__.pyi b/torch/optim/_multi_tensor/__init__.pyi index 812d9fc3416..fec9f9ae782 100644 --- a/torch/optim/_multi_tensor/__init__.pyi +++ b/torch/optim/_multi_tensor/__init__.pyi @@ -2,7 +2,7 @@ from functools import partial from torch import optim Adam = partial(optim.Adam, foreach=True) -from .adamw import AdamW as AdamW +AdamW = partial(optim.AdamW, foreach=True) NAdam = partial(optim.NAdam, foreach=True) SGD = partial(optim.SGD, foreach=True) RAdam = partial(optim.RAdam, foreach=True) diff --git a/torch/optim/_multi_tensor/adamw.py b/torch/optim/_multi_tensor/adamw.py deleted file mode 100644 index 6e4e1701f3f..00000000000 --- a/torch/optim/_multi_tensor/adamw.py +++ /dev/null @@ -1,154 +0,0 @@ -import math -import torch -from ..optimizer import Optimizer - -class AdamW(Optimizer): - r"""Implements AdamW algorithm. - - The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. - The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. 
- - Args: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay coefficient (default: 1e-2) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _Decoupled Weight Decay Regularization: - https://arxiv.org/abs/1711.05101 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=1e-2, amsgrad=False, *, maximize: bool = False): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - if not 0.0 <= weight_decay: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad, maximize=maximize, foreach=True) - super(AdamW, self).__init__(params, defaults) - - def __setstate__(self, state): - super(AdamW, self).__setstate__(state) - for group in self.param_groups: - group.setdefault('amsgrad', False) - group.setdefault('maximize', False) - state_values = list(self.state.values()) - step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step']) - if not step_is_tensor: - for s in state_values: - s['step'] = torch.tensor(float(s['step'])) - - @torch.no_grad() - def step(self, closure=None): - """Performs a single optimization step. - - Args: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - with torch.enable_grad(): - loss = closure() - - for group in self.param_groups: - amsgrad = group['amsgrad'] - - grads = [] - state_steps = [] - exp_avg = [] - exp_avg_sq = [] - max_exp_avg_sq = [] - params_with_grad = [] - - for p in group['params']: - if p.grad is not None: - if p.grad.is_sparse: - raise RuntimeError('AdamW does not support sparse gradients') - - # Perform stepweight decay - p.mul_(1 - group['lr'] * group['weight_decay']) - - params_with_grad.append(p) - grads.append(p.grad) - - if group['maximize']: - grads = torch._foreach_neg(tuple(grads)) - - for p in params_with_grad: - state = self.state[p] - - # State initialization - if len(state) == 0: - state['step'] = torch.tensor(0.) - # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. 
values - state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) - - exp_avg.append(state['exp_avg']) - exp_avg_sq.append(state['exp_avg_sq']) - - if amsgrad: - max_exp_avg_sq.append(state['max_exp_avg_sq']) - - state_steps.append(state['step']) - - - beta1, beta2 = group['betas'] - - # update steps - torch._foreach_add_(state_steps, 1) - - bias_correction1 = [1 - beta1 ** step.item() for step in state_steps] - bias_correction2 = [1 - beta2 ** step.item() for step in state_steps] - - # - # Decay the first and second moment running average coefficient - # - torch._foreach_mul_(exp_avg, beta1) - torch._foreach_add_(exp_avg, grads, alpha=1 - beta1) - - torch._foreach_mul_(exp_avg_sq, beta2) - torch._foreach_addcmul_(exp_avg_sq, grads, grads, 1 - beta2) - - if amsgrad: - # Maintains the maximum of all 2nd moment running avg. till now - max_exp_avg_sq = torch._foreach_maximum(max_exp_avg_sq, exp_avg_sq) - - # Use the max. for normalizing running avg. of gradient - max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sq) - bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] - torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt) - denom = torch._foreach_add(max_exp_avg_sq_sqrt, group['eps']) - else: - exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sq) - bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] - torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt) - denom = torch._foreach_add(exp_avg_sq_sqrt, group['eps']) - - step_size = [-1 * (group['lr'] / bc) for bc in bias_correction1] - torch._foreach_addcdiv_(params_with_grad, exp_avg, denom, step_size) - - return loss diff --git a/torch/optim/_multi_tensor/adamw.pyi b/torch/optim/_multi_tensor/adamw.pyi deleted file mode 100644 index dedd8de3f87..00000000000 --- a/torch/optim/_multi_tensor/adamw.pyi +++ /dev/null @@ -1,5 +0,0 @@ -from typing import Tuple -from ..optimizer import _params_t, Optimizer - -class AdamW(Optimizer): - def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=..., amsgrad: bool = ...) -> None: ... diff --git a/torch/optim/adamw.py b/torch/optim/adamw.py index 0089acbcff0..9582ea38b00 100644 --- a/torch/optim/adamw.py +++ b/torch/optim/adamw.py @@ -1,6 +1,8 @@ +import math import torch -from . import _functional as F +from torch import Tensor from .optimizer import Optimizer +from typing import List, Optional class AdamW(Optimizer): @@ -57,6 +59,8 @@ class AdamW(Optimizer): (default: False) maximize (bool, optional): maximize the params based on the objective, instead of minimizing (default: False) + foreach (bool, optional): whether foreach implementation of optimizer + is used (default: None) .. 
_Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101 @@ -65,7 +69,8 @@ class AdamW(Optimizer): """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=1e-2, amsgrad=False, *, maximize: bool = False): + weight_decay=1e-2, amsgrad=False, *, maximize: bool = False, + foreach: Optional[bool] = None): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: @@ -77,14 +82,16 @@ class AdamW(Optimizer): if not 0.0 <= weight_decay: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad, maximize=maximize) + weight_decay=weight_decay, amsgrad=amsgrad, + foreach=foreach, maximize=maximize) super(AdamW, self).__init__(params, defaults) def __setstate__(self, state): - super(AdamW, self).__setstate__(state) + super().__setstate__(state) for group in self.param_groups: group.setdefault('amsgrad', False) group.setdefault('maximize', False) + group.setdefault('foreach', None) state_values = list(self.state.values()) step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step']) if not step_is_tensor: @@ -109,7 +116,6 @@ class AdamW(Optimizer): grads = [] exp_avgs = [] exp_avg_sqs = [] - state_sums = [] max_exp_avg_sqs = [] state_steps = [] amsgrad = group['amsgrad'] @@ -144,18 +150,173 @@ class AdamW(Optimizer): state_steps.append(state['step']) - F.adamw(params_with_grad, - grads, - exp_avgs, - exp_avg_sqs, - max_exp_avg_sqs, - state_steps, - amsgrad=amsgrad, - beta1=beta1, - beta2=beta2, - lr=group['lr'], - weight_decay=group['weight_decay'], - eps=group['eps'], - maximize=group['maximize']) + adamw(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=amsgrad, + beta1=beta1, + beta2=beta2, + lr=group['lr'], + weight_decay=group['weight_decay'], + eps=group['eps'], + maximize=group['maximize'], + foreach=group['foreach']) return loss + + +def adamw(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + max_exp_avg_sqs: List[Tensor], + state_steps: List[Tensor], + # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 + # setting this as kwarg for now as functional API is compiled by torch/distributed/optim + foreach: bool = None, + *, + amsgrad: bool, + beta1: float, + beta2: float, + lr: float, + weight_decay: float, + eps: float, + maximize: bool): + r"""Functional API that performs AdamW algorithm computation. + + See :class:`~torch.optim.AdamW` for details. 
+ """ + + if not all([isinstance(t, torch.Tensor) for t in state_steps]): + raise RuntimeError("API has changed, `state_steps` argument must contain a list of singleton tensors") + + if foreach is None: + # Placeholder for more complex foreach logic to be added when value is not set + foreach = False + + if foreach and torch.jit.is_scripting(): + raise RuntimeError('torch.jit.script not supported with foreach optimizers') + + if foreach and not torch.jit.is_scripting(): + func = _multi_tensor_adamw + else: + func = _single_tensor_adamw + + func(params, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=amsgrad, + beta1=beta1, + beta2=beta2, + lr=lr, + weight_decay=weight_decay, + eps=eps, + maximize=maximize) + + +def _single_tensor_adamw(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + max_exp_avg_sqs: List[Tensor], + state_steps: List[Tensor], + *, + amsgrad: bool, + beta1: float, + beta2: float, + lr: float, + weight_decay: float, + eps: float, + maximize: bool): + + for i, param in enumerate(params): + grad = grads[i] if not maximize else -grads[i] + exp_avg = exp_avgs[i] + exp_avg_sq = exp_avg_sqs[i] + step_t = state_steps[i] + # update step + step_t += 1 + step = step_t.item() + + # Perform stepweight decay + param.mul_(1 - lr * weight_decay) + + bias_correction1 = 1 - beta1 ** step + bias_correction2 = 1 - beta2 ** step + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.maximum(max_exp_avg_sqs[i], exp_avg_sq, out=max_exp_avg_sqs[i]) + # Use the max. for normalizing running avg. of gradient + denom = (max_exp_avg_sqs[i].sqrt() / math.sqrt(bias_correction2)).add_(eps) + else: + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) + + step_size = lr / bias_correction1 + + param.addcdiv_(exp_avg, denom, value=-step_size) + + +def _multi_tensor_adamw(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + max_exp_avg_sqs: List[Tensor], + state_steps: List[Tensor], + *, + amsgrad: bool, + beta1: float, + beta2: float, + lr: float, + weight_decay: float, + eps: float, + maximize: bool): + + if len(params) == 0: + return + + if maximize: + grads = torch._foreach_neg(tuple(grads)) # type: ignore[assignment] + + # Perform stepweight decay + torch._foreach_mul_(params, 1 - lr * weight_decay) + + # update steps + torch._foreach_add_(state_steps, 1) + + bias_correction1 = [1 - beta1 ** step.item() for step in state_steps] + bias_correction2 = [1 - beta2 ** step.item() for step in state_steps] + + # Decay the first and second moment running average coefficient + torch._foreach_mul_(exp_avgs, beta1) + torch._foreach_add_(exp_avgs, grads, alpha=1 - beta1) + + torch._foreach_mul_(exp_avg_sqs, beta2) + torch._foreach_addcmul_(exp_avg_sqs, grads, grads, 1 - beta2) + + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + max_exp_avg_sqs = torch._foreach_maximum(max_exp_avg_sqs, exp_avg_sqs) # type: ignore[assignment] + + # Use the max. for normalizing running avg. 
of gradient + max_exp_avg_sq_sqrt = torch._foreach_sqrt(max_exp_avg_sqs) + bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] + torch._foreach_div_(max_exp_avg_sq_sqrt, bias_correction_sqrt) + denom = torch._foreach_add(max_exp_avg_sq_sqrt, eps) + else: + exp_avg_sq_sqrt = torch._foreach_sqrt(exp_avg_sqs) + bias_correction_sqrt = [math.sqrt(bc) for bc in bias_correction2] + torch._foreach_div_(exp_avg_sq_sqrt, bias_correction_sqrt) + denom = torch._foreach_add(exp_avg_sq_sqrt, eps) + + step_size = [-1 * (lr / bc) for bc in bias_correction1] + torch._foreach_addcdiv_(params, exp_avgs, denom, step_size) From 03662b32d55b33eb6a2d63bb8ccec4671e9d323f Mon Sep 17 00:00:00 2001 From: Mikayla Gawarecki Date: Tue, 15 Feb 2022 09:52:30 -0800 Subject: [PATCH 053/199] Uncomment step no-op test in test_optim (#70953) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/70953 Test Plan: Imported from OSS Reviewed By: anjali411 Differential Revision: D33767861 Pulled By: mikaylagawarecki fbshipit-source-id: 8b41c8ee5d0e045436b10da5f68e9d5c5852c334 (cherry picked from commit 9224afc453cbc1c74da2b1c036dc78e1c210ac37) --- test/test_optim.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/test_optim.py b/test/test_optim.py index 061f8a44765..c59d6a49bb4 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -481,9 +481,8 @@ class TestOptim(TestCase): loss.backward() # Test that step behaves as expected (a no-op) when grads are set to None - # TODO: uncomment after optim foreach cleanup is landed - # if iter == 0: - # optimizer.zero_grad(set_to_none=True) + if iter == 0: + optimizer.zero_grad(set_to_none=True) optimizer.step() From 41782a4542e5e63111b11adb2fed4f4180295a0b Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 15 Feb 2022 10:03:33 -0800 Subject: [PATCH 054/199] [quant][core][devs] Refactor the implementation for quantized batchnorm module (#72489) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72489 To reduce the duplicated code Test Plan: python test/test_quantization.py TestStaticQuantizedModule python test/test_quantization.py TestQuantizeFxOps.test_qbatch_norm Imported from OSS Reviewed By: vkuzo Differential Revision: D34062367 fbshipit-source-id: cee14051bbe5dd2597e0eb6bf2d38993be9e51b3 (cherry picked from commit d9ca5cdbb1c0b5bcd8a98077cdb2b2d9e7aa5c48) --- torch/nn/quantized/modules/batchnorm.py | 102 ++++++++++++++---------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/torch/nn/quantized/modules/batchnorm.py b/torch/nn/quantized/modules/batchnorm.py index b30bf203cfa..d6357a50425 100644 --- a/torch/nn/quantized/modules/batchnorm.py +++ b/torch/nn/quantized/modules/batchnorm.py @@ -1,70 +1,86 @@ import torch import torch.nn.quantized.functional import torch.nn.intrinsic as nni +from torch import Tensor -class BatchNorm2d(torch.nn.BatchNorm2d): - r"""This is the quantized version of :class:`~torch.nn.BatchNorm2d`. 
- """ - +class _BatchNorm(torch.nn.modules.batchnorm._BatchNorm): def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None) -> None: factory_kwargs = {'device': device, 'dtype': dtype} - super(BatchNorm2d, self).__init__(num_features, **factory_kwargs) - self.eps = eps + super().__init__(num_features, eps, momentum, True, True, **factory_kwargs) self.register_buffer('scale', torch.tensor(1.0, **factory_kwargs)) self.register_buffer('zero_point', torch.tensor(0, **factory_kwargs)) - def forward(self, input): - return torch.ops.quantized.batch_norm2d(input, self.weight, self.bias, self.running_mean, - self.running_var, self.eps, self.scale, self.zero_point) + @staticmethod + def from_float(cls, mod): + activation_post_process = mod.activation_post_process + if type(mod) == cls._NNI_BN_RELU_MODULE: + mod = mod[0] + scale, zero_point = activation_post_process.calculate_qparams() + new_mod = cls(mod.num_features, mod.eps) + new_mod.weight = mod.weight + new_mod.bias = mod.bias + new_mod.running_mean = mod.running_mean + new_mod.running_var = mod.running_var + new_mod.scale = scale + new_mod.zero_point = zero_point + return new_mod + +class BatchNorm2d(_BatchNorm): + r"""This is the quantized version of :class:`~torch.nn.BatchNorm2d`. + """ + + _NNI_BN_RELU_MODULE = nni.BNReLU2d + + def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None) -> None: + factory_kwargs = {'device': device, 'dtype': dtype} + super().__init__(num_features, eps, momentum, **factory_kwargs) def _get_name(self): return 'QuantizedBatchNorm2d' + def _check_input_dim(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 4: + raise ValueError("Input shape must be `(N, C, H, W)`!") + + def forward(self, input: Tensor) -> Tensor: + # disabling this since this is not symbolically traceable + # self._check_input_dim(input) + return torch.ops.quantized.batch_norm2d( + input, self.weight, self.bias, self.running_mean, + self.running_var, self.eps, self.scale, self.zero_point) + @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process - if type(mod) == nni.BNReLU2d: - mod = mod[0] - scale, zero_point = activation_post_process.calculate_qparams() - new_mod = cls(mod.num_features, mod.eps) - new_mod.weight = mod.weight - new_mod.bias = mod.bias - new_mod.running_mean = mod.running_mean - new_mod.running_var = mod.running_var - new_mod.scale = scale - new_mod.zero_point = zero_point - return new_mod + return _BatchNorm.from_float(cls, mod) -# TODO: dedup with BatchNorm2d -class BatchNorm3d(torch.nn.BatchNorm3d): +class BatchNorm3d(_BatchNorm): r"""This is the quantized version of :class:`~torch.nn.BatchNorm3d`. 
""" + _NNI_BN_RELU_MODULE = nni.BNReLU3d + def __init__(self, num_features, eps=1e-5, momentum=0.1, device=None, dtype=None): factory_kwargs = {'device': device, 'dtype': dtype} - super(BatchNorm3d, self).__init__(num_features, **factory_kwargs) - self.eps = eps - self.register_buffer('scale', torch.tensor(1.0, **factory_kwargs)) - self.register_buffer('zero_point', torch.tensor(0, **factory_kwargs)) - - def forward(self, input): - return torch.ops.quantized.batch_norm3d(input, self.weight, self.bias, self.running_mean, - self.running_var, self.eps, self.scale, self.zero_point) + super().__init__(num_features, eps, momentum, **factory_kwargs) def _get_name(self): return 'QuantizedBatchNorm3d' + def _check_input_dim(self, input): + # Temporarily using len(shape) instead of ndim due to JIT issue + # https://github.com/pytorch/pytorch/issues/23890 + if len(input.shape) != 5: + raise ValueError("Input shape must be `(N, C, H, W)`!") + + def forward(self, input: Tensor) -> Tensor: + # disabling this since this is not symbolically traceable + # self._check_input_dim(input) + return torch.ops.quantized.batch_norm3d( + input, self.weight, self.bias, self.running_mean, + self.running_var, self.eps, self.scale, self.zero_point) + @classmethod def from_float(cls, mod): - activation_post_process = mod.activation_post_process - if type(mod) == nni.BNReLU3d: - mod = mod[0] - scale, zero_point = activation_post_process.calculate_qparams() - new_mod = cls(mod.num_features, mod.eps) - new_mod.weight = mod.weight - new_mod.bias = mod.bias - new_mod.running_mean = mod.running_mean - new_mod.running_var = mod.running_var - new_mod.scale = scale - new_mod.zero_point = zero_point - return new_mod + return _BatchNorm.from_float(cls, mod) From 7db4a48d92d8a5cc23de5cfd79704f1417e85af2 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Tue, 15 Feb 2022 10:39:11 -0800 Subject: [PATCH 055/199] Revert D33342569: (2/2) Make TorchScript Preserve Fully Qualified Class Name for Python Exceptions: frontend change Test Plan: revert-hammer Differential Revision: D33342569 (https://github.com/pytorch/pytorch/commit/856157fcee3f5f8c48ca185ecad65457be490a7d) Original commit changeset: 57984ac67ae2 Original Phabricator Diff: D33342569 (https://github.com/pytorch/pytorch/commit/856157fcee3f5f8c48ca185ecad65457be490a7d) fbshipit-source-id: 4c12235a1776a3652e7f91e93b626705759d5176 (cherry picked from commit 4cbd7d8bab76fcf050e376c8528dba36541a779f) --- test/cpp/jit/test_exception.cpp | 159 ---------------- test/jit/myexception.py | 8 - test/jit/test_exception.py | 176 ------------------ test/test_jit.py | 147 +++++++++++++++ torch/_jit_internal.py | 19 +- torch/csrc/jit/frontend/ir_emitter.cpp | 8 +- torch/csrc/jit/frontend/sugared_value.h | 12 +- .../csrc/jit/python/python_sugared_value.cpp | 5 +- torch/csrc/jit/python/python_sugared_value.h | 10 +- 9 files changed, 159 insertions(+), 385 deletions(-) delete mode 100644 test/cpp/jit/test_exception.cpp delete mode 100644 test/jit/myexception.py delete mode 100644 test/jit/test_exception.py diff --git a/test/cpp/jit/test_exception.cpp b/test/cpp/jit/test_exception.cpp deleted file mode 100644 index b6b3cbcd679..00000000000 --- a/test/cpp/jit/test_exception.cpp +++ /dev/null @@ -1,159 +0,0 @@ -/* - * We have a python unit test for exceptions in test/jit/test_exception.py . - * Add a CPP version here to verify that excepted exception types thrown from - * C++. This is hard to test in python code since C++ exceptions will be - * translated to python exceptions. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include - -namespace torch { -namespace jit { - -namespace py = pybind11; - -TEST(TestException, TestAssertion) { - std::string pythonCode = R"PY( - def foo(): - raise AssertionError("An assertion failed") - )PY"; - auto cu_ptr = torch::jit::compile(pythonCode); - torch::jit::GraphFunction* gf = - (torch::jit::GraphFunction*)&cu_ptr->get_function("foo"); - std::cerr << "Graph is\n" << *gf->graph() << std::endl; - - bool is_jit_exception = false; - std::string message; - c10::optional exception_class; - try { - cu_ptr->run_method("foo"); - } catch (JITException& e) { - is_jit_exception = true; - message = e.what(); - exception_class = e.getPythonClassName(); - } - EXPECT_TRUE(is_jit_exception); - EXPECT_FALSE(exception_class); - EXPECT_TRUE( - message.find("RuntimeError: AssertionError: An assertion failed") != - std::string::npos); -} - -struct MyPythonExceptionValue : public torch::jit::SugaredValue { - explicit MyPythonExceptionValue(const py::object& exception_class) { - qualified_name_ = - (py::str(py::getattr(exception_class, "__module__", py::str(""))) + - py::str(".") + - py::str(py::getattr(exception_class, "__name__", py::str("")))) - .cast(); - } - - std::string kind() const override { - return "My Python exception"; - } - - // Simplified from PythonExceptionValue::call - std::shared_ptr call( - const torch::jit::SourceRange& loc, - torch::jit::GraphFunction& caller, - at::ArrayRef args, - at::ArrayRef kwargs, - size_t n_binders) override { - TORCH_CHECK(args.size() == 1); - Value* error_message = args.at(0).value(*caller.graph()); - Value* qualified_class_name = - insertConstant(*caller.graph(), qualified_name_, loc); - return std::make_shared( - error_message, qualified_class_name); - } - - private: - std::string qualified_name_; -}; - -class SimpleResolver : public torch::jit::Resolver { - public: - explicit SimpleResolver() {} - - std::shared_ptr resolveValue( - const std::string& name, - torch::jit::GraphFunction& m, - const torch::jit::SourceRange& loc) override { - // follows toSugaredValue (toSugaredValue is defined in caffe2:_C which is - // a python extension. We can not add that as a cpp_binary's dep) - if (name == "SimpleValueError") { - py::object obj = py::globals()["SimpleValueError"]; - return std::make_shared(obj); - } - TORCH_CHECK(false, "resolveValue: can not resolve '", name, "{}'"); - } - - torch::jit::TypePtr resolveType( - const std::string& name, - const torch::jit::SourceRange& loc) override { - return nullptr; - } -}; - -/* - * - The python source code parsing for TorchScript here is learned from - * torch::jit::compile. - * - The code only parses one Def. If there are multiple in the code, those - * except the first one are skipped. - */ -TEST(TestException, TestCustomException) { - py::scoped_interpreter guard{}; - py::exec(R"PY( - class SimpleValueError(ValueError): - def __init__(self, message): - super(SimpleValueError, self).__init__(message) - )PY"); - - std::string pythonCode = R"PY( - def foo(): - raise SimpleValueError("An assertion failed") - )PY"; - - torch::jit::Parser p( - std::make_shared(pythonCode, "", 1)); - auto def = torch::jit::Def(p.parseFunction(/*is_method=*/false)); - std::cerr << "Def is:\n" << def << std::endl; - auto cu = std::make_shared(); - (void)cu->define( - c10::nullopt, - {}, - {}, - {def}, - // class PythonResolver is defined in - // torch/csrc/jit/python/script_init.cpp. It's not in a header file so I - // can not use it. 
Create a SimpleResolver insteand - {std::make_shared()}, - nullptr); - torch::jit::GraphFunction* gf = - (torch::jit::GraphFunction*)&cu->get_function("foo"); - std::cerr << "Graph is\n" << *gf->graph() << std::endl; - bool is_jit_exception = false; - c10::optional exception_class; - std::string message; - try { - cu->run_method("foo"); - } catch (JITException& e) { - is_jit_exception = true; - exception_class = e.getPythonClassName(); - message = e.what(); - } - EXPECT_TRUE(is_jit_exception); - EXPECT_EQ("__main__.SimpleValueError", *exception_class); - EXPECT_TRUE( - message.find("__main__.SimpleValueError: An assertion failed") != - std::string::npos); -} - -} // namespace jit -} // namespace torch diff --git a/test/jit/myexception.py b/test/jit/myexception.py deleted file mode 100644 index 5937bd3c91b..00000000000 --- a/test/jit/myexception.py +++ /dev/null @@ -1,8 +0,0 @@ -r""" -Define exceptions used in test_exception.py. We define them in a -separate file on purpose to make sure the fully qualified exception class name -is captured correctly in suce cases. -""" -class MyKeyError(KeyError): - def __init__(self, msg): - super(KeyError, self).__init__(msg) diff --git a/test/jit/test_exception.py b/test/jit/test_exception.py deleted file mode 100644 index dce38e3be89..00000000000 --- a/test/jit/test_exception.py +++ /dev/null @@ -1,176 +0,0 @@ -# Owner(s): ["oncall: jit"] -from torch.testing._internal.common_utils import TestCase -import torch -from torch import nn - -r""" -Test TorchScript exception handling. -""" -class TestException(TestCase): - def test_pyop_exception_message(self): - class Foo(torch.jit.ScriptModule): - def __init__(self): - super(Foo, self).__init__() - self.conv = nn.Conv2d(1, 10, kernel_size=5) - - @torch.jit.script_method - def forward(self, x): - return self.conv(x) - foo = Foo() - # testing that the correct error message propagates - with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): - foo(torch.ones([123])) # wrong size - - def test_builtin_error_messsage(self): - with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): - @torch.jit.script - def close_match(x): - return x.masked_fill(True) - - with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " - "supported in TorchScript"): - @torch.jit.script - def unknown_op(x): - torch.set_anomaly_enabled(True) - return x - - def test_exceptions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - if bool(cond): - raise ValueError(3) - return 1 - ''') - - cu.foo(torch.tensor(0)) - with self.assertRaisesRegex(torch.jit.Error, "3"): - cu.foo(torch.tensor(1)) - - def foo(cond): - a = 3 - if bool(cond): - raise ArbitraryError(a, "hi") - if 1 == 2: - raise ArbitraryError - return a - - with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): - torch.jit.script(foo) - - def exception_as_value(): - a = Exception() - print(a) - - with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): - torch.jit.script(exception_as_value) - - @torch.jit.script - def foo_no_decl_always_throws(): - raise RuntimeError("Hi") - - # function that has no declared type but always throws set to None - output_type = next(foo_no_decl_always_throws.graph.outputs()).type() - self.assertTrue(str(output_type) == "NoneType") - - @torch.jit.script - def foo_decl_always_throws(): - # type: () -> Tensor - raise Exception("Hi") - - output_type = next(foo_decl_always_throws.graph.outputs()).type() - 
self.assertTrue(str(output_type) == "Tensor") - - def foo(): - raise 3 + 4 - - with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): - torch.jit.script(foo) - - # a escapes scope - @torch.jit.script - def foo(): - if 1 == 1: - a = 1 - else: - if 1 == 1: - raise Exception("Hi") - else: - raise Exception("Hi") - return a - self.assertEqual(foo(), 1) - - @torch.jit.script - def tuple_fn(): - raise RuntimeError("hello", "goodbye") - - with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): - tuple_fn() - - @torch.jit.script - def no_message(): - raise RuntimeError - - with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): - no_message() - - def test_assertions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - assert bool(cond), "hi" - return 0 - ''') - - cu.foo(torch.tensor(1)) - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - cu.foo(torch.tensor(0)) - - @torch.jit.script - def foo(cond): - assert bool(cond), "hi" - - foo(torch.tensor(1)) - # we don't currently validate the name of the exception - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - foo(torch.tensor(0)) - - def test_python_op_exception(self): - @torch.jit.ignore - def python_op(x): - raise Exception("bad!") - - @torch.jit.script - def fn(x): - return python_op(x) - - with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): - fn(torch.tensor(4)) - - def test_dict_expansion_raises_error(self): - def fn(self): - d = {"foo": 1, "bar": 2, "baz": 3} - return {**d} - - with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, - "Dict expansion "): - torch.jit.script(fn) - - def test_custom_python_exception(self): - class MyValueError(ValueError): - def __init__(self, msg): - super(MyValueError, self).__init__(msg) - - @torch.jit.script - def fn(): - raise MyValueError("test custom exception") - - with self.assertRaisesRegex(torch.jit.Error, "jit.test_exception.MyValueError: test custom exception"): - fn() - - def test_custom_python_exception_defined_elsewhere(self): - from jit.myexception import MyKeyError - - @torch.jit.script - def fn(): - raise MyKeyError("This is a user defined key error") - with self.assertRaisesRegex(torch.jit.Error, "jit.myexception.MyKeyError: This is a user defined key error"): - fn() diff --git a/test/test_jit.py b/test/test_jit.py index 2527fbf941b..37cd9b5d53c 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -13013,6 +13013,153 @@ dedent """ self.checkScript(dedent(code), (101,)) + def test_pyop_exception_message(self): + class Foo(torch.jit.ScriptModule): + def __init__(self): + super(Foo, self).__init__() + self.conv = nn.Conv2d(1, 10, kernel_size=5) + + @torch.jit.script_method + def forward(self, x): + return self.conv(x) + foo = Foo() + # testing that the correct error message propagates + with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): + foo(torch.ones([123])) # wrong size + + def test_builtin_error_messsage(self): + with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): + @torch.jit.script + def close_match(x): + return x.masked_fill(True) + + with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " + "supported in TorchScript"): + @torch.jit.script + def unknown_op(x): + torch.set_anomaly_enabled(True) + return x + + def test_exceptions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + if bool(cond): + raise ValueError(3) + return 
1 + ''') + + cu.foo(torch.tensor(0)) + with self.assertRaisesRegex(torch.jit.Error, "3"): + cu.foo(torch.tensor(1)) + + def foo(cond): + a = 3 + if bool(cond): + raise ArbitraryError(a, "hi") + if 1 == 2: + raise ArbitraryError + return a + + with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): + torch.jit.script(foo) + + def exception_as_value(): + a = Exception() + print(a) + + with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): + torch.jit.script(exception_as_value) + + @torch.jit.script + def foo_no_decl_always_throws(): + raise RuntimeError("Hi") + + # function that has no declared type but always throws set to None + output_type = next(foo_no_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "NoneType") + + @torch.jit.script + def foo_decl_always_throws(): + # type: () -> Tensor + raise Exception("Hi") + + output_type = next(foo_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "Tensor") + + def foo(): + raise 3 + 4 + + with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): + torch.jit.script(foo) + + # a escapes scope + @torch.jit.script + def foo(): + if 1 == 1: + a = 1 + else: + if 1 == 1: + raise Exception("Hi") + else: + raise Exception("Hi") + return a + self.assertEqual(foo(), 1) + + @torch.jit.script + def tuple_fn(): + raise RuntimeError("hello", "goodbye") + + with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): + tuple_fn() + + @torch.jit.script + def no_message(): + raise RuntimeError + + with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): + no_message() + + def test_assertions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + assert bool(cond), "hi" + return 0 + ''') + + cu.foo(torch.tensor(1)) + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + cu.foo(torch.tensor(0)) + + @torch.jit.script + def foo(cond): + assert bool(cond), "hi" + + foo(torch.tensor(1)) + # we don't currently validate the name of the exception + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + foo(torch.tensor(0)) + + def test_python_op_exception(self): + @torch.jit.ignore + def python_op(x): + raise Exception("bad!") + + @torch.jit.script + def fn(x): + return python_op(x) + + with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): + fn(torch.tensor(4)) + + def test_dict_expansion_raises_error(self): + def fn(self): + d = {"foo": 1, "bar": 2, "baz": 3} + return {**d} + + with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, + "Dict expansion "): + torch.jit.script(fn) + def test_module_parameters_and_buffers(self): weights = torch.randn(10, 10) bias = torch.randn(10) diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index ba570b35391..20616a978d4 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -977,7 +977,7 @@ def is_scripting() -> bool: # Retrieves a fully-qualified name (module hierarchy + classname) for a given obj. -def _qualified_name(obj, mangle_name=True) -> str: +def _qualified_name(obj) -> str: # This special case allows us to override the qualified name on a type. # It's currently used in conjunction with tracing, where we create a # fake module to filter only supported attributes. 
However, since this @@ -1026,16 +1026,13 @@ def _qualified_name(obj, mangle_name=True) -> str: module_name = module_name.replace("<", "_") module_name = module_name.replace(">", "_") - # The PythonExceptionValue C++ class in torch/csrc/jit/python/python_sugared_value.h - # does not need mangle the python class name. - if mangle_name: - # __main__ is a builtin module, so rewrite it to "__torch__". - if module_name == "__main__": - module_name = "__torch__" - else: - # Everything else gets a "__torch__" prefix to avoid name collisions - # with the names of user values. - module_name = "__torch__." + module_name + # __main__ is a builtin module, so rewrite it to "__torch__". + if module_name == "__main__": + module_name = "__torch__" + else: + # Everything else gets a "__torch__" prefix to avoid name collisions + # with the names of user values. + module_name = "__torch__." + module_name if "." in name: raise RuntimeError(f"Could not get qualified name for class '{name}': " diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index eac6161c923..20cab7c7499 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -2478,14 +2478,12 @@ struct to_ir { void emitRaise(const Raise& raise) { auto sv = emitSugaredExpr(raise.expr(), 1); Value* error_message = nullptr; - Value* qualified_class_name = nullptr; if (auto exception_instance = std::dynamic_pointer_cast(sv)) { // The typical case, an instance of the exception class was thrown: // raise RuntimeError("error") error_message = exception_instance->getValue(); - qualified_class_name = exception_instance->getQualifiedClassName(); } else if ( auto exception_class = std::dynamic_pointer_cast(sv)) { // A bare exception was thrown so add an empty message. e.g. @@ -2502,11 +2500,7 @@ struct to_ir { error_message = graph->insert(aten::str, {error_message}); } - graph->insert( - prim::RaiseException, - {error_message, qualified_class_name}, - {}, - raise.range()); + graph->insert(prim::RaiseException, {error_message}, {}, raise.range()); exit_blocks.insert(environment_stack->block()); } diff --git a/torch/csrc/jit/frontend/sugared_value.h b/torch/csrc/jit/frontend/sugared_value.h index 6ddd9bed753..f6a3f72a59d 100644 --- a/torch/csrc/jit/frontend/sugared_value.h +++ b/torch/csrc/jit/frontend/sugared_value.h @@ -744,10 +744,7 @@ struct SimpleSelf : public Self { // This is not a SimpleValue so it can not pass through the code paths that // expect a SimpleValue as a sugared value. 
struct TORCH_API ExceptionMessageValue : public SugaredValue { - explicit ExceptionMessageValue( - Value* value, - Value* qualified_class_name = nullptr) - : value_(value), qualified_class_name_(qualified_class_name) {} + explicit ExceptionMessageValue(Value* value) : value_(value) {} std::string kind() const override { return "exception message"; @@ -757,14 +754,7 @@ struct TORCH_API ExceptionMessageValue : public SugaredValue { return value_; } - // qualified python class name - Value* getQualifiedClassName() { - return qualified_class_name_; - } - - private: Value* value_; - Value* qualified_class_name_; }; struct TORCH_API ExceptionValue : public SugaredValue { diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index f014150d8a2..87ab27a5552 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -914,11 +914,8 @@ std::shared_ptr PythonExceptionValue::call( ->insertNode(caller.graph()->createTuple(message_values)) ->output(); } - Value* qualified_class_name = - insertConstant(*caller.graph(), exception_class_qualified_name_, loc); - return std::make_shared( - error_message, qualified_class_name); + return std::make_shared(error_message); } bool isNamedTupleClass(const py::object& obj) { diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index 5fef124cf2b..d3559abda5c 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -328,12 +328,7 @@ struct VISIBILITY_HIDDEN PythonClassValue : public ClassValue { struct VISIBILITY_HIDDEN PythonExceptionValue : public ExceptionValue { explicit PythonExceptionValue(const py::object& exception_class) : ExceptionValue( - py::str(py::getattr(exception_class, "__name__", py::str("")))), - exception_class_qualified_name_( - py::str(py::module::import("torch._jit_internal") - .attr("_qualified_name")( - exception_class, - /*mangle_name=*/false))) {} + py::str(py::getattr(exception_class, "__name__", py::str("")))) {} std::string kind() const override { return "Python exception"; @@ -345,9 +340,6 @@ struct VISIBILITY_HIDDEN PythonExceptionValue : public ExceptionValue { at::ArrayRef args, at::ArrayRef kwargs, size_t n_binders) override; - - private: - std::string exception_class_qualified_name_; }; // Python Slice class. From a7cac05ca630da315164fbf298212e020664813f Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Tue, 15 Feb 2022 10:54:32 -0800 Subject: [PATCH 056/199] Add new tls snapshot feature (#72832) Summary: Reland of https://github.com/pytorch/pytorch/pull/72623 that was reverted for the tls cleanup was removed. From close inspection on the counting of the number of available keys, I think there is one more since the guard is actually one after the last usable key. With this update assert, the last updated key will still be <=63 which will fit just fine. 
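A rough sketch of that counting argument (illustrative only; the real check is the `static_assert` on `NumDispatchKeys` updated later in this patch, and the concrete count used here is hypothetical):

    # NumDispatchKeys is a sentinel one past the last usable key, so even when the
    # sentinel reaches 64 every usable key still lands on a valid bit position
    # (0..63) of a 64-bit mask.
    NUM_DISPATCH_KEYS = 64                      # hypothetical sentinel value
    usable_keys = range(NUM_DISPATCH_KEYS)      # bit indices 0 .. 63
    masks = [1 << k for k in usable_keys]
    assert max(usable_keys) <= 63
    assert all(m < 2 ** 64 for m in masks)      # all fit in the 64-bit bitmask
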
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72832 Reviewed By: H-Huang Differential Revision: D34228571 Pulled By: albanD fbshipit-source-id: ce5e10a841ea87386727346cfc8d9327252574c4 (cherry picked from commit 59d3b863534a37ac3463e2814bc9599c322669ee) --- aten/src/ATen/core/PythonFallbackKernel.cpp | 24 +++++++++++++++++++++ aten/src/ATen/core/PythonModeTLS.cpp | 2 ++ aten/src/ATen/core/TensorBase.h | 1 - c10/core/DispatchKey.cpp | 3 +++ c10/core/DispatchKey.h | 7 +++++- c10/core/TensorImpl.cpp | 19 ++++++++-------- c10/core/TensorImpl.h | 4 ++-- c10/core/impl/LocalDispatchKeySet.h | 14 ++++++++++++ test/test_python_dispatch.py | 15 +++++-------- torch/csrc/autograd/init.cpp | 5 +++-- torch/csrc/utils/tensor_new.cpp | 1 + torch/testing/_internal/logging_tensor.py | 1 - 12 files changed, 70 insertions(+), 26 deletions(-) diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 6b51aa53156..b5861253c1e 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -4,7 +4,14 @@ namespace { +// TLS saving the state of the include/exclude sets on entry to the dispatcher +// This is set in the pythonTLSSnapshot fallback and used by the Python fallback. +thread_local c10::optional tls_on_entry; + void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { + TORCH_INTERNAL_ASSERT(tls_on_entry.has_value()); + c10::impl::ForceDispatchKeyGuard guard(tls_on_entry.value()); + // If Python Mode is active, use its PyInterpreter for dispatch const auto& maybe_python_mode_state = at::impl::PythonModeTLS::get_state(); if (maybe_python_mode_state) { @@ -42,8 +49,25 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) { TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)"); } +void pythonTLSSnapshotFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { + // It is ok for the tls to be already set here. 
+ // A CompositeImplicitAutograd function may have been called just before this and so the tls here were never cleared + // This is also why we don't need an RAII to ensure the tls is reset when exceptions happen + + tls_on_entry = c10::impl::tls_local_dispatch_key_set(); + + op.redispatchBoxed(dispatch_keys & c10::DispatchKeySet(c10::DispatchKeySet::FULL_AFTER, c10::DispatchKey::PythonTLSSnapshot), stack); + + tls_on_entry = c10::nullopt; +} + + } // anonymous namespace TORCH_LIBRARY_IMPL(_, Python, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>()); } + +TORCH_LIBRARY_IMPL(_, PythonTLSSnapshot, m) { + m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonTLSSnapshotFallback>()); +} diff --git a/aten/src/ATen/core/PythonModeTLS.cpp b/aten/src/ATen/core/PythonModeTLS.cpp index dd4b44bc5fe..97892fcf5d3 100644 --- a/aten/src/ATen/core/PythonModeTLS.cpp +++ b/aten/src/ATen/core/PythonModeTLS.cpp @@ -8,6 +8,7 @@ void PythonModeTLS::set_state(const std::shared_ptr& st pythonModeState = state; if (state) { c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, true); + c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, true); } else { PythonModeTLS::reset_state(); } @@ -20,6 +21,7 @@ const std::shared_ptr& PythonModeTLS::get_state() { void PythonModeTLS::reset_state() { pythonModeState.reset((TorchDispatchTypeObject*)nullptr); c10::impl::tls_set_dispatch_key_included(DispatchKey::Python, false); + c10::impl::tls_set_dispatch_key_included(DispatchKey::PythonTLSSnapshot, false); } } // namespace impl diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index 45c8325ecb9..097976f1b39 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -43,7 +43,6 @@ inline bool variable_excluded_from_dispatch() { // Please read the comment in `VariableFallbackKernel.cpp` about the background of this change. return true; #else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!c10::impl::tls_local_dispatch_key_set().excluded_.has(DispatchKey::Autograd)); return c10::impl::tls_local_dispatch_key_set().excluded_.isSupersetOf(c10::autograd_dispatch_keyset); #endif } diff --git a/c10/core/DispatchKey.cpp b/c10/core/DispatchKey.cpp index 7d2f9e7fcb6..6dbcaf88d5d 100644 --- a/c10/core/DispatchKey.cpp +++ b/c10/core/DispatchKey.cpp @@ -67,6 +67,8 @@ const char* toString(DispatchKey t) { case DispatchKey::Python: return "Python"; + case DispatchKey::PythonTLSSnapshot: + return "PythonTLSSnapshot"; case DispatchKey::PrivateUse1: return "PrivateUse1"; @@ -248,6 +250,7 @@ c10::DispatchKey parseDispatchKey(const std::string& k) { {"PrivateUse3", c10::DispatchKey::PrivateUse3}, {"BackendSelect", c10::DispatchKey::BackendSelect}, {"Python", c10::DispatchKey::Python}, + {"PythonTLSSnapshot", c10::DispatchKey::PythonTLSSnapshot}, {"Named", c10::DispatchKey::Named}, {"Conjugate", c10::DispatchKey::Conjugate}, {"Negative", c10::DispatchKey::Negative}, diff --git a/c10/core/DispatchKey.h b/c10/core/DispatchKey.h index 1bb8268e2bd..29315051b41 100644 --- a/c10/core/DispatchKey.h +++ b/c10/core/DispatchKey.h @@ -282,6 +282,11 @@ enum class DispatchKey : uint8_t { Functionalize, FuncTorchDynamicLayerFrontMode, // See Note [Out-of-tree vmap+grad prototype] + // Used by Python key logic to know the set of tls on entry to the dispatcher + // This kernel assumes it is at the very top of the dispatcher. If you add + // a key above, make sure to update the fallback implementation for this. 
+ PythonTLSSnapshot, + // TESTING: This is intended to be a generic testing tensor type id. // Don't use it for anything real; its only acceptable use is within a single // process test. Use it by creating a TensorImpl with this DispatchKey, and @@ -360,7 +365,7 @@ enum class DispatchKey : uint8_t { // built-in autograd formulas for operators are not appropriate. static_assert( - static_cast(DispatchKey::NumDispatchKeys) < 64, + static_cast(DispatchKey::NumDispatchKeys) <= 64, "DispatchKey is used as index into 64-bit bitmask; you must have less than 64 entries"); #if defined(C10_MOBILE_TRIM_DISPATCH_KEYS) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index b83ee395045..e3bbed52aa5 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -120,11 +120,11 @@ TensorImpl::TensorImpl( // [Note: Python key removal] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// In most constructors for TensorImpl, you will see Python key is removed from -// the passed in DispatchKeySet. Why? +// In most constructors for TensorImpl, you will see Python and PythonTLSSnapshot +// keys are removed from the passed in DispatchKeySet. Why? // -// INVARIANT: Python dispatch key is set iff PyObject for the Tensor has a -// nontrivial __torch_dispatch__ implementation. +// INVARIANT: Python and PythonTLSSnapshot dispatch keys are set iff PyObject for +// the Tensor has a nontrivial __torch_dispatch__ implementation. // // When a fresh TensorImpl is created, there is *no* PyObject (this only gets // initialized lazily at the first point in time the Tensor passes into Python). @@ -132,8 +132,8 @@ TensorImpl::TensorImpl( // // In practice, what will happen shortly afterwards is that the TensorImpl // will get its PyObject initialized by Tensor._make_subclass; at this point -// the Python dispatch key will be set and all is well. The point is to delay -// the dispatch key setting until that point. +// the Python and PythonTLSSnapshot dispatch keys will be set and all is well. +// The point is to delay the dispatch key setting until that point. // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) TensorImpl::TensorImpl( @@ -149,7 +149,8 @@ TensorImpl::TensorImpl( data_type_(data_type), device_opt_(storage_.device()), key_set_(key_set.remove( - DispatchKey::Python)) { // See [Note: Python key removal] + DispatchKey::Python).remove( + DispatchKey::PythonTLSSnapshot)) { // See [Note: Python key removal] init_bitfields(); // Inference tensor doesn't have version counter. if (!is_inference()) { @@ -195,7 +196,7 @@ TensorImpl::TensorImpl( key_set = key_set | getAutocastRelatedKeySetFromBackend(k); key_set = - key_set.remove(DispatchKey::Python); // See [Note: Python key removal] + key_set.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); // See [Note: Python key removal] // Inference tensor doesn't have autograd related keys. 
if (inference_mode) { @@ -553,7 +554,7 @@ void TensorImpl::copy_tensor_metadata_except_version_counter( dest_impl->storage_offset_ = src_impl->storage_offset_; dest_impl->data_type_ = src_impl->data_type_; dest_impl->device_opt_ = src_impl->device_opt_; - dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python); + dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); dest_impl->is_contiguous_ = src_impl->is_contiguous_; dest_impl->has_contiguity_ = src_impl->has_contiguity_; dest_impl->is_channels_last_contiguous_ = diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 86aca278c9d..8ec099c2eab 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -1476,9 +1476,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_python_dispatch(bool k) { if (k) { - key_set_ = key_set_.add(DispatchKey::Python); + key_set_ = key_set_.add(DispatchKey::Python).add(DispatchKey::PythonTLSSnapshot); } else { - key_set_ = key_set_.remove(DispatchKey::Python); + key_set_ = key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); } } diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 050363fc7c1..5ee622d433a 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -117,6 +117,20 @@ class C10_API ExcludeDispatchKeyGuard { DispatchKeySet exclude_; }; +struct C10_API ForceDispatchKeyGuard { + public: + ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) : + saved_keyset_(c10::impl::tls_local_dispatch_key_set()) { + c10::impl::_force_tls_local_dispatch_key_set(key_set); + } + ~ForceDispatchKeyGuard() { + c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_); + } + + private: + c10::impl::LocalDispatchKeySet saved_keyset_; +}; + // Non-RAII API for manipulating the thread-local dispatch state. // Please prefer the RAII API. 
The non-RAII API may be useful when // the included/excluded state of a given DispatchKey must span diff --git a/test/test_python_dispatch.py b/test/test_python_dispatch.py index d127bacc616..a3e7e545799 100644 --- a/test/test_python_dispatch.py +++ b/test/test_python_dispatch.py @@ -551,21 +551,16 @@ $6 = torch._ops.aten.add_($1, $5)''') self.assertFalse(out.requires_grad) self.assertIsNone(out.grad_fn) - # TODO: this should be True - self.assertFalse(out.elem.requires_grad) - # TODO: this should be not None - self.assertIsNone(out.elem.grad_fn) + self.assertTrue(out.elem.requires_grad) + self.assertIsNotNone(out.elem.grad_fn) with self.assertRaisesRegex(RuntimeError, "does not require grad"): - out.backward() + out.sum().backward() - # TODO: this should not raise - with self.assertRaisesRegex(RuntimeError, "does not require grad"): - out.elem.backward() + out.elem.sum().backward() self.assertIsNone(t.grad) - # TODO: this should not be None - self.assertIsNone(t.elem.grad) + self.assertIsNotNone(t.elem.grad) if __name__ == '__main__': diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 890b7f715ea..3e352294df1 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -27,9 +27,10 @@ #include struct DisableTorchDispatch { - DisableTorchDispatch() : guard_(c10::DispatchKey::Python) { - } + DisableTorchDispatch() : guard_(c10::DispatchKey::Python), + guard_tls_snapshot_(c10::DispatchKey::PythonTLSSnapshot) {} c10::impl::ExcludeDispatchKeyGuard guard_; + c10::impl::ExcludeDispatchKeyGuard guard_tls_snapshot_; }; PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index a340c661606..4b85cd81fdf 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -279,6 +279,7 @@ Tensor internal_new_from_data( at::AutoDispatchBelowADInplaceOrView guard; // TODO: remove at::tracer::impl::NoTracerDispatchMode tracer_guard; c10::impl::ExcludeDispatchKeyGuard pythonmode_guard(c10::DispatchKey::Python); + c10::impl::ExcludeDispatchKeyGuard pythonmode_snapshot_guard(c10::DispatchKey::PythonTLSSnapshot); // functorch uses FuncTorchDynamicLayerBackMode as a mode key to wrap all // tensors returned from operators in special TensorWrapper tensor extension // The problem with this is that TensorWrapper does not have storage so diff --git a/torch/testing/_internal/logging_tensor.py b/torch/testing/_internal/logging_tensor.py index a368d453651..d553d781735 100644 --- a/torch/testing/_internal/logging_tensor.py +++ b/torch/testing/_internal/logging_tensor.py @@ -27,7 +27,6 @@ def no_dispatch() -> Iterator[None]: # can require gradients if the user asks for it as a constructor kwarg. # - The wrapped Tensor can require gradients. In that case autograd will be tracked # for the wrapped Tensor and the LoggingTensor itself cannot require gradients. -# Note that this second one is not possible today as dispatcher exclude keys are not properly reset # WARNING: We allow these two possibilities for testing purposes. You should NEVER use both in a single # test or you might get surprising behavior. From ca0ac3a74b83a6e54464b0f0d22a3fc9179b2bbd Mon Sep 17 00:00:00 2001 From: Xiaohan Wei Date: Tue, 15 Feb 2022 11:08:17 -0800 Subject: [PATCH 057/199] [caffe2] allow dropout to take 1.0 as dropout ratio to zero-out a layer (#72741) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72741 as titled. 
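In effect the change special-cases the scale factor so that a ratio of exactly 1.0 zeroes the layer instead of dividing by zero; a minimal sketch of that formula (illustrative only, `dropout_scale` is a made-up helper mirroring the kernel expression changed below):

    def dropout_scale(ratio: float) -> float:
        # 1 / (1 - ratio) blows up at ratio == 1.0, so that case is clamped to 0.0,
        # which zeroes every output element of the dropped-out layer.
        return 0.0 if ratio >= 1.0 else 1.0 / (1.0 - ratio)

    assert dropout_scale(0.5) == 2.0
    assert dropout_scale(1.0) == 0.0
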
Context: This is useful in fast mitigating feature induced overfitting in the sense that we can do omni-transfer on a trained model and apply dropout with ratio = 1 on features resulting in overfitting. Directly removing the features would not be feasible on omni-transfer scenarios since the downstream FC sizes would change. Experimental records: https://fb.quip.com/npIkAgRc8jl9#temp:C:DWC050ceaba14424d23a78462c01 Doing dropout = 1 on selected features improves the eval NE over the next few hours (compared to v0 baseline) as is shown in the figures. Test Plan: ``` buck test caffe2/caffe2/python/operator_test:dropout_op_test ``` Reviewed By: ustctf Differential Revision: D34178732 fbshipit-source-id: 533feebe21bc582eefd756de397d5c7807c7438d (cherry picked from commit 5dabf9c484c0bc5410e3700e3010cdabb4bf903c) --- caffe2/operators/dropout_op.cc | 5 ++- caffe2/operators/dropout_op.h | 2 -- .../python/operator_test/dropout_op_test.py | 32 +++++++++++++++++++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/caffe2/operators/dropout_op.cc b/caffe2/operators/dropout_op.cc index 6f37407bd40..bbd1eb1c72c 100644 --- a/caffe2/operators/dropout_op.cc +++ b/caffe2/operators/dropout_op.cc @@ -15,13 +15,12 @@ bool DropoutOp::RunOnDevice() { return true; } else { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - float scale = 1. / (1. - ratio_); + float scale = ratio_ >= 1.0 ? 0.0:1. / (1. - ratio_); // mask=true means keep, and mask=false means not keep, so we will // generate probability depending on 1-ratio. at::bernoulli_distribution dist(1. - ratio_); const float* Xdata = X.data(); float* Ydata = Y->template mutable_data(); - auto mask = Output(1, X.sizes(), at::dtype()); bool* mask_data = mask->template mutable_data(); auto* gen = context_.RandGenerator(); @@ -52,7 +51,7 @@ bool DropoutGradientOp::RunOnDevice() { const bool* mask_data = mask.data(); float* dXdata = dX->template mutable_data(); // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) - float scale = 1. / (1. - ratio_); + float scale = ratio_ >= 1.0 ? 0.0:1. / (1. 
- ratio_); for (int i = 0; i < dY.numel(); ++i) { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) dXdata[i] = dYdata[i] * mask_data[i] * scale; diff --git a/caffe2/operators/dropout_op.h b/caffe2/operators/dropout_op.h index aff0528c7ff..ae8f0ff1bba 100644 --- a/caffe2/operators/dropout_op.h +++ b/caffe2/operators/dropout_op.h @@ -19,7 +19,6 @@ class DropoutOp final : public Operator { is_test_( this->template GetSingleArgument(OpSchema::Arg_IsTest, 0)) { CAFFE_ENFORCE_GE(ratio_, 0); - CAFFE_ENFORCE_LT(ratio_, 1); } bool RunOnDevice() override; @@ -41,7 +40,6 @@ class DropoutGradientOp final : public Operator { is_test_( this->template GetSingleArgument(OpSchema::Arg_IsTest, 0)) { CAFFE_ENFORCE_GE(ratio_, 0); - CAFFE_ENFORCE_LT(ratio_, 1); } bool RunOnDevice() override; diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py index d3a5c831d87..ad2b6209cf4 100644 --- a/caffe2/python/operator_test/dropout_op_test.py +++ b/caffe2/python/operator_test/dropout_op_test.py @@ -74,3 +74,35 @@ class TestDropout(serial.SerializedTestCase): gc, op, [X], reference_dropout_ratio0, # Don't check the mask with cuDNN because it's packed data outputs_to_check=None if engine != 'CUDNN' else [0]) + + + @given(X=hu.tensor(), + in_place=st.booleans(), + output_mask=st.booleans(), + engine=st.sampled_from(["", "CUDNN"]), + **hu.gcs) + @settings(deadline=10000) + def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc): + """Test with ratio=0 for a deterministic reference impl.""" + if in_place: + # Skip if trying in-place on GPU + assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP}) + # If in-place on CPU, don't compare with GPU + dc = dc[:1] + is_test = not output_mask + op = core.CreateOperator("Dropout", ["X"], + ["X" if in_place else "Y"] + + (["mask"] if output_mask else []), + ratio=1.0, engine=engine, + is_test=is_test) + + self.assertDeviceChecks(dc, op, [X], [0]) + if not is_test: + self.assertGradientChecks(gc, op, [X], 0, [0]) + + def reference_dropout_ratio1(x): + return (x,) if is_test else (np.zeros(x.shape, dtype=np.float), np.zeros(x.shape, dtype=np.bool)) + self.assertReferenceChecks( + gc, op, [X], reference_dropout_ratio1, + # Don't check the mask with cuDNN because it's packed data + outputs_to_check=None if engine != 'CUDNN' else [0]) From da07d1cda2ea40ea72bd786e03ce068ea1354bf0 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Tue, 15 Feb 2022 19:33:59 +0000 Subject: [PATCH 058/199] Add GH1 merge rule to merge documentation changes @Lezcano Pull Request resolved: https://github.com/pytorch/pytorch/pull/72872 --- .github/merge_rules.json | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/merge_rules.json b/.github/merge_rules.json index 6b0e452683f..fdac34d185a 100644 --- a/.github/merge_rules.json +++ b/.github/merge_rules.json @@ -14,7 +14,13 @@ { "name": "OSS CI", "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**"], - "approved_by": ["seemethere", "malfet", "suo"], + "approved_by": ["seemethere", "malfet", "suo", "janeyx99"], "mandatory_app_id": 12274 + }, + { + "name": "Documentation", + "patterns": ["docs/**", "torch/*docs.py"], + "approved_by": ["mruberry", "ngimel", "albanD", "janeyx99"], + "mandatory_app_id": 12274 } ] From 313557a6133a4b3750ce17cd52045918b6a66e5d Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Tue, 15 Feb 2022 11:36:21 -0800 Subject: [PATCH 059/199] Add missing import (#72840) Summary: Pull 
Request resolved: https://github.com/pytorch/pytorch/pull/72840 Reviewed By: H-Huang Differential Revision: D34242612 Pulled By: albanD fbshipit-source-id: 3dd34de96dbf1ae8f3c3ea45888d211d95862c49 (cherry picked from commit d2650ffa75dbba315daeb0e7cdf0fcb56f3584e1) --- torch/optim/swa_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/optim/swa_utils.py b/torch/optim/swa_utils.py index 479e532448a..7e66d2c4a91 100644 --- a/torch/optim/swa_utils.py +++ b/torch/optim/swa_utils.py @@ -1,6 +1,7 @@ import itertools import math from copy import deepcopy +import warnings import torch from torch.nn import Module From 12a1df27c73f05b8bfba72d68ac8f75f4e81dc35 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Tue, 15 Feb 2022 11:37:35 -0800 Subject: [PATCH 060/199] `scatter_reduce` documentation (#68580) Summary: Fixes https://github.com/pytorch/pytorch/issues/63780 (part 2) Pull Request resolved: https://github.com/pytorch/pytorch/pull/68580 Reviewed By: atalman Differential Revision: D33800694 Pulled By: malfet fbshipit-source-id: 2e09492a29cef115a7cca7c8209d1dcb6ae24eb9 (cherry picked from commit 696ff7594059b8b61f93475da7af7b197829061f) --- docs/source/tensors.rst | 1 + docs/source/torch.rst | 1 + torch/_tensor_docs.py | 6 +++++ torch/_torch_docs.py | 53 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+) diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 090824e0ee3..8f04298887b 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -593,6 +593,7 @@ Tensor class reference Tensor.scatter_ Tensor.scatter_add_ Tensor.scatter_add + Tensor.scatter_reduce Tensor.select Tensor.select_scatter Tensor.set_ diff --git a/docs/source/torch.rst b/docs/source/torch.rst index d3ae7a7151e..e09675af82a 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -118,6 +118,7 @@ Indexing, Slicing, Joining, Mutating Ops select_scatter slice_scatter scatter_add + scatter_reduce split squeeze stack diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 66ffffec87b..7ff5da2c2f4 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -3374,6 +3374,12 @@ Example:: """.format(**reproducibility_notes)) +add_docstr_all('scatter_reduce', r""" +scatter_reduce(input, dim, index, reduce, *, output_size=None) -> Tensor + +See :func:`torch.scatter_reduce` +""") + add_docstr_all('select', r""" select(dim, index) -> Tensor diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index db65dd8cd98..4ba8d92b583 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -8547,6 +8547,59 @@ scatter_add(input, dim, index, src) -> Tensor Out-of-place version of :meth:`torch.Tensor.scatter_add_` """) +add_docstr(torch.scatter_reduce, r""" +scatter_reduce(input, dim, index, reduce, *, output_size=None) -> Tensor + +Reduces all values from the :attr:`input` tensor to the indices specified in +the :attr:`index` tensor. For each value in :attr:`input`, its output index is +specified by its index in :attr:`input` for ``dimension != dim`` and by the +corresponding value in :attr:`index` for ``dimension = dim``. +The applied reduction for non-unique indices is defined via the :attr:`reduce` +argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`). +For non-existing indices, the output will be filled with the identity of the +applied reduction (1 for :obj:`"prod"` and 0 otherwise). + +It is also required that ``index.size(d) == input.size(d)`` for all dimensions ``d``. 
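For the 1-D case this reduces to ``out[index[i]] += input[i]``; a plain-Python sketch of the ``"sum"`` reduction (illustrative only, independent of torch, variable names are made up):

    inp = [1, 2, 3, 4, 5, 6]
    index = [0, 1, 0, 1, 2, 1]
    out_size = max(index) + 1          # what gets inferred when output_size is None
    out = [0] * out_size               # 0 is the identity of the "sum" reduction
    for i, idx in enumerate(index):
        out[idx] += inp[i]
    print(out)                         # [4, 12, 5], matching the Example below
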
+Moreover, if :attr:`output_size` is defined the the values of :attr:`index` must be +between ``0`` and ``output_size - 1`` inclusive. + + +For a 3-D tensor with :obj:`reduce="sum"`, the output is given as:: + + out[index[i][j][k]][j][k] += input[i][j][k] # if dim == 0 + out[i][index[i][j][k]][k] += input[i][j][k] # if dim == 1 + out[i][j][index[i][j][k]] += input[i][j][k] # if dim == 2 + +Note: + This out-of-place operation is similar to the in-place versions of + :meth:`~torch.Tensor.scatter_` and :meth:`~torch.Tensor.scatter_add_`, + in which the output tensor is automatically created according to the + maximum values in :attr:`index` and filled based on the identity of the + applied reduction. + +Note: + {forward_reproducibility_note} + +Args: + input (Tensor): the input tensor + dim (int): the axis along which to index + index (LongTensor): the indices of elements to scatter and reduce. + src (Tensor): the source elements to scatter and reduce + reduce (str): the reduction operation to apply for non-unique indices + (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) + output_size (int, optional): the size of the output at dimension :attr:`dim`. + If set to :obj:`None`, will get automatically inferred according to + :obj:`index.max() + 1` + +Example:: + + >>> input = torch.tensor([1, 2, 3, 4, 5, 6]) + >>> index = torch.tensor([0, 1, 0, 1, 2, 1]) + >>> torch.scatter_reduce(input, 0, index, reduce="sum", output_size=3) + tensor([4, 12, 5]) + +""".format(**reproducibility_notes)) + add_docstr(torch.select, r""" select(input, dim, index) -> Tensor From cb00d9601c92ad501d91cd137ec446905b233e3d Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 15 Feb 2022 12:05:30 -0800 Subject: [PATCH 061/199] Revert D33800694: [pytorch][PR] `scatter_reduce` documentation Test Plan: revert-hammer Differential Revision: D33800694 (https://github.com/pytorch/pytorch/commit/12a1df27c73f05b8bfba72d68ac8f75f4e81dc35) Original commit changeset: 2e09492a29ce Original Phabricator Diff: D33800694 (https://github.com/pytorch/pytorch/commit/12a1df27c73f05b8bfba72d68ac8f75f4e81dc35) fbshipit-source-id: 2a4775c0042551607fe3ab77f5bfe9f2e4b6b78e (cherry picked from commit 4bd6c0d2bbc8180d44db2266cdad6d7b030a6dbf) --- docs/source/tensors.rst | 1 - docs/source/torch.rst | 1 - torch/_tensor_docs.py | 6 ----- torch/_torch_docs.py | 53 ----------------------------------------- 4 files changed, 61 deletions(-) diff --git a/docs/source/tensors.rst b/docs/source/tensors.rst index 8f04298887b..090824e0ee3 100644 --- a/docs/source/tensors.rst +++ b/docs/source/tensors.rst @@ -593,7 +593,6 @@ Tensor class reference Tensor.scatter_ Tensor.scatter_add_ Tensor.scatter_add - Tensor.scatter_reduce Tensor.select Tensor.select_scatter Tensor.set_ diff --git a/docs/source/torch.rst b/docs/source/torch.rst index e09675af82a..d3ae7a7151e 100644 --- a/docs/source/torch.rst +++ b/docs/source/torch.rst @@ -118,7 +118,6 @@ Indexing, Slicing, Joining, Mutating Ops select_scatter slice_scatter scatter_add - scatter_reduce split squeeze stack diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py index 7ff5da2c2f4..66ffffec87b 100644 --- a/torch/_tensor_docs.py +++ b/torch/_tensor_docs.py @@ -3374,12 +3374,6 @@ Example:: """.format(**reproducibility_notes)) -add_docstr_all('scatter_reduce', r""" -scatter_reduce(input, dim, index, reduce, *, output_size=None) -> Tensor - -See :func:`torch.scatter_reduce` -""") - add_docstr_all('select', r""" select(dim, index) -> Tensor diff --git a/torch/_torch_docs.py 
b/torch/_torch_docs.py index 4ba8d92b583..db65dd8cd98 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -8547,59 +8547,6 @@ scatter_add(input, dim, index, src) -> Tensor Out-of-place version of :meth:`torch.Tensor.scatter_add_` """) -add_docstr(torch.scatter_reduce, r""" -scatter_reduce(input, dim, index, reduce, *, output_size=None) -> Tensor - -Reduces all values from the :attr:`input` tensor to the indices specified in -the :attr:`index` tensor. For each value in :attr:`input`, its output index is -specified by its index in :attr:`input` for ``dimension != dim`` and by the -corresponding value in :attr:`index` for ``dimension = dim``. -The applied reduction for non-unique indices is defined via the :attr:`reduce` -argument (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`). -For non-existing indices, the output will be filled with the identity of the -applied reduction (1 for :obj:`"prod"` and 0 otherwise). - -It is also required that ``index.size(d) == input.size(d)`` for all dimensions ``d``. -Moreover, if :attr:`output_size` is defined the the values of :attr:`index` must be -between ``0`` and ``output_size - 1`` inclusive. - - -For a 3-D tensor with :obj:`reduce="sum"`, the output is given as:: - - out[index[i][j][k]][j][k] += input[i][j][k] # if dim == 0 - out[i][index[i][j][k]][k] += input[i][j][k] # if dim == 1 - out[i][j][index[i][j][k]] += input[i][j][k] # if dim == 2 - -Note: - This out-of-place operation is similar to the in-place versions of - :meth:`~torch.Tensor.scatter_` and :meth:`~torch.Tensor.scatter_add_`, - in which the output tensor is automatically created according to the - maximum values in :attr:`index` and filled based on the identity of the - applied reduction. - -Note: - {forward_reproducibility_note} - -Args: - input (Tensor): the input tensor - dim (int): the axis along which to index - index (LongTensor): the indices of elements to scatter and reduce. - src (Tensor): the source elements to scatter and reduce - reduce (str): the reduction operation to apply for non-unique indices - (:obj:`"sum"`, :obj:`"prod"`, :obj:`"mean"`, :obj:`"amax"`, :obj:`"amin"`) - output_size (int, optional): the size of the output at dimension :attr:`dim`. 
- If set to :obj:`None`, will get automatically inferred according to - :obj:`index.max() + 1` - -Example:: - - >>> input = torch.tensor([1, 2, 3, 4, 5, 6]) - >>> index = torch.tensor([0, 1, 0, 1, 2, 1]) - >>> torch.scatter_reduce(input, 0, index, reduce="sum", output_size=3) - tensor([4, 12, 5]) - -""".format(**reproducibility_notes)) - add_docstr(torch.select, r""" select(input, dim, index) -> Tensor From d2c0c0b63831b549ee5039eacfc954f453845e0f Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Tue, 15 Feb 2022 12:13:48 -0800 Subject: [PATCH 062/199] [SR] Apply all graph passes to sub-blocks (#72598) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72598 Apply all optimizations to sub-blocks by replacing loops over `graph->nodes()` with loops over nodes in `DepthFirstGraphNodeIterator` ghstack-source-id: 149155700 Test Plan: Existing unit tests Reviewed By: d1jang Differential Revision: D34111430 fbshipit-source-id: 015076030368bb67df24ed5892475534b8f8f272 (cherry picked from commit a4314520de2f3d56326434feea63c39c6d1f7cc4) --- .../static_runtime/test_static_module.cc | 9 +++- torch/csrc/jit/runtime/static/passes.cpp | 53 +++++++++++-------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/benchmarks/static_runtime/test_static_module.cc b/benchmarks/static_runtime/test_static_module.cc index 353ce93bb65..58a70ebded8 100644 --- a/benchmarks/static_runtime/test_static_module.cc +++ b/benchmarks/static_runtime/test_static_module.cc @@ -243,6 +243,14 @@ TEST(StaticRuntime, ReplaceWithCopy_replaces_reshape) { c = inp.reshape(shape) return (a, b, c) )JIT"); + ExpectToReplaceWithCopy(R"JIT( + def forward(self, cond: bool, x): + if cond: + y = x.reshape(x.shape) + else: + y = x.clone() + return y.clone() + )JIT"); } TEST( @@ -289,7 +297,6 @@ TEST( return (d) )JIT"); ExpectNotToReplaceWithCopy(reshape_inplace_script); - ExpectNotToReplaceWithCopy(reshape_inplace_script_1); } TEST(StaticRuntime, CanEnableStaticRuntime) { diff --git a/torch/csrc/jit/runtime/static/passes.cpp b/torch/csrc/jit/runtime/static/passes.cpp index 2ca6af3898a..f0638326fa8 100644 --- a/torch/csrc/jit/runtime/static/passes.cpp +++ b/torch/csrc/jit/runtime/static/passes.cpp @@ -514,10 +514,9 @@ void UseVariadicTupleUnpack(const std::shared_ptr& graph) { // v void ReplaceWithMaybeCopy( - std::shared_ptr& graph, + std::shared_ptr& graph, bool outputs_are_immutable) { AliasDb db(graph); - // for ops that have overloads, match the schema static const std::array, 3> supported_schema = {{{torch::schema( @@ -542,7 +541,8 @@ void ReplaceWithMaybeCopy( // old node, new node, select_tensor node std::vector> replacement; - for (auto* n : graph->nodes()) { + DepthFirstGraphNodeIterator graph_it(graph); + for (auto n = graph_it.next(); n != nullptr; n = graph_it.next()) { c10::Symbol new_symbol; if (!match_schema(n, new_symbol)) { continue; @@ -561,7 +561,6 @@ void ReplaceWithMaybeCopy( // Add the did_copy flag to outputs. 
auto* new_node = graph->create(new_symbol, n->outputs().size() + 1); - new_node->insertBefore(n); for (auto* input : n->inputs()) { new_node->addInput(input); } @@ -570,7 +569,6 @@ void ReplaceWithMaybeCopy( static const auto select_tensor_symbol = fromQualString("static_runtime::select_tensor"); auto* select_tensor_node = graph->create(select_tensor_symbol, 1); - select_tensor_node->insertBefore(n); DCHECK_EQ(new_node->outputs().size(), 2); select_tensor_node->addInput(n->input(0)); for (auto* output : new_node->outputs()) { @@ -584,6 +582,8 @@ void ReplaceWithMaybeCopy( auto* const new_node = std::get<1>(tup); auto* const select_tensor_node = std::get<2>(tup); + new_node->insertBefore(old_node); + select_tensor_node->insertBefore(old_node); new_node->outputs()[0]->copyMetadata(old_node->output()); select_tensor_node->output()->copyMetadata(old_node->output()); old_node->replaceAllUsesWith(select_tensor_node); @@ -597,10 +597,9 @@ void ReplaceWithMaybeCopy( } void ReplaceWithCopy( - std::shared_ptr& graph, + std::shared_ptr& graph, bool outputs_are_immutable) { AliasDb db(graph); - const FastMap supported = { #ifdef FBCODE_CAFFE2 OP_PAIR("aten::permute", "static_runtime::permute_copy"), @@ -626,7 +625,8 @@ void ReplaceWithCopy( }; std::vector> replacement; - for (auto* n : graph->nodes()) { + DepthFirstGraphNodeIterator graph_it(graph); + for (auto n = graph_it.next(); n != nullptr; n = graph_it.next()) { c10::Symbol new_symbol; if (supported.count(n->kind()) && opIsRegistered(supported.at(n->kind()))) { new_symbol = supported.at(n->kind()); @@ -663,7 +663,6 @@ void ReplaceWithCopy( continue; } auto* new_node = graph->create(new_symbol, n->outputs().size()); - new_node->insertBefore(n); for (auto* input : n->inputs()) { new_node->addInput(input); } @@ -673,6 +672,7 @@ void ReplaceWithCopy( for (const auto& p : replacement) { auto* old_node = p.first; auto* new_node = p.second; + new_node->insertBefore(old_node); new_node->output()->copyMetadata(old_node->output()); old_node->replaceAllUsesWith(new_node); old_node->destroy(); @@ -687,7 +687,8 @@ void ReplaceWithCopy( void EliminateTrivialEquallySplit(std::shared_ptr& graph) { const auto equally_split = fromQualString("fb::equally_split"); std::vector to_remove; - for (auto* node : graph->nodes()) { + DepthFirstGraphNodeIterator graph_it(graph); + for (auto node = graph_it.next(); node != nullptr; node = graph_it.next()) { if (node->kind() != equally_split) { continue; } @@ -708,7 +709,7 @@ void EliminateTrivialEquallySplit(std::shared_ptr& graph) { } list_unpack_node->output()->replaceAllUsesWith(node->input(0)); - list_unpack_node->destroy(); + to_remove.push_back(list_unpack_node); to_remove.push_back(node); } @@ -746,11 +747,12 @@ void FuseListUnpack(std::shared_ptr& graph) { AliasDb alias_db( graph, /*isFrozen=*/false); + // replacement contains (old_node, new_node, list_unpack_node) const std::vector graph_outputs( graph->outputs().begin(), graph->outputs().end()); - auto nodes = graph->nodes(); - std::vector to_remove; - for (auto* node : nodes) { + std::vector> replacement; + DepthFirstGraphNodeIterator graph_it(graph); + for (auto node = graph_it.next(); node != nullptr; node = graph_it.next()) { auto unfused_to_fused_it = unfused_to_fused.find(node->kind()); if (unfused_to_fused_it == unfused_to_fused.end()) { continue; @@ -799,13 +801,17 @@ void FuseListUnpack(std::shared_ptr& graph) { new_out->copyMetadata(out); out->replaceAllUsesWith(new_out); } - - new_node->insertAfter(node); - list_unpack_node->destroy(); - 
to_remove.push_back(node); + replacement.emplace_back(node, new_node, list_unpack_node); } - for (Node* node : to_remove) { - node->destroy(); + + for (const auto& nodes : replacement) { + auto* old_node = std::get<0>(nodes); + auto* new_node = std::get<1>(nodes); + auto* list_unpack_node = std::get<2>(nodes); + + new_node->insertAfter(old_node); + list_unpack_node->destroy(); + old_node->destroy(); } #ifndef NDEBUG @@ -820,8 +826,9 @@ void EnableStaticRuntimeLayerNorm(std::shared_ptr& graph) { fromQualString("static_runtime::layer_norm"); auto nodes = graph->nodes(); std::vector> replacement; - for (auto it = nodes.begin(); it != nodes.end(); ++it) { - Node* old_node = *it; + DepthFirstGraphNodeIterator graph_it(graph); + for (auto old_node = graph_it.next(); old_node != nullptr; + old_node = graph_it.next()) { if (!old_node->matches(torch::schema( "aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, Tensor? bias=None, float eps=1e-05, bool cudnn_enable=True) -> Tensor"))) { continue; @@ -830,7 +837,6 @@ void EnableStaticRuntimeLayerNorm(std::shared_ptr& graph) { auto* new_node = graph->create( static_runtime_layer_norm_symbol, /*layer_norm*/ 1 + /*mean*/ 1 + /*rst=*/1); - new_node->insertBefore(old_node); for (auto* input : old_node->inputs()) { new_node->addInput(input); } @@ -839,6 +845,7 @@ void EnableStaticRuntimeLayerNorm(std::shared_ptr& graph) { for (const auto& p : replacement) { auto* old_node = p.first; auto* new_node = p.second; + new_node->insertBefore(old_node); new_node->output(0)->copyMetadata(old_node->output(0)); old_node->output(0)->replaceAllUsesWith(new_node->output(0)); old_node->destroy(); From 67cd98fad4d89b8c61aa160fe656ef4fa067bda9 Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Tue, 15 Feb 2022 12:27:28 -0800 Subject: [PATCH 063/199] [tensorexpr] Fix isNLC segfault (#72786) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72786 Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D34204523 Pulled By: IvanKobzarev fbshipit-source-id: 9a0f2ce0a1921e261932029c3ebd842330fdf528 (cherry picked from commit b8326064f61b27695c62c08e5a393364619e1a26) --- test/cpp/tensorexpr/test_quantization.cpp | 32 ++++ .../jit/tensorexpr/operators/quantization.cpp | 158 +++++++----------- 2 files changed, 90 insertions(+), 100 deletions(-) diff --git a/test/cpp/tensorexpr/test_quantization.cpp b/test/cpp/tensorexpr/test_quantization.cpp index f6643c86846..9df2503a608 100644 --- a/test/cpp/tensorexpr/test_quantization.cpp +++ b/test/cpp/tensorexpr/test_quantization.cpp @@ -90,6 +90,38 @@ TEST_F(Quantization, QuantDequantUInt8) { CHECK_EQ(check, 1); } +TEST_F(Quantization, QuantDequantUInt8_NLC) { + const auto graph_string = R"IR( + graph(%x.1 : Float(1, 2, 2, strides=[4, 1, 2], device=cpu)): + %2 : int = prim::Constant[value=13]() + %3 : int = prim::Constant[value=122]() + %4 : float = prim::Constant[value=0.1]() + %q.1 : QUInt8(1, 2, 2) = aten::quantize_per_tensor(%x.1, %4, %3, %2) + %6 : Float(1, 2, 2) = aten::dequantize(%q.1) + return (%6))IR"; + auto graph = std::make_shared(); + parseIR(graph_string, &*graph); + + auto x = 2 * at::rand({1, 2, 2}, TensorOptions(kCPU).dtype(at::kFloat)); + x.unsafeGetTensorImpl()->set_sizes_and_strides({1, 2, 2}, {4, 1, 2}); + auto q = at::quantize_per_tensor(x, 0.1f, 122, at::kQUInt8); + auto y_expected = at::dequantize(q); + TensorExprKernel k(graph); + std::vector inputs = {x}; + StmtPtr s = k.getCodeGenStmt(); + + std::vector stack = fmap(inputs); + k.run(stack); + auto y = 
stack[0].toTensor(); + bool check = at::allclose(y_expected, y); + if (!check) { + std::cout << "x:\n" << x << std::endl; + std::cout << "y_expected:\n" << y_expected << std::endl; + std::cout << "y:\n" << y << std::endl; + } + CHECK_EQ(check, 1); +} + at::Tensor quantized_add( at::Tensor x1, at::Tensor x2, diff --git a/torch/csrc/jit/tensorexpr/operators/quantization.cpp b/torch/csrc/jit/tensorexpr/operators/quantization.cpp index c078478550c..e45445d622b 100644 --- a/torch/csrc/jit/tensorexpr/operators/quantization.cpp +++ b/torch/csrc/jit/tensorexpr/operators/quantization.cpp @@ -39,7 +39,34 @@ bool isQuantized(const BufHandle& qx) { return qx.node()->qscale() && qx.node()->qzero(); } -BufHandle makeQBufHandleNCHW( +BufHandle makeQBufHandleChannelsLast( + const std::string& name, + const std::vector& dims, + Dtype dtype, + const ExprPtr qscale, + const ExprPtr qzero) { + BufHandle ResultBuf(name, dims, dtype); + ResultBuf.node()->set_qscale(qscale); + ResultBuf.node()->set_qzero(qzero); + ResultBuf.node()->set_strides(make_channels_last_strides(dims)); + return ResultBuf; +} + +BufHandle makeQBufHandleChannelsLast( + const std::string& name, + const std::vector& dims, + Dtype dtype, + const double qscale, + const int64_t qzero) { + return makeQBufHandleChannelsLast( + name, + dims, + dtype, + DoubleImm::make(qscale).node(), + LongImm::make(qzero).node()); +} + +BufHandle makeQBufHandleContiguous( const std::string& name, const std::vector& dims, Dtype dtype, @@ -52,26 +79,13 @@ BufHandle makeQBufHandleNCHW( return ResultBuf; } -BufHandle makeQBufHandleNHWC( - const std::string& name, - const std::vector& dims, - Dtype dtype, - const ExprPtr qscale, - const ExprPtr qzero) { - BufHandle ResultBuf(name, dims, dtype); - ResultBuf.node()->set_qscale(qscale); - ResultBuf.node()->set_qzero(qzero); - ResultBuf.node()->set_strides(make_channels_last_strides(dims)); - return ResultBuf; -} - -BufHandle makeQBufHandleNHWC( +BufHandle makeQBufHandleContiguous( const std::string& name, const std::vector& dims, Dtype dtype, const double qscale, const int64_t qzero) { - return makeQBufHandleNHWC( + return makeQBufHandleContiguous( name, dims, dtype, @@ -79,71 +93,19 @@ BufHandle makeQBufHandleNHWC( LongImm::make(qzero).node()); } -BufHandle makeQBufHandleNLC( - const std::string& name, - const std::vector& dims, - Dtype dtype, - const ExprPtr qscale, - const ExprPtr qzero) { - BufHandle ResultBuf(name, dims, dtype); - ResultBuf.node()->set_qscale(qscale); - ResultBuf.node()->set_qzero(qzero); - ResultBuf.node()->set_strides(make_channels_last_strides(dims)); - return ResultBuf; -} - -BufHandle makeQBufHandleNLC( - const std::string& name, - const std::vector& dims, - Dtype dtype, - const double qscale, - const int64_t qzero) { - return makeQBufHandleNLC( - name, - dims, - dtype, - DoubleImm::make(qscale).node(), - LongImm::make(qzero).node()); -} - -BufHandle makeQBufHandleNCHW( - const std::string& name, - const std::vector& dims, - Dtype dtype, - const double qscale, - const int64_t qzero) { - return makeQBufHandleNCHW( - name, - dims, - dtype, - DoubleImm::make(qscale).node(), - LongImm::make(qzero).node()); -} - -bool isNHWC(const BufHandle& buf) { +bool isChannelsLast(const BufHandle& buf) { const auto& strides = buf.node()->strides(); const auto& dims = buf.node()->dims(); - if (strides.size() != 4) { + const auto rank = dims.size(); + if (rank < 3) { return false; } - auto dims1 = to(IRSimplifier::simplify(dims[1]))->value(); - auto strides1 = to(IRSimplifier::simplify(strides[1]))->value(); - 
auto strides3 = to(IRSimplifier::simplify(strides[3]))->value(); + auto dimsC = to(IRSimplifier::simplify(dims[1]))->value(); + auto stridesC = to(IRSimplifier::simplify(strides[1]))->value(); + auto stridesLast = + to(IRSimplifier::simplify(strides[rank - 1]))->value(); - return ((strides3 == dims1) && (strides1 == 1)); -} - -bool isNLC(const BufHandle& buf) { - const auto& strides = buf.node()->strides(); - const auto& dims = buf.node()->dims(); - if (strides.size() != 3) { - return false; - } - auto dims1 = to(IRSimplifier::simplify(dims[1]))->value(); - auto strides1 = to(IRSimplifier::simplify(strides[1]))->value(); - auto strides3 = to(IRSimplifier::simplify(strides[3]))->value(); - - return ((strides3 == dims1) && (strides1 == 1)); + return ((stridesLast == dimsC) && (stridesC == 1)); } ExprHandle quant( @@ -273,15 +235,11 @@ Tensor computeQuantizePerTensorExternalCall( throw malformed_input("Expected quantized dtype"); }(qdtype); auto ResultBuf = [&]() { - if (isNHWC(x)) { - return makeQBufHandleNHWC( + if (isChannelsLast(x)) { + return makeQBufHandleChannelsLast( "quantize_per_tensor", outputShape, dtype, qscale, qzero); } - if (isNLC(x)) { - return makeQBufHandleNLC( - "quantize_per_tensor", outputShape, dtype, qscale, qzero); - } - return makeQBufHandleNCHW( + return makeQBufHandleContiguous( "quantize_per_tensor", outputShape, dtype, qscale, qzero); }(); StmtPtr s = ExternalCall::make( @@ -376,7 +334,7 @@ Tensor computeQuantizedConv1d( const auto out_qzero = c10::get(inputs[3]); // Change to dtype based on outputType when dtype propagation implemented const auto out_qdtype = immQDType(qx); - auto ResultBuf = makeQBufHandleNLC( + auto ResultBuf = makeQBufHandleChannelsLast( "quantized_conv1d", outputShape, Dtype(out_qdtype), @@ -407,7 +365,7 @@ Tensor computeQuantizedConv2d( const auto out_qzero = c10::get(inputs[3]); // Change to dtype based on outputType when dtype propagation implemented const auto out_qdtype = immQDType(qx); - auto ResultBuf = makeQBufHandleNHWC( + auto ResultBuf = makeQBufHandleChannelsLast( "quantized_conv2d", outputShape, Dtype(out_qdtype), @@ -438,7 +396,7 @@ Tensor computeQuantizedConv2dRelu( const auto out_qzero = c10::get(inputs[3]); // Change to dtype based on outputType when dtype propagation implemented const auto out_qdtype = immQDType(qx); - auto ResultBuf = makeQBufHandleNHWC( + auto ResultBuf = makeQBufHandleChannelsLast( "quantized_conv2d_relu", outputShape, Dtype(out_qdtype), @@ -469,7 +427,7 @@ Tensor computeQuantizedLinear( const auto out_qzero = c10::get(inputs[3]); // Change to dtype based on outputType when dtype propagation implemented const auto out_qdtype = immQDType(qx); - auto ResultBuf = makeQBufHandleNCHW( + auto ResultBuf = makeQBufHandleContiguous( "quantized_linear", outputShape, Dtype(out_qdtype), @@ -500,7 +458,7 @@ Tensor computeQuantizedLinearRelu( const auto out_qzero = c10::get(inputs[3]); // Change to dtype based on outputType when dtype propagation implemented const auto out_qdtype = immQDType(qx); - auto ResultBuf = makeQBufHandleNCHW( + auto ResultBuf = makeQBufHandleContiguous( "quantized_linear_relu", outputShape, Dtype(out_qdtype), @@ -531,16 +489,16 @@ Tensor computeQuantizedAddExternalCall( const auto out_qzero = c10::get(inputs[3]); // Change to dtype based on outputType when dtype propagation implemented const auto out_qdtype = immQDType(qa); - const bool isQAChannelsLast = isNHWC(qa); - const bool isQBChannelsLast = isNHWC(qb); + const bool isQAChannelsLast = isChannelsLast(qa); + const bool isQBChannelsLast = 
isChannelsLast(qb);
   auto ResultBuf = (isQAChannelsLast || isQBChannelsLast)
-      ? makeQBufHandleNHWC(
+      ? makeQBufHandleChannelsLast(
             "quantized_add",
             outputShape,
             Dtype(out_qdtype),
             out_qscale,
             out_qzero)
-      : makeQBufHandleNCHW(
+      : makeQBufHandleContiguous(
             "quantized_add",
             outputShape,
             Dtype(out_qdtype),
@@ -574,7 +532,7 @@ Tensor computeQuantizedMul(
   const auto out_qzero = c10::get<int64_t>(inputs[3]);
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qa);
-  auto ResultBuf = makeQBufHandleNCHW(
+  auto ResultBuf = makeQBufHandleContiguous(
       "quantized_mul", outputShape, Dtype(out_qdtype), out_qscale, out_qzero);
   StmtPtr s = ExternalCall::make(
       ResultBuf,
@@ -603,7 +561,7 @@ Tensor computeQuantizedMulScalar(
   // Change to dtype based on outputType when dtype propagation implemented
   const auto out_qdtype = immQDType(qa);
   double scale1 = immQScale(qa);
-  auto ResultBuf = makeQBufHandleNCHW(
+  auto ResultBuf = makeQBufHandleContiguous(
       "quantized_mul_scalar",
       outputShape,
       Dtype(out_qdtype),
@@ -626,14 +584,14 @@ Tensor computeQuantizedRelu(
     at::Device device) {
   const BufHandle& qa = c10::get<BufHandle>(inputs[0]);
   const auto out_qdtype = immQDType(qa);
-  const bool isQAChannelsLast = isNHWC(qa);
-  auto ResultBuf = isQAChannelsLast ? makeQBufHandleNHWC(
+  const bool isQAChannelsLast = isChannelsLast(qa);
+  auto ResultBuf = isQAChannelsLast ? makeQBufHandleChannelsLast(
                                           "quantized_relu",
                                           outputShape,
                                           Dtype(out_qdtype),
                                           immQScale(qa),
                                           immQZero(qa))
-                                    : makeQBufHandleNCHW(
+                                    : makeQBufHandleContiguous(
                                           "quantized_relu",
                                           outputShape,
                                           Dtype(out_qdtype),
@@ -674,7 +632,7 @@ Tensor computeQuantizedCat(
   extra_args.emplace_back(argDim);
   extra_args.emplace_back(out_qscale);
   extra_args.emplace_back(out_qzero);
-  auto ResultBuf = makeQBufHandleNCHW(
+  auto ResultBuf = makeQBufHandleContiguous(
       "quantized_cat",
       outputShape,
       Dtype(immQDType(inputList[0])),
@@ -793,7 +751,7 @@ Tensor computeUpsampleNearest2dExternalCall(
   BufHandle ResultBuf = [&]() {
     if (isQuantized(x)) {
-      return makeQBufHandleNHWC(
+      return makeQBufHandleChannelsLast(
           "upsample_nearest2d",
           outputShape,
           Dtype(immQDType(x)),
@@ -829,7 +787,7 @@ Tensor computeQuantizedSigmoidExternalCall(
   const double out_qscale = 1.0f / 256.0f;
   const int64_t out_qzero = (out_qdtype == ScalarType::QInt8) ? -128 : 0;
-  auto ResultBuf = makeQBufHandleNHWC(
+  auto ResultBuf = makeQBufHandleChannelsLast(
       "quantized_sigmoid",
       outputShape,
       Dtype(out_qdtype),

From 961bbe1c6a75e8117906900c59f2258610b0d0ae Mon Sep 17 00:00:00 2001
From: Nikita Vedeneev
Date: Tue, 15 Feb 2022 13:04:24 -0800
Subject: [PATCH 064/199] `linalg_det_singular`: modify samples such that CUDA IMA disappears. (#72585)

Summary:
Implicitly fixes https://github.com/pytorch/pytorch/issues/72203 and https://github.com/pytorch/pytorch/issues/72204.
The issue comes from an incorrect use of `scatter` with wrong indices, see https://github.com/pytorch/pytorch/issues/72204#issuecomment-1034087199.
I do not know yet what exactly calls `scatter`; investigating...
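For illustration only (not part of this patch): the indexing change in the sample-input helper is easier to see on a single matrix. Below is a minimal sketch, with made-up names and a fixed 4x4 size, of why the write has to go through the diagonal view rather than through `u[..., idxs]`:

```python
import torch

# Make one 4x4 upper-triangular factor singular by clamping its two smallest
# diagonal entries to eps (what the batched helper does with k = n - rank).
u = torch.triu(torch.randn(4, 4, dtype=torch.float64))

diag = u.diagonal(0, -2, -1)                        # a view over u's diagonal
idxs = diag.abs().topk(k=2, largest=False).indices  # positions of the smallest entries

# Old pattern: `u[..., idxs] = eps` assigns into whole *columns* of u instead of
# diagonal entries -- the wrong-indices write referenced above.
# New pattern: index the diagonal view, so the write lands on the diagonal of u.
diag[idxs] = torch.finfo(u.dtype).eps
```

The batched helper above does the same thing, just with leading batch dimensions and `k = (n - rank)`.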
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72585 Reviewed By: cpuhrsch Differential Revision: D34245279 Pulled By: anjali411 fbshipit-source-id: 460f030524f9228f2269eaee0a3a72e1978caeb4 (cherry picked from commit e48295716aa00078024d8b1d196358d6bc926b83) --- .../_internal/common_methods_invocations.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 411137efb6f..01d39e16dda 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -1316,7 +1316,7 @@ def sample_inputs_linalg_det(op_info, device, dtype, requires_grad, **kwargs): return [SampleInput(t) for t in inputs] def sample_inputs_linalg_det_singular(op_info, device, dtype, requires_grad, **kwargs): - make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad) + make_arg = partial(make_tensor, device=device, dtype=dtype) def make_singular_matrix_batch_base(size, rank): assert size[-1] == size[-2] @@ -1332,7 +1332,7 @@ def sample_inputs_linalg_det_singular(op_info, device, dtype, requires_grad, **k u_diag_abs_largest = u_diag_abs.max(dim=-1, keepdim=True).values u_diag_abs_smallest_idxs = torch.topk(u_diag_abs, k=(n - rank), largest=False).indices u.diagonal(0, -2, -1).div_(u_diag_abs_largest) - u[..., u_diag_abs_smallest_idxs] = torch.finfo(dtype).eps + u.diagonal(0, -2, -1)[..., u_diag_abs_smallest_idxs] = torch.finfo(dtype).eps matrix = p @ l @ u matrix.requires_grad_(requires_grad) @@ -9732,7 +9732,7 @@ op_db: List[OpInfo] = [ sample_inputs_func=sample_inputs_comparison_ops), OpInfo('linalg.det', op=torch.linalg.det, - aliases=('det', ), + aliases=('det',), dtypes=floating_and_complex_types(), backward_dtypes=floating_and_complex_types(), aten_name='linalg_det', @@ -9744,7 +9744,7 @@ op_db: List[OpInfo] = [ OpInfo('linalg.det', op=torch.linalg.det, variant_test_name='singular', - aliases=('det', ), + aliases=('det',), dtypes=double_types(), backward_dtypes=double_types(), aten_name='linalg_det', @@ -9757,14 +9757,9 @@ op_db: List[OpInfo] = [ # These tests started breaking after touching the SVD. 
DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad', device_type='cpu', dtypes=(torch.complex128,), active_if=IS_WINDOWS), - # For complex dtypes: Will be removed once https://github.com/pytorch/pytorch/issues/62328 is fixed - # Probable fix (open PR): https://github.com/pytorch/pytorch/pull/62570 - # Illegal Memory Access failure: https://github.com/pytorch/pytorch/issues/72203 - DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad', device_type='cuda'), - # Illegal Memory Access failure: https://github.com/pytorch/pytorch/issues/72204 - DecorateInfo(unittest.skip("Skipped!"), 'TestMathBits', 'test_neg_view', device_type='cuda'), - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes'), DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_gradgrad'), + # dtypes are tested in the suite above, no need to repeat it for singular + DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_dtypes'), )), OpInfo('linalg.cholesky', aten_name='linalg_cholesky', From 3d377fb4a319f43a870a3338391847996ae9b224 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Tue, 15 Feb 2022 13:28:10 -0800 Subject: [PATCH 065/199] [quant][fx][improvement] Add lowering support for BatchNormQuantizeHandler (#72490) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72490 This is an effort to move the current implementation towards the reference quantized model design: https://github.com/pytorch/rfcs/blob/master/RFC-0019-Extending-PyTorch-Quantization-to-Custom-Backends.md so that we use reference model in the default fbgemm/qnnpack path Test Plan: python test/test_quantization.py TestQuantizeFx python test/test_quantization.py TestQuantizeFxOps.test_qbatch_norm Imported from OSS Reviewed By: vkuzo, andrewor14 Differential Revision: D34062365 fbshipit-source-id: ed015c61f5b969554a6477f92cf6be2358cb558c (cherry picked from commit 9498421dddddd984c27f74a1c8c5ca87d6bdc474) --- test/quantization/fx/test_quantize_fx.py | 2 +- .../fx/_lower_to_native_backend.py | 29 ++++++++++++++----- torch/nn/quantized/modules/batchnorm.py | 13 +++++++++ 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 20bf20ea402..8bc2f6501d9 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -4979,7 +4979,7 @@ class TestQuantizeFxOps(QuantizationTestCase): # observers and also successfully fused two quantized::conv2d # patterns # one quantize_per_tensor for input - # check exact counts of quantize and dequantiz + # check exact counts of quantize and dequantize count_check = { # input of conv and two outputs of getitem ns.call_function(torch.quantize_per_tensor) : 2, diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py index 4822ff443b7..081f76ca2f3 100644 --- a/torch/ao/quantization/fx/_lower_to_native_backend.py +++ b/torch/ao/quantization/fx/_lower_to_native_backend.py @@ -41,13 +41,21 @@ def is_fixed_qparams_node(node, modules): return is_call_function, is_call_method, is_call_module # Mapping from reference module class to the replacement quantized module class for lowering -LOWER_MODULE_MAP: Dict[Type[nn.Module], Type[ReferenceableQuantizedModule]] = { +# TODO: fix typing, the key is reference module +LOWER_MODULE_MAP: Dict[Type[torch.nn.Module], Type[ReferenceableQuantizedModule]] = { nnqr.Linear: nnq.Linear, nnqr.Conv1d: nnq.Conv1d, 
nnqr.Conv2d: nnq.Conv2d, nnqr.Conv3d: nnq.Conv3d, } +# TODO: merge with LOWER_MODULE_MAP after we merge +# _lower_weighted_ref_module and special_pattern_replacement +SPECIAL_PATTERN_LOWER_MODULE_MAP = { + nn.BatchNorm2d: nnq.BatchNorm2d, + nn.BatchNorm3d: nnq.BatchNorm3d, +} + # Mapping from fused module class to a 2-tuple of: # 1) The inner reference module class # 2) The replacement quantized module class for lowering @@ -159,12 +167,19 @@ def special_pattern_replacement(model: QuantizedGraphModule) -> QuantizedGraphMo continue # TODO: enable we have patterns that needs to swap the modules - # if is_call_module: - # ref_module = modules[ref_node.target] - # # change this pattern to use the corresponding quantized module - # # replace reference module with quantized module - # parent_name, module_name = _parent_name(ref_node.target) - # setattr(modules[parent_name], module_name, ref_module) + if is_call_module: + ref_module = modules[ref_node.target] + if type(ref_module) in SPECIAL_PATTERN_LOWER_MODULE_MAP and is_quantize: + qmodule_cls = SPECIAL_PATTERN_LOWER_MODULE_MAP.get(type(ref_module)) + scale_node = q_node.args[1] + zero_point_node = q_node.args[2] + output_scale = getattr(model, scale_node.target) + output_zero_point = getattr(model, zero_point_node.target) + + qmodule = qmodule_cls.from_reference(ref_module, output_scale, output_zero_point) # type:ignore[union-attr] + # replace reference module with quantized module + parent_name, module_name = _parent_name(ref_node.target) + setattr(modules[parent_name], module_name, qmodule) # remove dq node: dq_nodes: List[Node] = [] diff --git a/torch/nn/quantized/modules/batchnorm.py b/torch/nn/quantized/modules/batchnorm.py index d6357a50425..f292b89958e 100644 --- a/torch/nn/quantized/modules/batchnorm.py +++ b/torch/nn/quantized/modules/batchnorm.py @@ -25,6 +25,19 @@ class _BatchNorm(torch.nn.modules.batchnorm._BatchNorm): new_mod.zero_point = zero_point return new_mod + @classmethod + def from_reference(cls, bn, output_scale, output_zero_point): + qbn = cls( + bn.num_features, + bn.eps, + bn.momentum, + device=bn.weight.device, + dtype=bn.weight.dtype + ) + qbn.scale = output_scale + qbn.zero_point = output_zero_point + return qbn + class BatchNorm2d(_BatchNorm): r"""This is the quantized version of :class:`~torch.nn.BatchNorm2d`. """ From 93a8bbbcdbe5368eede15dfea4a365c07fee2960 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Tue, 15 Feb 2022 13:28:21 -0800 Subject: [PATCH 066/199] be: Remove unused docker folder (#72884) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72884 These dockerfiles were old and unused so removing them to reduce the amount of confusion we may have over what these actually do. 
Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: malfet, janeyx99 Differential Revision: D34253509 Pulled By: seemethere fbshipit-source-id: dc78b0105ccbbb95b759bc3b905ff31cb5ad71ed (cherry picked from commit a0e591c028016d7dd3ef0f9edd8074d5f32628aa) --- docker/cpu-blis/Dockerfile | 68 --------------- docker/pytorch/ubuntu_cpu_gpu/Dockerfile | 105 ----------------------- 2 files changed, 173 deletions(-) delete mode 100644 docker/cpu-blis/Dockerfile delete mode 100644 docker/pytorch/ubuntu_cpu_gpu/Dockerfile diff --git a/docker/cpu-blis/Dockerfile b/docker/cpu-blis/Dockerfile deleted file mode 100644 index adfce7e3ad7..00000000000 --- a/docker/cpu-blis/Dockerfile +++ /dev/null @@ -1,68 +0,0 @@ -# syntax = docker/dockerfile:experimental -# -# NOTE: To build this you will need a docker version > 18.06 with -# experimental enabled and DOCKER_BUILDKIT=1 -# -# For reference: -# https://docs.docker.com/develop/develop-images/build_enhancements/ -# -# This Dockerfile will build Docker Image with PyTorch + DNNL + AMD BLIS and Torchvision installed for CPU only -# -# Example commandline to build PyTorch with AMD BLIS: -# sudo DOCKER_BUILDKIT=1 docker build . -t docker-image-repo-name -# Example commandline to run the built docker container: -# sudo docker run --name container-name -it docker-image-repo-name - -ARG BASE_IMAGE=ubuntu:18.04 -ARG PYTHON_VERSION=3.8 - -FROM ${BASE_IMAGE} as dev-base -CMD echo "Welcome to the PyTorch Docker Container!" && \ - echo "Version of PyTorch Installed: " && python -c 'import torch; print(torch.__version__)' && \ - echo "Version of Torchvision Installed: " && python -c 'import torchvision; print(torchvision.__version__)' && \ - echo "LDD output showing successful linking with BLIS: " && ldd /opt/conda/lib/python3.8/site-packages/torch/_C.cpython-38-x86_64-linux-gnu.so && \ - /bin/bash -RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ - apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - ccache \ - cmake \ - curl \ - git \ - libjpeg-dev \ - libpng-dev \ - vim \ - wget && \ - rm -rf /var/lib/apt/lists/* -RUN /usr/sbin/update-ccache-symlinks -RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache -ENV PATH /opt/conda/bin:$PATH - -FROM dev-base as conda -RUN wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda-build && \ - /opt/conda/bin/conda install -y nomkl pyyaml numpy ipython ninja setuptools cmake cffi typing future && \ - /opt/conda/bin/conda clean -ya - -RUN conda install typing_extensions - -WORKDIR /root -ARG BLIS_URL=https://github.com/amd/blis.git -# Download, Build BLIS with multithreading support and place necessary library and include files at BLIS_HOME/lib and BLIS_HOME/include respectively -RUN git clone ${BLIS_URL} && cd blis && \ - ./configure --prefix=/root/BLISBuild --enable-cblas --enable-threading=openmp auto && make -j && make install && \ - if [ ! 
-e /root/BLISBuild/lib/libblis.so ] ; then cp /root/BLISBuild/lib/libblis*.so /root/BLISBuild/lib/libblis.so ; fi - -# Build PyTorch with DNNL+BLIS -RUN git clone https://github.com/pytorch/pytorch.git && cd pytorch && \ - git submodule update --init --recursive --jobs 0 && \ - export PATH=/root/BLISBuild/include/blis:$PATH LD_LIBRARY_PATH=/root/BLISBuild/lib:$LD_LIBRARY_PATH && \ - export BLIS_HOME=/root/BLISBuild BLAS=BLIS USE_MKLDNN_CBLAS=ON WITH_BLAS=blis && python setup.py install - -# Build Torchvision -RUN git clone https://github.com/pytorch/vision.git && cd vision && \ - python setup.py install diff --git a/docker/pytorch/ubuntu_cpu_gpu/Dockerfile b/docker/pytorch/ubuntu_cpu_gpu/Dockerfile deleted file mode 100644 index f7a1af09302..00000000000 --- a/docker/pytorch/ubuntu_cpu_gpu/Dockerfile +++ /dev/null @@ -1,105 +0,0 @@ -# This is the Dockerfile for an image that is ready to build PyTorch from source. -# PyTorch is not yet downloaded nor installed. -# -# Available BASE_IMAGE options: -# nvidia/cuda:11.2.1-cudnn8-devel-ubuntu18.04 -# nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 -# nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 -# -# Available MAGMA_CUDA_VERSION options (for GPU/CUDA builds): -# magma-cuda112 -# magma-cuda111 -# magma-cuda102 -# magma-cuda101 -# -# Available TORCH_CUDA_ARCH_LIST_VAR options (for GPU/CUDA builds): -# "3.7+PTX;5.0;6.0;6.1;7.0;7.5;8.0;8.6" for CUDA 11.2/11.1 -# "3.7+PTX;5.0;6.0;6.1;7.0;7.5;8.0" for CUDA 11.0 -# "3.7+PTX;5.0;6.0;6.1;7.0;7.5" for CUDA 10.2/10.1 -# -# Build image with CPU or GPU support with the following command: -# nvidia-docker build -t ${CONTAINER_TAG} -# --build-arg BASE_IMAGE=${BASE_IMAGE_VER} \ -# --build-arg PYTHON_VERSION=${PYTHON_VER} \ -# --build-arg MAGMA_CUDA_VERSION=${MAGMA_CUDA_VER} \ #(for GPU/CUDA builds) -# --build-arg TORCH_CUDA_ARCH_LIST_VAR=${TORCH_CUDA_ARCH_LIST} \ #(for GPU/CUDA builds): -# . -# -# For example, for a CPU Ubuntu 18.04 and Python 3.7.6 build: -# docker build -t ubuntu_1804_py_37_cpu_dev \ -# --build-arg BASE_IMAGE=ubuntu:18.04 \ -# --build-arg PYTHON_VERSION=3.7.6 . -# -# For example, for a CUDA 10.2 Ubuntu 18.04 and Python 3.9.1 build: -# nvidia-docker build -t ubuntu_1804_py_39_cuda_102_cudnn_8_dev \ -# --build-arg BASE_IMAGE=nvidia/cuda:10.2-cudnn8-devel-ubuntu18.04 \ -# --build-arg PYTHON_VERSION=3.9.1 \ -# --build-arg MAGMA_CUDA_VERSION=magma-cuda102 \ -# --build-arg TORCH_CUDA_ARCH_LIST_VAR="3.7+PTX;5.0;6.0;6.1;7.0;7.5" . 
- -ARG BASE_IMAGE -FROM ${BASE_IMAGE} as dev-base -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - ccache \ - cmake \ - curl \ - git \ - git-lfs \ - libjpeg-dev \ - libpng-dev \ - openmpi-bin \ - wget && \ - rm -rf /var/lib/apt/lists/* -RUN /usr/sbin/update-ccache-symlinks -RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache -ENV PATH /opt/conda/bin:$PATH - -FROM dev-base as conda -ARG PYTHON_VERSION -ENV PYTHON_VER=$PYTHON_VERSION -RUN curl -fsSL -v -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda install -y python=${PYTHON_VER} conda-build pyyaml numpy ipython cython typing typing_extensions mkl mkl-include ninja && \ - /opt/conda/bin/conda clean -ya - -ARG MAGMA_CUDA_VERSION -RUN if [ -z "$MAGMA_CUDA_VERSION" ] ; then \ - echo "Building with CPU support ..."; \ - else \ - echo "Building with GPU/CUDA support ..."; \ - conda install -y -c pytorch ${MAGMA_CUDA_VERSION} && conda clean -ya; \ - fi - -# Necessary step for Azure Pipelines Docker Build -# Docker image is build by root, but the build process -# is running from a non-priveledged user -RUN chmod -R ugo+rw /opt/conda/ - -WORKDIR /opt/pytorch -# Environment variables for PyTorch -ARG TORCH_CUDA_ARCH_LIST_VAR -RUN if [ -z "$TORCH_CUDA_ARCH_LIST_VAR" ] ; then \ - echo "Continuing CPU build ..."; \ - else \ - echo "Setting CUDA env vars and installing openmpi ..."; \ - # Set MPI links to avoid libmpi_cxx.so.1 not found error - ln -s /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.20 /usr/lib/x86_64-linux-gnu/libmpi_cxx.so.1; \ - ln -s /usr/lib/x86_64-linux-gnu/libmpi.so.20.10.1 /usr/lib/x86_64-linux-gnu/libmpi.so.12; \ - fi -# If the build argument TORCH_CUDA_ARCH_LIST_VAR is given, container will be -# set for GPU/CUDA build, else for CPU build. 
-ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST_VAR:+${TORCH_CUDA_ARCH_LIST_VAR}} -ENV TORCH_NVCC_FLAGS=${TORCH_CUDA_ARCH_LIST_VAR:+"-Xfatbin -compress-all"} -ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" - -# Install Azure CLI and update its site packages -RUN curl -sL https://aka.ms/InstallAzureCLIDeb | bash -RUN pip install --upgrade pip --target /opt/az/lib/python3.6/site-packages/ - -# Install MKL -RUN wget https://raw.githubusercontent.com/pytorch/builder/f121b0919d799b5ea2030c92ca266cf4cddf6656/common/install_mkl.sh -RUN bash ./install_mkl.sh && rm install_mkl.sh From 67adc0cb1186f27f317d99d20059a1d3a8ad56db Mon Sep 17 00:00:00 2001 From: soulitzer Date: Tue, 15 Feb 2022 13:58:17 -0800 Subject: [PATCH 067/199] Remove xfail for trapz and trapezoid on meta device (#72677) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72677 Test Plan: Imported from OSS Reviewed By: samdow Differential Revision: D34182326 Pulled By: soulitzer fbshipit-source-id: 9697b9e144780a4f3f60bea0978878f7edb72606 (cherry picked from commit 0386263175f2678587c597b4e551bc4eae4b92eb) --- .../testing/_internal/common_methods_invocations.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index 01d39e16dda..e984d4b7f8c 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -14046,21 +14046,13 @@ op_db: List[OpInfo] = [ supports_out=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, - sample_inputs_func=sample_trapezoid, - skips=( - # Dispatch stub: unsupported device typemeta - DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='meta'), - )), + sample_inputs_func=sample_trapezoid), OpInfo('trapezoid', dtypes=all_types_and_complex_and(torch.float16, torch.bfloat16), supports_out=False, supports_forward_ad=True, supports_fwgrad_bwgrad=True, - sample_inputs_func=sample_trapezoid, - skips=( - # Dispatch stub: unsupported device typemeta - DecorateInfo(unittest.expectedFailure, 'TestGradients', 'test_fn_fwgrad_bwgrad', device_type='meta'), - )), + sample_inputs_func=sample_trapezoid), OpInfo('cumulative_trapezoid', dtypes=all_types_and_complex_and(), dtypesIfCUDA=all_types_and_complex_and(torch.bfloat16, torch.float16), From 277c4c9decabc2ffcc06881baee39756ec6e8c5d Mon Sep 17 00:00:00 2001 From: soulitzer Date: Tue, 15 Feb 2022 13:58:17 -0800 Subject: [PATCH 068/199] Fix vjpvmap for linalg.svdvals (#72811) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72811 See https://github.com/pytorch/pytorch/pull/72309/files#r803041089 for context Test Plan: Imported from OSS Reviewed By: samdow Differential Revision: D34222388 Pulled By: soulitzer fbshipit-source-id: 4d1469eb4eed0e36ee87ca99dcc5098835aa9cef (cherry picked from commit d1f8f111628ac365a2f48a67a957b640a50c4485) --- aten/src/ATen/native/BatchLinearAlgebra.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 225985d6048..8c10269eeca 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -3068,7 +3068,11 @@ Tensor& linalg_svdvals_out(const Tensor& A, Tensor & S) { } Tensor linalg_svdvals(const Tensor& A) { - return std::get<1>(at::_linalg_svd(A, /*full_matrices=*/false, 
/*comptue_uv=*/_requires_fw_or_bw_grad(A)));
+  // NB: Why do we need isTensorSubclassLike check for linalg_svdvals but not linalg_eigvals?
+  // svdvals is decomposed at the vmap level in functorch so A can be a BatchedTensor wrapping
+  // a TensorWrapper requiring fw or bw grad.
+  return std::get<1>(at::_linalg_svd(A, /*full_matrices=*/false,
+                     /*comptue_uv=*/_requires_fw_or_bw_grad(A) || isTensorSubclassLike(A)));
 }
 
 std::tuple<Tensor&, Tensor&, Tensor&> svd_out(const Tensor& self, bool some, bool compute_uv, Tensor& U, Tensor& S, Tensor& V) {

From 8a43aa95382f80c98f47d62df97f5fd9a6879e8c Mon Sep 17 00:00:00 2001
From: Aaron Enye Shi
Date: Tue, 15 Feb 2022 14:36:50 -0800
Subject: [PATCH 069/199] [Kineto][Bug Fix] Avoid picking up old CUPTI headers (#72761)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72761

By default, CUPTI_INCLUDE_DIR will pick up cupti.h from /usr/include, which is old (from 2017 on AWS) and missing many cupti headers. Use NO_DEFAULT_PATH to avoid that and instead search only the list of locations provided.

Test Plan: Fixes the missing-headers error when building on AWS (avoids the old cupti.h from /usr/include; instead uses cupti.h from cuda/extras/CUPTI/include).

```
In file included from /scratch/aaronshi/pytorch/third_party/kineto/libkineto/src/CuptiRangeProfilerApi.cpp:13:0:
/scratch/aaronshi/pytorch/third_party/kineto/libkineto/src/CuptiRangeProfilerApi.h:12:10: fatal error: cupti_profiler_target.h: No such file or directory
 #include <cupti_profiler_target.h>
          ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
```
and
```
/scratch/aaronshi/pytorch/third_party/kineto/libkineto/src/CuptiRangeProfilerApi.cpp:7:10: fatal error: nvperf_host.h: No such file or directory
 #include <nvperf_host.h>
          ^~~~~~~~~~~~~~~
compilation terminated.
```

Reviewed By: briancoutinho

Differential Revision: D34191123

Pulled By: aaronenyeshi

fbshipit-source-id: d84f80308c9939ba8ed504e667847d136a261453
(cherry picked from commit 33368bd93b2c9caed5983513511eb4f30eb3f2ed)
---
 cmake/Dependencies.cmake | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 0969055415b..a87131a992c 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1909,10 +1909,11 @@ if(USE_KINETO)
       ${CUDA_SOURCE_DIR}/lib64)
 
     find_path(CUPTI_INCLUDE_DIR cupti.h PATHS
+      ${CUDA_SOURCE_DIR}/extras/CUPTI/include
       ${CUDA_INCLUDE_DIRS}
       ${CUDA_SOURCE_DIR}
-      ${CUDA_SOURCE_DIR}/extras/CUPTI/include
-      ${CUDA_SOURCE_DIR}/include)
+      ${CUDA_SOURCE_DIR}/include
+      NO_DEFAULT_PATH)
 
     if(CUPTI_LIBRARY_PATH AND CUPTI_INCLUDE_DIR)
       message(STATUS " CUPTI_INCLUDE_DIR = ${CUPTI_INCLUDE_DIR}")

From 8b0847811501cd452b131b6153f28c254a3ee44f Mon Sep 17 00:00:00 2001
From: wayi1
Date: Tue, 15 Feb 2022 15:42:59 -0800
Subject: [PATCH 070/199] Fix the doc of PostLocalSGDState (#72792)

Summary:
The first arg of the `PostLocalSGDState` ctor, `process_group`, cannot be empty. Here, to simplify the usage, the example does not even create a subgroup explicitly.
See the example in unit test: https://github.com/pytorch/pytorch/blob/4feef6c97092cfde7d57a97d8390a79551e92369/torch/testing/_internal/distributed/distributed_test.py#L4260 Pull Request resolved: https://github.com/pytorch/pytorch/pull/72792 Reviewed By: samdow Differential Revision: D34213221 Pulled By: rohan-varma fbshipit-source-id: 078343f3ee138e175bf835897f190032eb970662 (cherry picked from commit bf90af704fb371eef799a951007cc5d41dbe07a1) --- torch/distributed/algorithms/model_averaging/averagers.py | 3 +-- torch/distributed/optim/post_localSGD_optimizer.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/torch/distributed/algorithms/model_averaging/averagers.py b/torch/distributed/algorithms/model_averaging/averagers.py index a084ab7326c..cb67057bcde 100644 --- a/torch/distributed/algorithms/model_averaging/averagers.py +++ b/torch/distributed/algorithms/model_averaging/averagers.py @@ -60,8 +60,7 @@ class PeriodicModelAverager(ModelAverager): >>> module, device_ids=[rank], output_device=rank >>> ) >>> # Register a post-localSGD communication hook. - >>> subgroup, subgroups = dist.new_subgroups() - >>> state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=100) + >>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100) >>> model.register_comm_hook(state, post_localSGD_hook) >>> >>> # In the first 100 steps, run global gradient averaging like normal DDP at every step. diff --git a/torch/distributed/optim/post_localSGD_optimizer.py b/torch/distributed/optim/post_localSGD_optimizer.py index 1a80bab01bf..f24293476e3 100644 --- a/torch/distributed/optim/post_localSGD_optimizer.py +++ b/torch/distributed/optim/post_localSGD_optimizer.py @@ -26,8 +26,7 @@ class PostLocalSGDOptimizer(torch.optim.Optimizer): >>> ) >>> >>> # Register a post-localSGD communication hook. - >>> subgroup, subgroups = dist.new_subgroups() - >>> state = PostLocalSGDState(subgroup=subgroup, start_localSGD_iter=100) + >>> state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100) >>> model.register_comm_hook(state, post_localSGD_hook) >>> >>> # Create a post-localSGD optimizer that wraps a local optimizer. 
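For illustration only (not part of this patch), the corrected docstring snippets above can be combined into one rough end-to-end sketch. The process-group setup, the placeholder module, and the hyperparameters are assumptions, and it presumes the default process group is already initialized with one GPU per rank:

```python
import torch
import torch.distributed as dist
from torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook import (
    PostLocalSGDState, post_localSGD_hook)
from torch.distributed.algorithms.model_averaging.averagers import PeriodicModelAverager
from torch.distributed.optim import PostLocalSGDOptimizer
from torch.nn.parallel import DistributedDataParallel as DDP

# Assumes dist.init_process_group(...) has already been called elsewhere.
rank = dist.get_rank()
module = torch.nn.Linear(20, 10).to(rank)  # placeholder model, one GPU per rank
model = DDP(module, device_ids=[rank], output_device=rank)

# No explicit dist.new_subgroups() call is needed: process_group=None and
# subgroup=None resolve to the default group and an intra-node subgroup.
state = PostLocalSGDState(process_group=None, subgroup=None, start_localSGD_iter=100)
model.register_comm_hook(state, post_localSGD_hook)

local_optim = torch.optim.SGD(params=model.parameters(), lr=0.01)
averager = PeriodicModelAverager(period=4, warmup_steps=100)
opt = PostLocalSGDOptimizer(optim=local_optim, averager=averager)
```

This is exactly the simplification the doc fix is after: the state object picks sensible defaults, so the examples no longer need to construct subgroups by hand.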
From 8e7fe87630a95633f9f67b0986c2be3c36d986a2 Mon Sep 17 00:00:00 2001 From: Kurt Mohler Date: Tue, 15 Feb 2022 15:43:57 -0800 Subject: [PATCH 071/199] Rename `Typed/UntypedStorage` to `_Typed/_UntypedStorage` (#72540) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72540 Reviewed By: jbschlosser Differential Revision: D34216823 Pulled By: bdhirsh fbshipit-source-id: 1bc9930ab582771ebf02308e035576cd1a0dbe47 (cherry picked from commit 329238f612a9d92586bb0e5b33bcc45a0ec6936b) --- docs/source/conf.py | 1 - test/test_cuda.py | 4 +- test/test_serialization.py | 8 +-- .../templates/python_variable_methods.cpp | 4 +- tools/pyi/gen_pyi.py | 4 +- torch/_C/__init__.pyi.in | 2 +- torch/__init__.py | 42 +++++------ torch/_deploy.py | 8 +-- torch/_tensor.py | 16 ++--- torch/_utils.py | 4 +- torch/csrc/DynamicTypes.cpp | 8 +-- torch/csrc/generic/Storage.cpp | 2 +- torch/csrc/utils/tensor_new.cpp | 6 +- torch/cuda/__init__.py | 30 ++++---- torch/multiprocessing/reductions.py | 12 ++-- torch/package/_directory_reader.py | 2 +- torch/package/package_exporter.py | 4 +- torch/package/package_importer.py | 4 +- torch/serialization.py | 42 +++++------ torch/storage.py | 72 +++++++++---------- torch/testing/_internal/common_utils.py | 2 +- 21 files changed, 138 insertions(+), 139 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 0e55297b27b..4b4742bfb7b 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -153,7 +153,6 @@ coverage_ignore_classes = [ "LongTensor", "ShortStorage", "ShortTensor", - "UntypedStorage", "cudaStatus", # torch.distributed.elastic.multiprocessing.errors "ChildFailedError", diff --git a/test/test_cuda.py b/test/test_cuda.py index 7df9f637274..c0ea06a0e19 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -568,8 +568,8 @@ class TestCuda(TestCase): self.assertTrue(isinstance(q_copy[0], torch.cuda.FloatTensor)) self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor)) self.assertTrue(isinstance(q_copy[2], torch.cuda.FloatTensor)) - self.assertTrue(isinstance(q_copy[3], torch.storage.TypedStorage)) - self.assertTrue(isinstance(q_copy[3]._storage, torch.cuda.UntypedStorage)) + self.assertTrue(isinstance(q_copy[3], torch.storage._TypedStorage)) + self.assertTrue(isinstance(q_copy[3]._storage, torch.cuda._UntypedStorage)) q_copy[1].fill_(10) self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10)) diff --git a/test/test_serialization.py b/test/test_serialization.py index a4fa6e8c9ba..a3c5f8c659b 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -97,7 +97,7 @@ class SerializationMixin(object): self.assertTrue(isinstance(c[1], torch.FloatTensor)) self.assertTrue(isinstance(c[2], torch.FloatTensor)) self.assertTrue(isinstance(c[3], torch.FloatTensor)) - self.assertTrue(isinstance(c[4], torch.storage.TypedStorage)) + self.assertTrue(isinstance(c[4], torch.storage._TypedStorage)) self.assertEqual(c[4].dtype, torch.float) c[0].fill_(10) self.assertEqual(c[0], c[2], atol=0, rtol=0) @@ -370,7 +370,7 @@ class SerializationMixin(object): self.assertTrue(isinstance(c[1], torch.FloatTensor)) self.assertTrue(isinstance(c[2], torch.FloatTensor)) self.assertTrue(isinstance(c[3], torch.FloatTensor)) - self.assertTrue(isinstance(c[4], torch.storage.TypedStorage)) + self.assertTrue(isinstance(c[4], torch.storage._TypedStorage)) self.assertEqual(c[4].dtype, torch.float32) c[0].fill_(10) self.assertEqual(c[0], c[2], atol=0, rtol=0) @@ -620,7 +620,7 @@ class SerializationMixin(object): a = torch.tensor([], 
dtype=dtype, device=device) for other_dtype in get_all_dtypes(): - s = torch.TypedStorage( + s = torch._TypedStorage( wrap_storage=a.storage()._untyped(), dtype=other_dtype) save_load_check(a, s) @@ -652,7 +652,7 @@ class SerializationMixin(object): torch.save([a.storage(), a.imag.storage()], f) a = torch.randn(10, device=device) - s_bytes = torch.TypedStorage( + s_bytes = torch._TypedStorage( wrap_storage=a.storage()._untyped(), dtype=torch.uint8) diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index b3d6ae705c5..c2e3c417462 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -1114,7 +1114,7 @@ static PyObject* THPVariable_set_( at::Storage storage = _r.storage(0, storage_scalar_type, is_typed_storage); TORCH_CHECK(storage_scalar_type == self.dtype() || !is_typed_storage, "Expected a Storage of type ", self.dtype(), - " or an UntypedStorage, but got type ", storage_scalar_type, + " or an _UntypedStorage, but got type ", storage_scalar_type, " for argument 1 'storage'"); auto dispatch_set_ = [](const Tensor& self, Storage source) -> Tensor { pybind11::gil_scoped_release no_gil; @@ -1130,7 +1130,7 @@ static PyObject* THPVariable_set_( at::Storage storage = _r.storage(0, storage_scalar_type, is_typed_storage); TORCH_CHECK(storage_scalar_type == self.dtype() || !is_typed_storage, "Expected a Storage of type ", self.dtype(), - " or an UntypedStorage, but got type ", storage_scalar_type, + " or an _UntypedStorage, but got type ", storage_scalar_type, " for argument 1 'storage'"); auto dispatch_set_ = [](const Tensor& self, Storage source, diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index faf1fdf06d3..1edd8d32a7a 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -482,8 +482,8 @@ def gen_pyi(native_yaml_path: str, deprecated_yaml_path: str, fm: FileManager) - ], 'item': ["def item(self) -> Number: ..."], 'copy_': ["def copy_(self, src: Tensor, non_blocking: _bool=False) -> Tensor: ..."], - 'set_': ['def set_(self, storage: Union[Storage, TypedStorage], offset: _int, size: _size, stride: _size) -> Tensor: ...', - 'def set_(self, storage: Union[Storage, TypedStorage]) -> Tensor: ...'], + 'set_': ['def set_(self, storage: Union[Storage, _TypedStorage], offset: _int, size: _size, stride: _size) -> Tensor: ...', + 'def set_(self, storage: Union[Storage, _TypedStorage]) -> Tensor: ...'], 'split': ['def split(self, split_size: _int, dim: _int=0) -> Sequence[Tensor]: ...', 'def split(self, split_size: Tuple[_int, ...], dim: _int=0) -> Sequence[Tensor]: ...'], 'div': ['def div(self, other: Union[Tensor, Number], *, rounding_mode: Optional[str] = None) -> Tensor: ...'], diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index f2c0be085dc..ae77710cdbc 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -13,7 +13,7 @@ from typing_extensions import Literal from torch._six import inf from torch.types import _int, _float, _bool, _dtype, _device, _qscheme, _size, _layout, Device, Number, Storage -from torch.storage import TypedStorage +from torch.storage import _TypedStorage import builtins diff --git a/torch/__init__.py b/torch/__init__.py index 519ea3e607c..f998a2ae34b 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -594,101 +594,101 @@ __all__.extend(['e', 'pi', 'nan', 'inf']) ################################################################################ from ._tensor import Tensor -from 
.storage import _StorageBase, TypedStorage +from .storage import _StorageBase, _TypedStorage # NOTE: New Storage classes should never be added. When adding a new -# dtype, use torch.storage.TypedStorage directly. +# dtype, use torch.storage._TypedStorage directly. -class UntypedStorage(_C.ByteStorageBase, _StorageBase): +class _UntypedStorage(_C.ByteStorageBase, _StorageBase): pass -class ByteStorage(TypedStorage): +class ByteStorage(_TypedStorage): @classproperty def dtype(self): return torch.uint8 -class DoubleStorage(TypedStorage): +class DoubleStorage(_TypedStorage): @classproperty def dtype(self): return torch.double -class FloatStorage(TypedStorage): +class FloatStorage(_TypedStorage): @classproperty def dtype(self): return torch.float -class HalfStorage(TypedStorage): +class HalfStorage(_TypedStorage): @classproperty def dtype(self): return torch.half -class LongStorage(TypedStorage): +class LongStorage(_TypedStorage): @classproperty def dtype(self): return torch.long -class IntStorage(TypedStorage): +class IntStorage(_TypedStorage): @classproperty def dtype(self): return torch.int -class ShortStorage(TypedStorage): +class ShortStorage(_TypedStorage): @classproperty def dtype(self): return torch.short -class CharStorage(TypedStorage): +class CharStorage(_TypedStorage): @classproperty def dtype(self): return torch.int8 -class BoolStorage(TypedStorage): +class BoolStorage(_TypedStorage): @classproperty def dtype(self): return torch.bool -class BFloat16Storage(TypedStorage): +class BFloat16Storage(_TypedStorage): @classproperty def dtype(self): return torch.bfloat16 -class ComplexDoubleStorage(TypedStorage): +class ComplexDoubleStorage(_TypedStorage): @classproperty def dtype(self): return torch.cdouble -class ComplexFloatStorage(TypedStorage): +class ComplexFloatStorage(_TypedStorage): @classproperty def dtype(self): return torch.cfloat -class QUInt8Storage(TypedStorage): +class QUInt8Storage(_TypedStorage): @classproperty def dtype(self): return torch.quint8 -class QInt8Storage(TypedStorage): +class QInt8Storage(_TypedStorage): @classproperty def dtype(self): return torch.qint8 -class QInt32Storage(TypedStorage): +class QInt32Storage(_TypedStorage): @classproperty def dtype(self): return torch.qint32 -class QUInt4x2Storage(TypedStorage): +class QUInt4x2Storage(_TypedStorage): @classproperty def dtype(self): return torch.quint4x2 -class QUInt2x4Storage(TypedStorage): +class QUInt2x4Storage(_TypedStorage): @classproperty def dtype(self): return torch.quint2x4 _storage_classes = { - UntypedStorage, DoubleStorage, FloatStorage, LongStorage, IntStorage, + _UntypedStorage, DoubleStorage, FloatStorage, LongStorage, IntStorage, ShortStorage, CharStorage, ByteStorage, HalfStorage, BoolStorage, QUInt8Storage, QInt8Storage, QInt32Storage, BFloat16Storage, ComplexFloatStorage, ComplexDoubleStorage, QUInt4x2Storage, QUInt2x4Storage, diff --git a/torch/_deploy.py b/torch/_deploy.py index 4a27e3753d3..347f4424818 100644 --- a/torch/_deploy.py +++ b/torch/_deploy.py @@ -17,8 +17,8 @@ def _save_storages(importer, obj): importers = sys_importer def persistent_id(obj): - if torch.is_storage(obj) or isinstance(obj, torch.storage.TypedStorage): - if isinstance(obj, torch.storage.TypedStorage): + if torch.is_storage(obj) or isinstance(obj, torch.storage._TypedStorage): + if isinstance(obj, torch.storage._TypedStorage): # TODO: Once we decide to break serialization FC, we can # remove this case storage = obj._storage @@ -59,10 +59,10 @@ def _load_storages(id, zip_reader, obj_bytes, serialized_storages, 
serialized_dt if typename == 'storage': # TODO: Once we decide to break serialization FC, we can - # stop wrapping with TypedStorage + # stop wrapping with _TypedStorage storage = serialized_storages[data[0]] dtype = serialized_dtypes[data[0]] - return torch.storage.TypedStorage( + return torch.storage._TypedStorage( wrap_storage=storage._untyped(), dtype=dtype) diff --git a/torch/_tensor.py b/torch/_tensor.py index dc2f5c21624..8c33755268b 100644 --- a/torch/_tensor.py +++ b/torch/_tensor.py @@ -109,9 +109,9 @@ class Tensor(torch._C._TensorBase): else: raise RuntimeError(f"Unsupported qscheme {self.qscheme()} in deepcopy") # TODO: Once we decide to break serialization FC, no longer - # need to wrap with TypedStorage + # need to wrap with _TypedStorage new_tensor = torch._utils._rebuild_qtensor( - torch.storage.TypedStorage( + torch.storage._TypedStorage( wrap_storage=new_storage._untyped(), dtype=self.dtype), self.storage_offset(), @@ -232,9 +232,9 @@ class Tensor(torch._C._TensorBase): else: raise RuntimeError(f"Serialization is not supported for tensors of type {self.qscheme()}") # TODO: Once we decide to break serialization FC, no longer - # need to wrap with TypedStorage + # need to wrap with _TypedStorage args_qtensor = ( - torch.storage.TypedStorage( + torch.storage._TypedStorage( wrap_storage=self.storage()._untyped(), dtype=self.dtype), self.storage_offset(), @@ -267,9 +267,9 @@ class Tensor(torch._C._TensorBase): return (torch._utils._rebuild_sparse_csr_tensor, args_sparse_csr) else: # TODO: Once we decide to break serialization FC, no longer - # need to wrap with TypedStorage + # need to wrap with _TypedStorage args = ( - torch.storage.TypedStorage( + torch.storage._TypedStorage( wrap_storage=self.storage()._untyped(), dtype=self.dtype), self.storage_offset(), @@ -830,9 +830,9 @@ class Tensor(torch._C._TensorBase): Returns the type of the underlying storage. 
""" - # NB: this returns old fashioned TypedStorage, e.g., FloatStorage, as it + # NB: this returns old fashioned _TypedStorage, e.g., FloatStorage, as it # would be pretty pointless otherwise (it would always return - # UntypedStorage) + # _UntypedStorage) return type(self.storage()) def refine_names(self, *names): diff --git a/torch/_utils.py b/torch/_utils.py index 86272773141..324c8f3031d 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -128,7 +128,7 @@ def _get_async_or_non_blocking(function_name, non_blocking, kwargs): # TODO: Once we decide to break serialization FC, `storage` no longer needs to -# be a TypedStorage +# be a _TypedStorage def _rebuild_tensor(storage, storage_offset, size, stride): # first construct a tensor with the correct dtype/device t = torch.tensor([], dtype=storage.dtype, device=storage._untyped().device) @@ -210,7 +210,7 @@ def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad): # TODO: Once we decide to break serialization FC, `storage` no longer needs to -# be a TypedStorage +# be a _TypedStorage def _rebuild_qtensor(storage, storage_offset, size, stride, quantizer_params, requires_grad, backward_hooks): qscheme = quantizer_params[0] if qscheme == torch.per_tensor_affine: diff --git a/torch/csrc/DynamicTypes.cpp b/torch/csrc/DynamicTypes.cpp index a2bf143aede..502bb0fa29b 100644 --- a/torch/csrc/DynamicTypes.cpp +++ b/torch/csrc/DynamicTypes.cpp @@ -66,7 +66,7 @@ PyTypeObject* getPyTypeObject(const at::Storage& storage) { scalarType); auto it = attype_to_py_storage_type.find(attype); TORCH_INTERNAL_ASSERT(it != attype_to_py_storage_type.end(), - "Failed to get the Python type of `UntypedStorage`."); + "Failed to get the Python type of `_UntypedStorage`."); return it->second; } } // namespace @@ -115,10 +115,10 @@ PyTypeObject* loadTypedStorageTypeObject() { PyObject* storage_module = PyImport_ImportModule("torch.storage"); TORCH_INTERNAL_ASSERT(storage_module && PyModule_Check(storage_module)); - PyObject* typed_storage_obj = PyObject_GetAttrString(storage_module, "TypedStorage"); + PyObject* typed_storage_obj = PyObject_GetAttrString(storage_module, "_TypedStorage"); TORCH_INTERNAL_ASSERT(typed_storage_obj && PyType_Check(typed_storage_obj)); return reinterpret_cast( - PyObject_GetAttrString(storage_module, "TypedStorage")); + PyObject_GetAttrString(storage_module, "_TypedStorage")); } PyTypeObject* getTypedStorageTypeObject() { @@ -169,7 +169,7 @@ at::Storage createStorageGetType(PyObject* obj, at::ScalarType& scalar_type, boo } if (obj_type == storage_type) { auto& type = *item.second; - // UntypedStorage should always be interpreted with byte dtype + // _UntypedStorage should always be interpreted with byte dtype scalar_type = at::kByte; return type.unsafeStorageFromTH(((THPVoidStorage*)obj)->cdata, true); } diff --git a/torch/csrc/generic/Storage.cpp b/torch/csrc/generic/Storage.cpp index 539c01cad24..99499ef9a01 100644 --- a/torch/csrc/generic/Storage.cpp +++ b/torch/csrc/generic/Storage.cpp @@ -344,7 +344,7 @@ bool THPStorage_(init)(PyObject *module) void THPStorage_(postInit)(PyObject *module) { - THPStorageClass = PyObject_GetAttrString(module, "UntypedStorage"); + THPStorageClass = PyObject_GetAttrString(module, "_UntypedStorage"); if (!THPStorageClass) throw python_error(); at::Backend backend = at::Backend::CPU; diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 4b85cd81fdf..580f572977e 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -297,7 +297,7 @@ 
Tensor internal_new_from_data( Storage storage = createStorageGetType(data, storage_scalar_type, is_typed_storage); TORCH_CHECK(!is_typed_storage || storage_scalar_type == scalar_type, "Expected a Storage of type ", scalar_type, - " or an UntypedStorage, but got ", storage_scalar_type); + " or an _UntypedStorage, but got ", storage_scalar_type); tensor = at::empty(sizes, at::initialTensorOptions().dtype(is_typed_storage ? storage_scalar_type : inferred_scalar_type).pinned_memory(pin_memory).device(storage.device())); tensor.set_(storage); @@ -534,7 +534,7 @@ Tensor legacy_tensor_ctor(c10::DispatchKey dispatch_key, at::ScalarType scalar_t TORCH_CHECK( storage_scalar_type == scalar_type, "Expected a Storage of type ", scalar_type, - " or an UntypedStorage, but got type ", storage_scalar_type, + " or an _UntypedStorage, but got type ", storage_scalar_type, " for argument 1 'storage'"); } return new_with_storage(options, scalar_type, storage); @@ -596,7 +596,7 @@ Tensor legacy_tensor_new(c10::DispatchKey dispatch_key, at::ScalarType scalar_ty TORCH_CHECK( storage_scalar_type == scalar_type, "Expected a Storage of type ", scalar_type, - " or an UntypedStorage, but got type ", storage_scalar_type, + " or an _UntypedStorage, but got type ", storage_scalar_type, " for argument 1 'storage'"); } return new_with_storage(options, scalar_type, storage); diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py index d782c300c33..ac7026ea0dd 100644 --- a/torch/cuda/__init__.py +++ b/torch/cuda/__init__.py @@ -674,72 +674,72 @@ class _CudaBase(object): __new__ = _lazy_new -from torch.storage import TypedStorage +from torch.storage import _TypedStorage -class UntypedStorage(_CudaBase, torch._C.CudaByteStorageBase, _StorageBase): +class _UntypedStorage(_CudaBase, torch._C.CudaByteStorageBase, _StorageBase): pass -class ByteStorage(TypedStorage): +class ByteStorage(_TypedStorage): @classproperty def dtype(self): return torch.uint8 -class DoubleStorage(TypedStorage): +class DoubleStorage(_TypedStorage): @classproperty def dtype(self): return torch.double -class FloatStorage(TypedStorage): +class FloatStorage(_TypedStorage): @classproperty def dtype(self): return torch.float -class HalfStorage(TypedStorage): +class HalfStorage(_TypedStorage): @classproperty def dtype(self): return torch.half -class LongStorage(TypedStorage): +class LongStorage(_TypedStorage): @classproperty def dtype(self): return torch.long -class IntStorage(TypedStorage): +class IntStorage(_TypedStorage): @classproperty def dtype(self): return torch.int -class ShortStorage(TypedStorage): +class ShortStorage(_TypedStorage): @classproperty def dtype(self): return torch.short -class CharStorage(TypedStorage): +class CharStorage(_TypedStorage): @classproperty def dtype(self): return torch.int8 -class BoolStorage(TypedStorage): +class BoolStorage(_TypedStorage): @classproperty def dtype(self): return torch.bool -class BFloat16Storage(TypedStorage): +class BFloat16Storage(_TypedStorage): @classproperty def dtype(self): return torch.bfloat16 -class ComplexDoubleStorage(TypedStorage): +class ComplexDoubleStorage(_TypedStorage): @classproperty def dtype(self): return torch.cdouble -class ComplexFloatStorage(TypedStorage): +class ComplexFloatStorage(_TypedStorage): @classproperty def dtype(self): return torch.cfloat -torch._storage_classes.add(UntypedStorage) +torch._storage_classes.add(_UntypedStorage) torch._storage_classes.add(DoubleStorage) torch._storage_classes.add(FloatStorage) torch._storage_classes.add(LongStorage) diff --git 
a/torch/multiprocessing/reductions.py b/torch/multiprocessing/reductions.py index 5eff241da45..2da1ab8582b 100644 --- a/torch/multiprocessing/reductions.py +++ b/torch/multiprocessing/reductions.py @@ -123,7 +123,7 @@ def rebuild_cuda_tensor(tensor_cls, tensor_size, tensor_stride, tensor_offset, storage_cls._release_ipc_counter(ref_counter_handle, ref_counter_offset) t = torch._utils._rebuild_tensor( - torch.storage.TypedStorage(wrap_storage=storage._untyped(), dtype=dtype), + torch.storage._TypedStorage(wrap_storage=storage._untyped(), dtype=dtype), tensor_offset, tensor_size, tensor_stride) if tensor_cls == torch.nn.parameter.Parameter: @@ -317,16 +317,16 @@ def rebuild_storage_empty(cls): return cls() def rebuild_typed_storage(storage, dtype): - return torch.storage.TypedStorage(wrap_storage=storage, dtype=dtype) + return torch.storage._TypedStorage(wrap_storage=storage, dtype=dtype) -# Use for torch.storage.TypedStorage +# Use for torch.storage._TypedStorage def reduce_typed_storage(storage): return (rebuild_typed_storage, (storage._storage, storage.dtype)) def rebuild_typed_storage_child(storage, storage_type): return storage_type(wrap_storage=storage) -# Use for child classes of torch.storage.TypedStorage, like torch.FloatStorage +# Use for child classes of torch.storage._TypedStorage, like torch.FloatStorage def reduce_typed_storage_child(storage): return (rebuild_typed_storage_child, (storage._storage, type(storage))) @@ -358,12 +358,12 @@ def init_reductions(): ForkingPickler.register(torch.cuda.Event, reduce_event) for t in torch._storage_classes: - if t.__name__ == 'UntypedStorage': + if t.__name__ == '_UntypedStorage': ForkingPickler.register(t, reduce_storage) else: ForkingPickler.register(t, reduce_typed_storage_child) - ForkingPickler.register(torch.storage.TypedStorage, reduce_typed_storage) + ForkingPickler.register(torch.storage._TypedStorage, reduce_typed_storage) for t in torch._tensor_classes: ForkingPickler.register(t, reduce_tensor) diff --git a/torch/package/_directory_reader.py b/torch/package/_directory_reader.py index 30833493c4f..14d20181cd3 100644 --- a/torch/package/_directory_reader.py +++ b/torch/package/_directory_reader.py @@ -35,7 +35,7 @@ class DirectoryReader(object): def get_storage_from_record(self, name, numel, dtype): filename = f"{self.directory}/{name}" nbytes = torch._utils._element_size(dtype) * numel - storage = cast(Storage, torch.UntypedStorage) + storage = cast(Storage, torch._UntypedStorage) return _HasStorage(storage.from_file(filename=filename, nbytes=nbytes)) def has_record(self, path): diff --git a/torch/package/package_exporter.py b/torch/package/package_exporter.py index 5bfd4444f8e..8d19892d6f6 100644 --- a/torch/package/package_exporter.py +++ b/torch/package/package_exporter.py @@ -849,8 +849,8 @@ class PackageExporter: ) def _persistent_id(self, obj): - if torch.is_storage(obj) or isinstance(obj, torch.storage.TypedStorage): - if isinstance(obj, torch.storage.TypedStorage): + if torch.is_storage(obj) or isinstance(obj, torch.storage._TypedStorage): + if isinstance(obj, torch.storage._TypedStorage): # TODO: Once we decide to break serialization FC, we can # remove this case storage = obj._storage diff --git a/torch/package/package_importer.py b/torch/package/package_importer.py index 5b0f0037b14..a5d602d3a71 100644 --- a/torch/package/package_importer.py +++ b/torch/package/package_importer.py @@ -217,8 +217,8 @@ class PackageImporter(Importer): ) storage = loaded_storages[key] # TODO: Once we decide to break serialization FC, we 
can - # stop wrapping with TypedStorage - return torch.storage.TypedStorage( + # stop wrapping with _TypedStorage + return torch.storage._TypedStorage( wrap_storage=storage._untyped(), dtype=dtype ) elif typename == "reduce_package": diff --git a/torch/serialization.py b/torch/serialization.py index c63a115b67b..3cc92349bbc 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -162,7 +162,7 @@ register_package(10, _cpu_tag, _cpu_deserialize) register_package(20, _cuda_tag, _cuda_deserialize) -def location_tag(storage: Union[Storage, torch.storage.TypedStorage]): +def location_tag(storage: Union[Storage, torch.storage._TypedStorage]): for _, tagger, _ in _package_registry: location = tagger(storage) if location: @@ -413,8 +413,8 @@ def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None: "for correctness upon loading.") return ('module', obj, source_file, source) - if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): - if isinstance(obj, torch.storage.TypedStorage): + if isinstance(obj, torch.storage._TypedStorage) or torch.is_storage(obj): + if isinstance(obj, torch.storage._TypedStorage): # TODO: Once we decide to break serialization FC, this case # can be deleted storage = obj._storage @@ -463,8 +463,8 @@ def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None: # effectively saving nbytes in this case. We'll be able to load it # and the tensor back up with no problems in _this_ and future # versions of pytorch, but in older versions, here's the problem: - # the storage will be loaded up as a UntypedStorage, and then the - # FloatTensor will loaded and the UntypedStorage will be assigned to + # the storage will be loaded up as a _UntypedStorage, and then the + # FloatTensor will loaded and the _UntypedStorage will be assigned to # it. Since the storage dtype does not match the tensor dtype, this # will cause an error. If we reverse the list, like `[tensor, # storage]`, then we will save the `tensor.storage()` as a faked @@ -472,7 +472,7 @@ def _legacy_save(obj, f, pickle_module, pickle_protocol) -> None: # dtype-specific numel count that old versions expect. `tensor` # will be able to load up properly in old versions, pointing to # a FloatStorage. However, `storage` is still being translated to - # a UntypedStorage, and it will try to resolve to the same + # a _UntypedStorage, and it will try to resolve to the same # FloatStorage that `tensor` contains. This will also cause an # error. It doesn't seem like there's any way around this. 
# Probably, we just cannot maintain FC for the legacy format if the @@ -539,9 +539,9 @@ def _save(obj, zip_file, pickle_module, pickle_protocol): # see # https://docs.python.org/2/library/pickle.html#pickling-and-unpickling-external-objects # https://github.com/python/cpython/blob/master/Lib/pickle.py#L527-L537 - if isinstance(obj, torch.storage.TypedStorage) or torch.is_storage(obj): + if isinstance(obj, torch.storage._TypedStorage) or torch.is_storage(obj): - if isinstance(obj, torch.storage.TypedStorage): + if isinstance(obj, torch.storage._TypedStorage): # TODO: Once we decide to break serialization FC, this case # can be deleted storage = obj._storage @@ -806,11 +806,11 @@ def _legacy_load(f, map_location, pickle_module, **pickle_load_args): args = pickle_module.load(f, **pickle_load_args) key, location, storage_type = args dtype = storage_type.dtype - obj = cast(Storage, torch.UntypedStorage)._new_with_file(f, torch._utils._element_size(dtype)) + obj = cast(Storage, torch._UntypedStorage)._new_with_file(f, torch._utils._element_size(dtype)) obj = restore_location(obj, location) # TODO: Once we decide to break serialization FC, we can - # stop wrapping with TypedStorage - deserialized_objects[key] = torch.storage.TypedStorage( + # stop wrapping with _TypedStorage + deserialized_objects[key] = torch.storage._TypedStorage( wrap_storage=obj, dtype=dtype) @@ -820,8 +820,8 @@ def _legacy_load(f, map_location, pickle_module, **pickle_load_args): element_size = torch._utils._element_size(root.dtype) offset_bytes = offset * element_size # TODO: Once we decide to break serialization FC, we can - # stop wrapping with TypedStorage - deserialized_objects[target_cdata] = torch.storage.TypedStorage( + # stop wrapping with _TypedStorage + deserialized_objects[target_cdata] = torch.storage._TypedStorage( wrap_storage=root._storage[offset_bytes:offset_bytes + numel * element_size], dtype=root.dtype) @@ -868,11 +868,11 @@ def _legacy_load(f, map_location, pickle_module, **pickle_load_args): nbytes = numel * torch._utils._element_size(dtype) if root_key not in deserialized_objects: - obj = cast(Storage, torch.UntypedStorage(nbytes)) + obj = cast(Storage, torch._UntypedStorage(nbytes)) obj._torch_load_uninitialized = True # TODO: Once we decide to break serialization FC, we can - # stop wrapping with TypedStorage - deserialized_objects[root_key] = torch.storage.TypedStorage( + # stop wrapping with _TypedStorage + deserialized_objects[root_key] = torch.storage._TypedStorage( wrap_storage=restore_location(obj, location), dtype=dtype) @@ -883,8 +883,8 @@ def _legacy_load(f, map_location, pickle_module, **pickle_load_args): view_size_bytes = view_size * torch._utils._element_size(dtype) if view_key not in deserialized_objects: # TODO: Once we decide to break serialization FC, we can - # stop wrapping with TypedStorage - deserialized_objects[view_key] = torch.storage.TypedStorage( + # stop wrapping with _TypedStorage + deserialized_objects[view_key] = torch.storage._TypedStorage( wrap_storage=typed_storage._storage[offset_bytes:offset_bytes + view_size_bytes], dtype=dtype) res = deserialized_objects[view_key] @@ -994,10 +994,10 @@ def _load(zip_file, map_location, pickle_module, pickle_file='data.pkl', **pickl def load_tensor(dtype, numel, key, location): name = f'data/{key}' - storage = zip_file.get_storage_from_record(name, numel, torch.UntypedStorage).storage()._untyped() + storage = zip_file.get_storage_from_record(name, numel, torch._UntypedStorage).storage()._untyped() # TODO: Once we decide to break 
serialization FC, we can - # stop wrapping with TypedStorage - loaded_storages[key] = torch.storage.TypedStorage( + # stop wrapping with _TypedStorage + loaded_storages[key] = torch.storage._TypedStorage( wrap_storage=restore_location(storage, location), dtype=dtype) diff --git a/torch/storage.py b/torch/storage.py index 620c891fe2b..54e8df59584 100644 --- a/torch/storage.py +++ b/torch/storage.py @@ -8,7 +8,7 @@ import copy import collections from functools import lru_cache -T = TypeVar('T', bound='Union[_StorageBase, TypedStorage]') +T = TypeVar('T', bound='Union[_StorageBase, _TypedStorage]') class _StorageBase(object): _cdata: Any is_cuda: bool = False @@ -213,7 +213,7 @@ def _storage_type_to_dtype_map(): val: key for key, val in _dtype_to_storage_type_map().items()} return dtype_map -class TypedStorage: +class _TypedStorage: is_sparse = False def fill_(self, value): @@ -229,17 +229,17 @@ class TypedStorage: ' * no arguments\n' ' * (int size)\n' ' * (Sequence data)\n') - if type(self) == TypedStorage: - arg_error_msg += ' * (wrap_storage=, dtype=)' + if type(self) == _TypedStorage: + arg_error_msg += ' * (wrap_storage=<_UntypedStorage>, dtype=)' else: - arg_error_msg += ' * (wrap_storage=)' + arg_error_msg += ' * (wrap_storage=<_UntypedStorage>)' if 'wrap_storage' in kwargs: assert len(args) == 0, ( "No positional arguments should be given when using " "'wrap_storage'") - if type(self) == TypedStorage: + if type(self) == _TypedStorage: assert 'dtype' in kwargs, ( "When using 'wrap_storage', 'dtype' also must be specified") assert len(kwargs) == 2, ( @@ -257,9 +257,9 @@ class TypedStorage: storage = kwargs['wrap_storage'] - if not isinstance(storage, (torch.UntypedStorage, torch.cuda.UntypedStorage)): + if not isinstance(storage, (torch._UntypedStorage, torch.cuda._UntypedStorage)): raise TypeError(arg_error_msg) - if type(self) != TypedStorage and storage.__module__ != self.__module__: + if type(self) != _TypedStorage and storage.__module__ != self.__module__: raise TypeError(( arg_error_msg + f'\n`storage` `module {storage.__module__}` does not match ' @@ -267,9 +267,9 @@ class TypedStorage: self._storage = storage else: - assert type(self) != TypedStorage, ( - "Calling __init__ this way is only supported in TypedStorage's " - "child classes. TypedStorage can only be directly instantiated " + assert type(self) != _TypedStorage, ( + "Calling __init__ this way is only supported in _TypedStorage's " + "child classes. 
_TypedStorage can only be directly instantiated " "when kwargs 'wrap_storage' and 'dtype' are given.") assert len(kwargs) == 0, "invalid keyword arguments" @@ -282,10 +282,10 @@ class TypedStorage: return True if len(args) == 0: - self._storage = eval(self.__module__).UntypedStorage() + self._storage = eval(self.__module__)._UntypedStorage() elif len(args) == 1 and isint(args[0]): - self._storage = eval(self.__module__).UntypedStorage(int(args[0]) * self.element_size()) + self._storage = eval(self.__module__)._UntypedStorage(int(args[0]) * self.element_size()) elif len(args) == 1 and isinstance(args[0], collections.abc.Sequence): if self.dtype in [torch.quint8, torch.quint4x2, torch.quint2x4, torch.qint32, torch.qint8]: @@ -321,10 +321,10 @@ class TypedStorage: def _new_wrapped_storage(self, untyped_storage): module = eval(untyped_storage.__module__) - assert type(untyped_storage) == module.UntypedStorage + assert type(untyped_storage) == module._UntypedStorage - if type(self) == TypedStorage: - return TypedStorage(wrap_storage=untyped_storage, dtype=self.dtype) + if type(self) == _TypedStorage: + return _TypedStorage(wrap_storage=untyped_storage, dtype=self.dtype) else: # NOTE: We need to use the module of untyped_storage in case self's # module is different, e.g. if self is on CPU and untyped_storage @@ -371,7 +371,7 @@ class TypedStorage: torch.qint8: torch.int8 } tmp_dtype = interpret_dtypes[self.dtype] - tmp_tensor = torch.tensor([], dtype=tmp_dtype, device=self.device).set_(TypedStorage( + tmp_tensor = torch.tensor([], dtype=tmp_dtype, device=self.device).set_(_TypedStorage( wrap_storage=self._storage, dtype=tmp_dtype)) else: @@ -380,12 +380,12 @@ class TypedStorage: tmp_tensor[idx] = value def __getitem__(self, idx): - # NOTE: Before TypedStorage existed, indexing with a slice used to be + # NOTE: Before _TypedStorage existed, indexing with a slice used to be # possible for Storage objects. 
However, it would return - # a storage view, which would be a hassle to implement in TypedStorage, + # a storage view, which would be a hassle to implement in _TypedStorage, # so it was disabled if isinstance(idx, slice): - raise RuntimeError('slices are only supported in UntypedStorage.__getitem__') + raise RuntimeError('slices are only supported in _UntypedStorage.__getitem__') elif not isinstance(idx, int): raise RuntimeError(f"can't index a {type(self)} with {type(idx)}") @@ -397,7 +397,7 @@ class TypedStorage: torch.qint32: torch.int32, torch.qint8: torch.int8 } - return TypedStorage( + return _TypedStorage( wrap_storage=self._storage, dtype=interpret_dtypes[self.dtype])[idx] @@ -430,7 +430,7 @@ class TypedStorage: def __str__(self): data_str = ' ' + '\n '.join(str(self[i]) for i in range(self.size())) - if type(self) == TypedStorage: + if type(self) == _TypedStorage: return data_str + ( f'\n[{torch.typename(self)} with dtype {self.dtype} ' f'of size {len(self)}]') @@ -450,7 +450,7 @@ class TypedStorage: return self._new_wrapped_storage(copy.deepcopy(self._storage, memo)) def __sizeof__(self): - return super(TypedStorage, self).__sizeof__() + self.nbytes() + return super(_TypedStorage, self).__sizeof__() + self.nbytes() def clone(self): """Returns a copy of this storage""" @@ -484,7 +484,7 @@ class TypedStorage: def _new_shared(cls, size): """Creates a new storage in shared memory with the same data type""" module = eval(cls.__module__) - untyped_storage = module.UntypedStorage._new_shared(size * cls().element_size()) + untyped_storage = module._UntypedStorage._new_shared(size * cls().element_size()) return cls(wrap_storage=untyped_storage) @property @@ -517,25 +517,25 @@ class TypedStorage: @classmethod def _free_weak_ref(cls, *args, **kwargs): - return eval(cls.__module__).UntypedStorage._free_weak_ref(*args, **kwargs) + return eval(cls.__module__)._UntypedStorage._free_weak_ref(*args, **kwargs) def _weak_ref(self, *args, **kwargs): return self._storage._weak_ref(*args, **kwargs) @classmethod def from_buffer(cls, *args, **kwargs): - if cls == TypedStorage: + if cls == _TypedStorage: raise RuntimeError( - 'from_buffer: only supported for subclasses of TypedStorage') + 'from_buffer: only supported for subclasses of _TypedStorage') if 'dtype' in kwargs or len(args) == 5: raise RuntimeError(( "from_buffer: 'dtype' can only be specified in " - "UntypedStorage.from_buffer")) + "_UntypedStorage.from_buffer")) kwargs['dtype'] = cls().dtype - untyped_storage = eval(cls.__module__).UntypedStorage.from_buffer(*args, **kwargs) + untyped_storage = eval(cls.__module__)._UntypedStorage.from_buffer(*args, **kwargs) return cls(wrap_storage=untyped_storage) def _to(self, dtype): @@ -594,9 +594,9 @@ class TypedStorage: @classmethod def from_file(cls, filename, shared, size): - if cls == TypedStorage: + if cls == _TypedStorage: raise RuntimeError('from_file can only be called on derived classes') - untyped_storage = eval(cls.__module__).UntypedStorage.from_file( + untyped_storage = eval(cls.__module__)._UntypedStorage.from_file( filename, shared, size * torch._utils._element_size(cls.dtype)) @@ -605,7 +605,7 @@ class TypedStorage: @classmethod def _expired(cls, *args, **kwargs): - return eval(cls.__module__).UntypedStorage._expired(*args, **kwargs) + return eval(cls.__module__)._UntypedStorage._expired(*args, **kwargs) def is_pinned(self): return self._storage.is_pinned() @@ -627,11 +627,11 @@ class TypedStorage: @classmethod def _new_shared_cuda(cls, *args, **kwargs): - return 
eval(cls.__module__).UntypedStorage._new_shared_cuda(*args, **kwargs) + return eval(cls.__module__)._UntypedStorage._new_shared_cuda(*args, **kwargs) @classmethod def _new_with_weak_ptr(cls, *args, **kwargs): - return eval(cls.__module__).UntypedStorage._new_with_weak_ptr(*args, **kwargs) + return eval(cls.__module__)._UntypedStorage._new_with_weak_ptr(*args, **kwargs) def _share_filename_(self, *args, **kwargs): manager_handle, storage_handle, size = self._storage._share_filename_(*args, **kwargs) @@ -640,7 +640,7 @@ class TypedStorage: @classmethod def _new_shared_filename(cls, manager, obj, size): bytes_size = size * torch._utils._element_size(cls.dtype) - return cls(wrap_storage=eval(cls.__module__).UntypedStorage._new_shared_filename(manager, obj, bytes_size)) + return cls(wrap_storage=eval(cls.__module__)._UntypedStorage._new_shared_filename(manager, obj, bytes_size)) def _shared_decref(self): self._storage._shared_decref() @@ -648,7 +648,7 @@ class TypedStorage: @classmethod def _release_ipc_counter(cls, *args, **kwargs): - return eval(cls.__module__).UntypedStorage._release_ipc_counter(*args, **kwargs) + return eval(cls.__module__)._UntypedStorage._release_ipc_counter(*args, **kwargs) def _shared_incref(self, *args, **kwargs): return self._storage._shared_incref(*args, **kwargs) diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index ccb538fa3f2..6e67b77613c 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -2133,7 +2133,7 @@ class TestCase(expecttest.TestCase): ), sequence_types=( Sequence, - torch.storage.TypedStorage, + torch.storage._TypedStorage, Sequential, ModuleList, ParameterList, From b01d1ad17153de504fa9663c4585b28cced979e5 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 15 Feb 2022 15:54:37 -0800 Subject: [PATCH 072/199] [FSDP] Fix summon_full_params when not sharded (#72572) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72572 Use `continue` instead of `pass` which would result in AttributeError because `_full_param_padded` is not created for unsharded parameter when world_size == 1. Add a test to cover this case. 
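To make the failure mode above concrete, here is a minimal, self-contained sketch; the `_Param` and `write_back` names are hypothetical stand-ins for illustration only, not the actual FSDP internals:

```python
# Illustrative sketch only: `pass` is a no-op, so the loop body keeps running and
# touches an attribute that only sharded parameters have; `continue` moves on to
# the next parameter instead.
class _Param:
    def __init__(self, is_sharded):
        self._is_sharded = is_sharded
        if is_sharded:
            self._full_param_padded = object()  # only created when sharding happens

def write_back(params, use_continue):
    for p in params:
        if not p._is_sharded:
            if use_continue:
                continue  # nothing to write back; the local param already holds the data
            else:
                pass      # falls through to the line below
        _ = p._full_param_padded  # AttributeError for an unsharded param if we fell through

write_back([_Param(is_sharded=False)], use_continue=True)    # ok
# write_back([_Param(is_sharded=False)], use_continue=False) # raises AttributeError
```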
ghstack-source-id: 149111044 Test Plan: CI Reviewed By: zhaojuanmao Differential Revision: D34101124 fbshipit-source-id: 71d82bf94a091ef90f52b31c213192a5dd547332 (cherry picked from commit cc7899a5eaf5bc091eb772ade68a0a24a1fdab80) --- .../fsdp/test_fsdp_summon_full_params.py | 84 ++++++++++++------- .../fsdp/fully_sharded_data_parallel.py | 6 +- 2 files changed, 60 insertions(+), 30 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py index 93a8004c972..34d1cf920fa 100644 --- a/test/distributed/fsdp/test_fsdp_summon_full_params.py +++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py @@ -34,6 +34,54 @@ if TEST_WITH_DEV_DBG_ASAN: ) sys.exit(0) +def _run_test_summon_full_param_writeback(cls, writeback, cpu_offload, modify_outer): + model = FSDP( + nn.Sequential( + FSDP(nn.Linear(5, 5, bias=False)), nn.Linear(5, 3, bias=False) + ) + ).cuda(cls.rank) + + # set the value + outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") + inner_param = model.get_parameter( + "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + ) + p = outer_param if modify_outer else inner_param + + with torch.no_grad(): + # This sets the local shard value + p[0] = cls.rank + 2 + + with model._summon_full_params(writeback=writeback): + with torch.no_grad(): + p.copy_(torch.zeros_like(p)) + + if writeback or cls.world_size == 1: + # When world_size = 1, FSDP does not shard and parameter is not set to + # a local shard, so write is always reflected. + cls.assertEqual(p.cpu()[0], 0) + else: + cls.assertEqual(p.cpu()[0], cls.rank + 2) + +class TestSummonFullParamsNoShard(FSDPTest): + @property + def world_size(self): + return 1 # does not shard + + @skip_if_lt_x_gpu(2) + @parametrize("writeback", [True, False]) + @parametrize( + "cpu_offload", + [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], + ) + @parametrize("modify_outer", [True, False]) + def test_summon_full_param_writeback(self, writeback, cpu_offload, modify_outer): + return _run_test_summon_full_param_writeback( + self, + writeback, + cpu_offload, + modify_outer, + ) class TestSummonFullParams(FSDPTest): @property @@ -54,34 +102,13 @@ class TestSummonFullParams(FSDPTest): [CPUOffload(offload_params=True), CPUOffload(offload_params=False)], ) @parametrize("modify_outer", [True, False]) - def test_summon_full_param_writeback( - self, writeback, cpu_offload, modify_outer - ): - model = FSDP( - nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), nn.Linear(5, 3, bias=False) - ) - ).cuda(self.rank) - - # set the value - outer_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - inner_param = model.get_parameter( - "_fsdp_wrapped_module._fpw_module.0._fsdp_wrapped_module.flat_param" + def test_summon_full_param_writeback(self, writeback, cpu_offload, modify_outer): + return _run_test_summon_full_param_writeback( + self, + writeback, + cpu_offload, + modify_outer ) - p = outer_param if modify_outer else inner_param - - with torch.no_grad(): - # This sets the local shard value - p[0] = self.rank + 2 - - with model._summon_full_params(writeback=writeback): - with torch.no_grad(): - p.copy_(torch.zeros_like(p)) - - if writeback: - self.assertEqual(p.cpu()[0], 0) - else: - self.assertEqual(p.cpu()[0], self.rank + 2) @skip_if_lt_x_gpu(2) def test_summon_full_param_shard_value(self): @@ -280,7 +307,7 @@ class TestSummonFullParams(FSDPTest): self.assertEqual(0, inner_param._full_param_padded.storage().size()) @skip_if_lt_x_gpu(2) 
- def test_params_are_unflatenned(self): + def test_params_are_unflattenned(self): model = FSDP(nn.Linear(self.world_size, 1, bias=False)).cuda(self.rank) flattened_param = model.get_parameter("_fsdp_wrapped_module.flat_param") @@ -313,6 +340,7 @@ class TestSummonFullParams(FSDPTest): instantiate_parametrized_tests(TestSummonFullParams) +instantiate_parametrized_tests(TestSummonFullParamsNoShard) if __name__ == "__main__": diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index 8519114a734..d270230eba1 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -686,7 +686,7 @@ class FullyShardedDataParallel(nn.Module): def _write_back_current_shard(self): for p in self.params: if not p._is_sharded: # type: ignore[attr-defined] - pass + continue # Already copied because no sharding. chunks = p._full_param_padded.chunk(self.world_size) # type: ignore[attr-defined] assert len(chunks) > self.rank chunk = chunks[self.rank] @@ -714,7 +714,9 @@ class FullyShardedDataParallel(nn.Module): .. note:: The full parameters can be modified, but only the portion corresponding to the local param shard will persist after the context manager exits (unless ``writeback=False``, in which case - changes will be discarded). + changes will be discarded). In the case where FSDP does not shard + the parameters, currently only when world_size == 1, the + modification is persisted regardless of ``writeback``. Args: recurse (bool, Optional): recursively summon all params for nested FSDP instances (default: True) From 08889b24df608a896a4fb8708082a9554816c2ab Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 15 Feb 2022 15:54:37 -0800 Subject: [PATCH 073/199] [FSDP] Improved shape unflattening test (#72573) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72573 Verify shapes are restored appropriately in this test. 
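The shape property being asserted can be illustrated with a small standalone sketch (plain tensors, no process group; the numbers mirror the test below and assume a world size of 2):

```python
# Hypothetical sketch of the invariant the updated test checks: a weight that is
# flattened (and sharded) for FSDP must come back with its original shape when
# the full parameters are summoned.
import torch

layer_shape = (10, 12)
weight = torch.randn(layer_shape)

flat = weight.flatten()               # analogous to FSDP's flat_param
world_size = 2
shard = flat.chunk(world_size)[0]     # each rank keeps only its shard
assert shard.numel() == layer_shape[0] * layer_shape[1] // 2

restored = flat.view(layer_shape)     # summoning full params must restore the shape
assert restored.shape == weight.shape
```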
ghstack-source-id: 149111043 Test Plan: CI Reviewed By: zhaojuanmao Differential Revision: D34101125 fbshipit-source-id: 94260da2b7420cf58c5569e596885aa65fe7726e (cherry picked from commit e57a30e8e4caea0593836e52084194a3d3497b72) --- .../fsdp/test_fsdp_summon_full_params.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py index 34d1cf920fa..fbb2ef2bd2e 100644 --- a/test/distributed/fsdp/test_fsdp_summon_full_params.py +++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py @@ -1,5 +1,6 @@ # Owner(s): ["oncall: distributed"] import itertools +from copy import deepcopy import math import sys @@ -308,15 +309,15 @@ class TestSummonFullParams(FSDPTest): @skip_if_lt_x_gpu(2) def test_params_are_unflattenned(self): - model = FSDP(nn.Linear(self.world_size, 1, bias=False)).cuda(self.rank) + layer_shape = (10, 12) + model = nn.Linear(*layer_shape, bias=False).cuda(self.rank) + fsdp_model = FSDP(deepcopy(model)).cuda(self.rank) - flattened_param = model.get_parameter("_fsdp_wrapped_module.flat_param") - self.assertEqual(1, flattened_param.numel()) + flattened_param = fsdp_model.get_parameter("_fsdp_wrapped_module.flat_param") + self.assertEqual(layer_shape[0] * layer_shape[1] / 2, flattened_param.numel()) - with model._summon_full_params(): - a = model.weight.flatten().detach() - b = flattened_param.detach() - self.assertTrue(torch.equal(a, b)) + with fsdp_model._summon_full_params(): + self.assertEqual(fsdp_model.weight.shape, model.weight.shape) @skip_if_lt_x_gpu(2) def test_params_count_and_value(self): From aeacf910b56d3b3fa3a648b5995538dd1abedc42 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 15 Feb 2022 16:00:19 -0800 Subject: [PATCH 074/199] [Checkpoint] Rename file (#72748) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72748 Removes underscore from file/class as directory is already private ghstack-source-id: 149109295 Test Plan: Ci Reviewed By: samdow Differential Revision: D34179308 fbshipit-source-id: 8e956f3c83f21159c5e0fcdce09624ecb8a73ac0 (cherry picked from commit adfd8bc357b2ee4920054a3c984464b51daf0e35) --- test/distributed/fsdp/test_fsdp_checkpoint.py | 2 +- .../{_checkpoint_wrapper.py => checkpoint_wrapper.py} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename torch/distributed/algorithms/_checkpoint/{_checkpoint_wrapper.py => checkpoint_wrapper.py} (95%) diff --git a/test/distributed/fsdp/test_fsdp_checkpoint.py b/test/distributed/fsdp/test_fsdp_checkpoint.py index e3dd483eaf6..1d7dee78832 100644 --- a/test/distributed/fsdp/test_fsdp_checkpoint.py +++ b/test/distributed/fsdp/test_fsdp_checkpoint.py @@ -10,7 +10,7 @@ from torch.distributed.fsdp.fully_sharded_data_parallel import ( FullyShardedDataParallel as FSDP, CPUOffload, ) -from torch.distributed.algorithms._checkpoint._checkpoint_wrapper import ( +from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( checkpoint_wrapper, ) from torch.testing._internal.common_distributed import ( diff --git a/torch/distributed/algorithms/_checkpoint/_checkpoint_wrapper.py b/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py similarity index 95% rename from torch/distributed/algorithms/_checkpoint/_checkpoint_wrapper.py rename to torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py index 163f4b457eb..077cf45569f 100644 --- a/torch/distributed/algorithms/_checkpoint/_checkpoint_wrapper.py +++ 
b/torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py @@ -11,7 +11,7 @@ class CheckpointImpl(Enum): NO_REENTRANT = auto() -class _CheckpointWrapper(torch.nn.Module): +class CheckpointWrapper(torch.nn.Module): """ An nn.Module that wraps another nn.Module with checkpointing. """ @@ -76,4 +76,4 @@ def checkpoint_wrapper( "checkpoint implementation." ) - return _CheckpointWrapper(module, checkpoint_impl, offload_to_cpu) + return CheckpointWrapper(module, checkpoint_impl, offload_to_cpu) From ec3a5ca6d3f7762922effc3d44132a7723bf8e16 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 15 Feb 2022 16:00:19 -0800 Subject: [PATCH 075/199] [monitored barrier] Slight logging enhancement (#72754) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72754 Log to clarify that this msg is coming from rank 0 ghstack-source-id: 149109294 Test Plan: CI Reviewed By: H-Huang Differential Revision: D34188480 fbshipit-source-id: 38d772392148b0dba97b619f8c8dbef1daf86008 (cherry picked from commit 00e2d3e44bd8ccf578693edb1689c3b694ac11e1) --- torch/csrc/distributed/c10d/ProcessGroupGloo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp index d95afa32ec8..1297af592d9 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp @@ -2814,7 +2814,7 @@ void ProcessGroupGloo::monitoredBarrier( TORCH_INTERNAL_ASSERT(!failedRanks.empty()); const std::string ranksStr = c10::Join(", ", failedRanks); const std::string error = c10::str( - "Ranks ", + "[Rank 0]: Ranks ", ranksStr, " failed to pass monitoredBarrier in ", monitoredBarrierTimeout.count(), From ad38b92f5db6e4b0a467cd2303563450c1dde6d7 Mon Sep 17 00:00:00 2001 From: dzdang Date: Tue, 15 Feb 2022 16:55:47 -0800 Subject: [PATCH 076/199] [Quant][core][devs] Separated implementations for quantized & non-quantized tensors in reflection_pad2d_cpu (#72442) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72442 This PR is part of a series of PRs addressing https://github.com/pytorch/pytorch/issues/54150, related to using dispatcher for calls to quantized backends as opposed to if/else conditionals. This particular PR separates the calls to quantized & non-quantized backends for reflection_pad2d_cpu using a dispatcher. 
Differential Revision: D34046271 D34046271 Test Plan: Imported from OSS Reviewed By: jerryzh168 Pulled By: dzdang fbshipit-source-id: fef7148ba7589856b3a82d37106ac4018efeec37 (cherry picked from commit 5cbdb264c0dbaf061a230b3ebca04c1c1d52b8c6) --- aten/src/ATen/native/ReflectionPad.cpp | 18 ++++++++---------- aten/src/ATen/native/native_functions.yaml | 3 ++- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp index 81eba80af1d..9a5fa021bb4 100644 --- a/aten/src/ATen/native/ReflectionPad.cpp +++ b/aten/src/ATen/native/ReflectionPad.cpp @@ -940,18 +940,16 @@ Tensor& reflection_pad2d_out_cpu(const Tensor& input, IntArrayRef padding, } Tensor reflection_pad2d_cpu(const Tensor& input, IntArrayRef padding) { - Tensor output; - if (input.is_quantized()) { - if (input.qscheme() == kPerTensorAffine) { - output = at::_empty_affine_quantized({0}, input.options(), + Tensor output = at::empty({0}, input.options()); + reflection_pad2d_out_template(output, input, padding); + return output; +} + +Tensor reflection_pad2d_quantized_cpu(const Tensor& input, IntArrayRef padding) { + TORCH_CHECK(input.qscheme() == kPerTensorAffine, "Only per tensor quantization is supported"); + Tensor output = at::_empty_affine_quantized({0}, input.options(), input.q_scale(), input.q_zero_point()); - } else { - TORCH_CHECK(false, "Only per tensor quantization is supported"); - } - } else { - output = at::empty({0}, input.options()); - } reflection_pad2d_out_template(output, input, padding); return output; } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 7c252141099..824ff73091e 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -9438,7 +9438,8 @@ - func: reflection_pad2d(Tensor self, int[4] padding) -> Tensor python_module: nn dispatch: - CPU, QuantizedCPU: reflection_pad2d_cpu + CPU: reflection_pad2d_cpu + QuantizedCPU: reflection_pad2d_quantized_cpu CUDA: reflection_pad2d_cuda - func: reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, int[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
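As a usage-level sketch of the split above (assuming a build that includes this change), the same Python call is expected to route by dispatch key, CPU vs QuantizedCPU, rather than by an in-kernel `if/else`; note that the quantized path requires per-tensor affine quantization:

```python
# Rough illustration, not part of this patch: reflection padding dispatched for a
# float tensor and for a per-tensor quantized tensor on CPU.
import torch
import torch.nn.functional as F

x = torch.randn(1, 1, 4, 4)
y = F.pad(x, (1, 1, 1, 1), mode="reflect")    # expected to hit reflection_pad2d_cpu

xq = torch.quantize_per_tensor(x, scale=0.1, zero_point=0, dtype=torch.quint8)
yq = F.pad(xq, (1, 1, 1, 1), mode="reflect")  # expected to hit reflection_pad2d_quantized_cpu
```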
From 7a031ec17f5cfc052e618694d4bee28381a6051d Mon Sep 17 00:00:00 2001 From: Terry Chen Date: Tue, 15 Feb 2022 17:40:18 -0800 Subject: [PATCH 077/199] [Qunat] Add ConvTranspose reference module (#72473) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72473 Add ConvTranspose reference module Test Plan: python3 test/test_quantization.py TestQuantizeEagerOps.test_conv_transpose_op Imported from OSS Reviewed By: jerryzh168 Differential Revision: D34126542 fbshipit-source-id: 7da167695a1fd9c141059bce14cce4f0608b086c (cherry picked from commit dee22dcf483e5b7f057ff6038acb893a6b01c91e) --- .../eager/test_quantize_eager_ptq.py | 126 +++++++++++++ torch/ao/quantization/quantize.py | 12 +- .../quantized/_reference/modules/__init__.py | 5 +- torch/nn/quantized/_reference/modules/conv.py | 169 +++++++++++++++++- 4 files changed, 308 insertions(+), 4 deletions(-) diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py index 6587740bdf9..f022072406f 100644 --- a/test/quantization/eager/test_quantize_eager_ptq.py +++ b/test/quantization/eager/test_quantize_eager_ptq.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn import torch.nn.quantized as nnq +import torch.nn.quantized._reference as nnqr from torch.nn.utils.rnn import PackedSequence from torch.ao.quantization import ( quantize, @@ -74,6 +75,131 @@ import unittest import numpy as np class TestQuantizeEagerOps(QuantizationTestCase): + def _test_reference_module_impl(self, + float_module_class, + quantized_module_class, + extra_module_kwargs, + input_size): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = float_module_class(**extra_module_kwargs) + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.dequant(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = float_module_class(**extra_module_kwargs) + self.quant1 = QuantStub() + self.dequant1 = DeQuantStub() + self.quant2 = QuantStub() + self.dequant2 = DeQuantStub() + + def forward(self, x): + x = self.quant1(x) + x = self.dequant1(x) + x = self.conv(x) + x = self.quant2(x) + x = self.dequant2(x) + return x + + + data = torch.randn(*input_size, dtype=torch.float) + original_m = M() + original_ref_m = RefM() + torch.quantization.engine = 'qnnpack' + + original_ref_m.conv.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + + original_m.qconfig = torch.quantization.get_default_qconfig('qnnpack') + + m = prepare(original_m) + # calibration + m(data) + + m = convert(m) + # check if the module is properly quantized + self.assertEqual(type(m.quant), nnq.Quantize) + self.assertEqual(type(m.conv), quantized_module_class) + self.assertEqual(type(m.dequant), nnq.DeQuantize) + res = m(data) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = torch.quantization.get_default_qconfig('qnnpack') + + ref_m = prepare(original_ref_m) + ref_m(data) + reference_module_mapping = { + QuantStub: nnq.Quantize, + DeQuantStub: nnq.DeQuantize, + nn.Conv1d: nnqr.Conv1d, + nn.Conv2d: nnqr.Conv2d, + nn.Conv3d: nnqr.Conv3d, + nn.ConvTranspose1d: nnqr.ConvTranspose1d, + nn.ConvTranspose2d: nnqr.ConvTranspose2d, + nn.ConvTranspose3d: nnqr.ConvTranspose3d, + } + ref_m = convert(ref_m, mapping=reference_module_mapping) + ref_res = ref_m(data) + 
self.assertEqual(res, ref_res) + + def test_conv_1d(self): + self._test_reference_module_impl( + nn.Conv1d, + nnq.Conv1d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 1) + ) + + def test_conv_2d(self): + self._test_reference_module_impl( + nn.Conv2d, + nnq.Conv2d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10) + ) + + def test_conv_3d(self): + self._test_reference_module_impl( + nn.Conv3d, + nnq.Conv3d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10, 10) + ) + + def test_conv_transpose_1d(self): + self._test_reference_module_impl( + nn.ConvTranspose1d, + nnq.ConvTranspose1d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 1) + ) + + def test_conv_transpose_2d(self): + self._test_reference_module_impl( + nn.ConvTranspose2d, + nnq.ConvTranspose2d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10) + ) + + def test_conv_transpose_3d(self): + self._test_reference_module_impl( + nn.ConvTranspose3d, + nnq.ConvTranspose3d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10, 10) + ) + def _test_activation_op_impl( self, float_module_class, quantized_module_class, extra_module_kwargs): """ Implementation for testing common activation ops like leaky relu diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py index 5afff09b64b..fad2b8abe6e 100644 --- a/torch/ao/quantization/quantize.py +++ b/torch/ao/quantization/quantize.py @@ -16,7 +16,7 @@ from torch.ao.quantization.quantization_mappings import ( _has_special_act_post_process, _get_special_act_post_process, ) - +from .utils import get_qparam_dict from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper from torch.ao.quantization.qconfig import ( add_module_to_qconfig_obs_ctr, @@ -565,7 +565,15 @@ def swap_module(mod, mapping, custom_module_class_mapping): new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) swapped = True elif type(mod) in mapping: - new_mod = mapping[type(mod)].from_float(mod) + qmod = mapping[type(mod)] + if hasattr(qmod, '_IS_REFERENCE') and qmod._IS_REFERENCE: + assert mod.qconfig is not None + weight_post_process = mod.qconfig.weight() + weight_post_process(mod.weight) + weight_qparams = get_qparam_dict(weight_post_process) + new_mod = qmod.from_float(mod, weight_qparams) + else: + new_mod = qmod.from_float(mod) swapped = True if swapped: diff --git a/torch/nn/quantized/_reference/modules/__init__.py b/torch/nn/quantized/_reference/modules/__init__.py index 441852c38f9..efbefdbde60 100644 --- a/torch/nn/quantized/_reference/modules/__init__.py +++ b/torch/nn/quantized/_reference/modules/__init__.py @@ -1,9 +1,12 @@ from .linear import Linear -from .conv import Conv1d, Conv2d, Conv3d +from .conv import Conv1d, Conv2d, Conv3d, ConvTranspose1d, ConvTranspose2d, ConvTranspose3d __all__ = [ 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', + 'ConvTranspose1d', + 'ConvTranspose2d', + 'ConvTranspose3d', ] diff --git a/torch/nn/quantized/_reference/modules/conv.py b/torch/nn/quantized/_reference/modules/conv.py index ed151cb7f5e..60aed0a91ac 100644 --- a/torch/nn/quantized/_reference/modules/conv.py +++ b/torch/nn/quantized/_reference/modules/conv.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, List from torch.nn.common_types import _size_1_t from .utils import _quantize_weight, _quantize_and_dequantize_weight from .utils 
import _save_weight_qparams @@ -14,6 +14,7 @@ class _ConvNd(torch.nn.modules.conv._ConvNd): this is useful when user want to use this module in other backends like Glow. """ __annotations__ = {"bias": Optional[torch.Tensor]} + _IS_REFERENCE = True def _save_to_state_dict(self, destination, prefix, keep_vars): super()._save_to_state_dict(destination, prefix, keep_vars) @@ -217,3 +218,169 @@ class Conv3d(_ConvNd, nn.Conv3d): @classmethod def from_float(cls, float_conv, weight_qparams): return _ConvNd.from_float(cls, float_conv, weight_qparams) + +class _ConvTransposeNd(_ConvNd, torch.nn.modules.conv._ConvTransposeNd): + """ A reference version of nn.quantized.ConvTranspose2d + we will not pack the parameters in this module, since weight packing is an + optimization for quantized backends supported in PyTorch (fbgemm/qnnpack), + this is useful when user want to use this module in other backends like Glow. + """ + @staticmethod + def from_float(cls, float_conv, weight_qparams): + qref_conv = cls( + float_conv.in_channels, + float_conv.out_channels, + float_conv.kernel_size, # type: ignore[arg-type] + float_conv.stride, # type: ignore[arg-type] + float_conv.padding, # type: ignore[arg-type] + float_conv.output_padding, # type: ignore[arg-type] + float_conv.groups, + float_conv.bias is not None, # type: ignore[arg-type] + float_conv.dilation, # type: ignore[arg-type] + float_conv.padding_mode, + device=float_conv.weight.device, + dtype=float_conv.weight.dtype, + weight_qparams=weight_qparams) + qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach()) + if float_conv.bias is not None: + qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach()) + return qref_conv + + +class ConvTranspose1d(_ConvTransposeNd, nn.ConvTranspose1d): + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: _size_1_t = 0, + output_padding: _size_1_t = 0, + groups: int = 1, + bias: bool = True, + dilation: _size_1_t = 1, + padding_mode: str = "zeros", + device=None, + dtype=None, + weight_qparams: Optional[Dict[str, Any]] = None): + nn.ConvTranspose1d.__init__( + self, in_channels, out_channels, kernel_size, stride, padding, output_padding, + groups, bias, dilation, padding_mode, device, dtype) + self._init_weight_qparams(weight_qparams, device) + + def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.convTranspose1d --- + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.convTranspose1d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv1d + """ + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
+ output_padding = self._output_padding( + input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] + + weight_dequant = self.get_weight() + result = F.conv_transpose1d( + x, weight_dequant, self.bias, self.stride, + self.padding, output_padding, self.groups, self.dilation) + return result + + def _get_name(self): + return "QuantizedConvTranspose1d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): + return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) + +class ConvTranspose2d(_ConvTransposeNd, nn.ConvTranspose2d): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, output_padding=0, + groups=1, bias=True, dilation=1, + padding_mode='zeros', + device=None, + dtype=None, + weight_qparams: Optional[Dict[str, Any]] = None): + + nn.ConvTranspose2d.__init__( + self, in_channels, out_channels, kernel_size, stride, padding, output_padding, + groups, bias, dilation, padding_mode, device, dtype) + self._init_weight_qparams(weight_qparams, device) + + def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.convTranspose2d --- + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.convTranspose2d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv2d + """ + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. + + output_padding = self._output_padding( + input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] + + weight_dequant = self.get_weight() + result = F.conv_transpose2d( + x, weight_dequant, self.bias, self.stride, + self.padding, output_padding, self.groups, self.dilation) + + return result + + def _get_name(self): + return "QuantizedConvTranspose2d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): + return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) + +class ConvTranspose3d(_ConvTransposeNd, nn.ConvTranspose3d): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, output_padding=0, + groups=1, bias=True, dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + weight_qparams: Optional[Dict[str, Any]] = None): + nn.ConvTranspose3d.__init__( + self, in_channels, out_channels, kernel_size, stride, padding, output_padding, + groups, bias, dilation, padding_mode, device, dtype) + self._init_weight_qparams(weight_qparams, device) + + def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.convTranspose3d --- + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.convTranspose3d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv3d + """ + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
+ output_padding = self._output_padding( + input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] + + weight_dequant = self.get_weight() + result = F.conv_transpose3d( + x, weight_dequant, self.bias, self.stride, + self.padding, output_padding, self.groups, self.dilation) + return result + + def _get_name(self): + return "QuantizedConvTranspose3d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): + return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) From f67cf03526561229ae583fa878277dc515d9c6a9 Mon Sep 17 00:00:00 2001 From: Terry Chen Date: Tue, 15 Feb 2022 18:26:02 -0800 Subject: [PATCH 078/199] [Quant] Add qint32 quantization support (#72472) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72472 Add dtype=int32 support for observer Test Plan: python3 test/test_quantization.py TestObserver.test_per_tensor_observers Imported from OSS Reviewed By: jerryzh168 Differential Revision: D34056640 fbshipit-source-id: 4fa15a7274cfbb6a7dd4e698e3989cc0c0626e7b (cherry picked from commit bf4351de45812a9143b179b7049cc1e93611daf6) --- .../quantization/core/test_workflow_module.py | 64 ++++++++++++------- torch/ao/quantization/observer.py | 1 + torch/ao/quantization/utils.py | 18 ++++-- 3 files changed, 57 insertions(+), 26 deletions(-) diff --git a/test/quantization/core/test_workflow_module.py b/test/quantization/core/test_workflow_module.py index 5415e2b03dc..77fb492984c 100644 --- a/test/quantization/core/test_workflow_module.py +++ b/test/quantization/core/test_workflow_module.py @@ -68,50 +68,70 @@ NP_RANDOM_SEED = 19 tolerance = 1e-6 class TestObserver(QuantizationTestCase): - @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)), + @given(qdtype=st.sampled_from((torch.qint8, torch.quint8, torch.qint32)), qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric)), reduce_range=st.booleans()) def test_per_tensor_observers(self, qdtype, qscheme, reduce_range): # reduce_range cannot be true for symmetric quantization with uint8 - if qdtype == torch.quint8 and qscheme == torch.per_tensor_symmetric: + if (qdtype == torch.quint8 and qscheme == torch.per_tensor_symmetric) or qdtype == torch.qint32: reduce_range = False ObserverList = [MinMaxObserver(dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range), MovingAverageMinMaxObserver(averaging_constant=0.5, dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range)] + + def _get_ref_params(reduce_range, qscheme, dtype, input_scale, min_val, max_val): + eps = torch.tensor([tolerance]) + if dtype == torch.qint8: + if reduce_range: + quant_min, quant_max = -64, 63 + else: + quant_min, quant_max = -128, 127 + elif dtype == torch.quint8: + if reduce_range: + quant_min, quant_max = 0, 127 + else: + quant_min, quant_max = 0, 255 + elif dtype == torch.qint32: + quant_min, quant_max = -1 * (2 ** 31), (2 ** 31) - 1 + + min_val_neg = torch.tensor([0.]) + max_val_pos = torch.tensor([input_scale * max_val]) if qdtype is torch.qint32 else torch.tensor([max_val]) + + scale, zero_point = 1.0, 0 + if qscheme == torch.per_tensor_symmetric or qscheme == torch.per_channel_symmetric: + scale = torch.max(-min_val_neg, max_val_pos) / (float(quant_max - quant_min) / 2) + scale = torch.max(scale, eps) + if dtype == torch.quint8: + zero_point = 128 + else: + scale = torch.max((max_val_pos - min_val_neg) / float(quant_max - quant_min), eps) + zero_point = quant_min - torch.round(min_val_neg / scale).to(torch.int) + zero_point = 
torch.clamp(zero_point, quant_min, quant_max) + return scale, zero_point + for myobs in ObserverList: # Calculate Qparams should return with a warning for observers with no data qparams = myobs.calculate_qparams() + input_scale = 2**16 if qdtype is torch.qint32 else 1 if type(myobs) == MinMaxObserver: - x = torch.tensor([1.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) - y = torch.tensor([4.0, 5.0, 5.0, 6.0, 7.0, 8.0]) + x = torch.tensor([1.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) * input_scale + y = torch.tensor([4.0, 5.0, 5.0, 6.0, 7.0, 8.0]) * input_scale else: # Moving average of min/max for x and y matches that of # extreme values for x/y used for minmax observer - x = torch.tensor([0.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) - y = torch.tensor([2.0, 5.0, 5.0, 6.0, 7.0, 10.0]) + x = torch.tensor([0.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) * input_scale + y = torch.tensor([2.0, 5.0, 5.0, 6.0, 7.0, 10.0]) * input_scale result = myobs(x) result = myobs(y) self.assertEqual(result, y) - self.assertEqual(myobs.min_val, 1.0) - self.assertEqual(myobs.max_val, 8.0) + self.assertEqual(myobs.min_val, 1.0 * input_scale) + self.assertEqual(myobs.max_val, 8.0 * input_scale) qparams = myobs.calculate_qparams() - if reduce_range: - if qscheme == torch.per_tensor_symmetric: - ref_scale = 0.062745 * 255 / 127 - ref_zero_point = 0 if qdtype is torch.qint8 else 128 - else: - ref_scale = 0.0313725 * 255 / 127 - ref_zero_point = -64 if qdtype is torch.qint8 else 0 - else: - if qscheme == torch.per_tensor_symmetric: - ref_scale = 0.062745 - ref_zero_point = 0 if qdtype is torch.qint8 else 128 - else: - ref_scale = 0.0313725 - ref_zero_point = -128 if qdtype is torch.qint8 else 0 + ref_scale, ref_zero_point = _get_ref_params(reduce_range, qscheme, qdtype, input_scale, 1.0, 8.0) + self.assertEqual(qparams[1].item(), ref_zero_point) self.assertEqual(qparams[0].item(), ref_scale, atol=1e-5, rtol=0) state_dict = myobs.state_dict() diff --git a/torch/ao/quantization/observer.py b/torch/ao/quantization/observer.py index 4263f4e40b6..73f911a68f7 100644 --- a/torch/ao/quantization/observer.py +++ b/torch/ao/quantization/observer.py @@ -195,6 +195,7 @@ class _ObserverBase(ObserverBase): torch.qint8, torch.quint8, torch.quint4x2, + torch.qint32, ), "Default Observer only works for qint8, quint8 and quint4x2 data type" self.has_customized_qrange = (quant_min is not None) and (quant_max is not None) if self.has_customized_qrange: diff --git a/torch/ao/quantization/utils.py b/torch/ao/quantization/utils.py index aee250c0ca0..a51db1beeee 100644 --- a/torch/ao/quantization/utils.py +++ b/torch/ao/quantization/utils.py @@ -271,7 +271,10 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b # This initialization here is to be resolve TorchScript compilation issues and allow # using of refinement to decouple initial_qmin and initial_qmax from quantization range. # The actual values of initial_qmin and initial_qmax will be reset below. - initial_quant_min, initial_quant_max = 0, 255 + if dtype == torch.qint32: + initial_quant_min, initial_quant_max = 0, 2**31 - 1 + else: + initial_quant_min, initial_quant_max = 0, 255 # The following assignment of self.qmin and self.qmax to the local variables and the if check refine the # attribute from Optional valid integers for use, based on TorchScript's requirements. 
custom_quant_min, custom_quant_max = quant_min, quant_max @@ -282,9 +285,14 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b ) qrange_len = initial_quant_max - initial_quant_min + 1 - assert ( - 0 < qrange_len <= 256 - ), "quantization range should be positive and not exceed the maximum bit range (=256)." + if dtype == torch.qint8: + assert ( + 0 < qrange_len <= 256 + ), "quantization range should be positive and not exceed the maximum bit range (=256)." + elif dtype == torch.qint32: + assert ( + 0 < qrange_len <= 2**31 + ), "quantization range should be positive and not exceed the maximum bit range (=4294967296)." if dtype == torch.qint8: quant_min, quant_max = -qrange_len // 2, qrange_len // 2 - 1 else: @@ -303,6 +311,8 @@ def calculate_qmin_qmax(quant_min: int, quant_max: int, has_customized_qrange: b quant_min, quant_max = 0, 127 else: quant_min, quant_max = 0, 255 + elif dtype == torch.qint32: + quant_min, quant_max = -1 * (2 ** 31), (2 ** 31) - 1 else: quant_min, quant_max = 0, 15 return quant_min, quant_max From 1750c0177ed9ed45c6b470b94a96c4a0e25935a5 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Tue, 15 Feb 2022 18:41:50 -0800 Subject: [PATCH 079/199] Move dyn fusion api to jit/api/module/ (#72638) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72638 All of the other user-facing apis are in this header, this should be too. Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D34159122 Pulled By: eellison fbshipit-source-id: 71110ad5543246d0fa822c426ad1cf2f65e017f6 (cherry picked from commit f1862b32e15b32c0e9bc18d3c7769e2a668b2678) --- torch/csrc/jit/api/module.h | 53 ++++++++++++++++++ .../runtime/profiling_graph_executor_impl.h | 54 +------------------ 2 files changed, 54 insertions(+), 53 deletions(-) diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index c2506c6a9ec..e16f8b6b998 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -301,6 +301,59 @@ TORCH_API Module optimize_for_inference( Module& module, const std::vector& other_methods = {}); +enum class FusionBehavior { STATIC, DYNAMIC }; + +using FusionStrategy = std::vector>; +// FusionStrategy is used to control the type and number of specializations that +// can occur during fusion +// +// Usage: provide a list of pairs (type, depth) where type is one of "STATIC" or +// "DYNAMIC" and depth is an integer. +// +// Behavior - static vs dynamic: +// - in STATIC fusion, fused ops are compiled to have fixed input shapes. The +// input shapes are determined based on a number of initial profiling runs. +// The shape is determined based on some initial profiling runs. For example, +// if on the first run an input of shape [2, 4] is observed, then the compiled +// op will only work on shapes of size [2, 4]. +// - in DYNAMIC fusion, fused ops are compiled to have variable input shapes, so +// that multiple shapes are possible. Dynamic fusion uses "symbolic shapes", +// where any dimensions of the same value that are observed in profiling runs +// are assumed to have the same value. For example, if inputs of [2,3,4] and +// [3,4,5] are observed, then it is assumed that future inputs will have +// shapes [a,b,c] and [b,c,d] for some values of a,b,c,d. +// +// In both cases, we also recompile on new striding behavior, device, or +// dtype. +// +// Behavior - fallback functions & depth: +// When an input doesn't match the format required by the specialized compiled +// op, it will run a fallback function. 
+// Fallback functions can also recursively be compiled and specialized based +// on the input shape. Since compilation can be slow, the "depth" parameter is +// provided to limit the number of specializations that can be compiled, +// before JIT gives up on recompiling and falls back to a completely un-fused, +// un-specialized implementation. +// +// The list of (type, depth) pairs controls the type of specializations and the +// number of specializations. For example: [("STATIC", 2), ("DYNAMIC", 2)] +// indicates that the first two specializations will use static fusions, the +// following two specializations will use dynamic fusion, and any inputs that +// satisfy none of the 4 options will run an unfused implementation. +// Below an example of the fallback function structure is shown, if given a +// strategy of [("STATIC", 2), ("DYNAMIC", 2)] and if consecutive runs had +// these input shapes: +// [2, 2], [3, 3], [4, 4], [3, 5], ... +// +// + specialized: statically fused, shape [2, 2] +// \-> + fallback 1; statically fused, shape [3, 3] +// \-> + fallback 2; dynamically fused, shape [A, A] +// \-> + fallback 3: dynamically fused, shape [A, B] +// \-> final fallback: unspecialized, unfused +TORCH_API FusionStrategy getFusionStrategy(); +// returns previous strategy +TORCH_API FusionStrategy setFusionStrategy(FusionStrategy& fusion_strategy); + namespace detail { struct TORCH_API SlotCursor { diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h index af9d1a25c6f..5ae3241d6f3 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h @@ -1,62 +1,10 @@ #pragma once #include +#include namespace torch { namespace jit { -enum class FusionBehavior { STATIC, DYNAMIC }; - -using FusionStrategy = std::vector>; -// FusionStrategy is used to control the type and number of specializations that -// can occur during fusion -// -// Usage: provide a list of pairs (type, depth) where type is one of "STATIC" or -// "DYNAMIC" and depth is an integer. -// -// Behavior - static vs dynamic: -// - in STATIC fusion, fused ops are compiled to have fixed input shapes. The -// input shapes are determined based on a number of initial profiling runs. -// The shape is determined based on some initial profiling runs. For example, -// if on the first run an input of shape [2, 4] is observed, then the compiled -// op will only work on shapes of size [2, 4]. -// - in DYNAMIC fusion, fused ops are compiled to have variable input shapes, so -// that multiple shapes are possible. Dynamic fusion uses "symbolic shapes", -// where any dimensions of the same value that are observed in profiling runs -// are assumed to have the same value. For example, if inputs of [2,3,4] and -// [3,4,5] are observed, then it is assumed that future inputs will have -// shapes [a,b,c] and [b,c,d] for some values of a,b,c,d. -// -// In both cases, we also recompile on new striding behavior, device, or -// dtype. -// -// Behavior - fallback functions & depth: -// When an input doesn't match the format required by the specialized compiled -// op, it will run a fallback function. -// Fallback functions can also recursively be compiled and specialized based -// on the input shape. 
Since compilation can be slow, the "depth" parameter is -// provided to limit the number of specializations that can be compiled, -// before JIT gives up on recompiling and falls back to a completely un-fused, -// un-specialized implementation. -// -// The list of (type, depth) pairs controls the type of specializations and the -// number of specializations. For example: [("STATIC", 2), ("DYNAMIC", 2)] -// indicates that the first two specializations will use static fusions, the -// following two specializations will use dynamic fusion, and any inputs that -// satisfy none of the 4 options will run an unfused implementation. -// Below an example of the fallback function structure is shown, if given a -// strategy of [("STATIC", 2), ("DYNAMIC", 2)] and if consecutive runs had -// these input shapes: -// [2, 2], [3, 3], [4, 4], [3, 5], ... -// -// + specialized: statically fused, shape [2, 2] -// \-> + fallback 1; statically fused, shape [3, 3] -// \-> + fallback 2; dynamically fused, shape [A, A] -// \-> + fallback 3: dynamically fused, shape [A, B] -// \-> final fallback: unspecialized, unfused -TORCH_API FusionStrategy getFusionStrategy(); -// returns previous strategy -TORCH_API FusionStrategy setFusionStrategy(FusionStrategy& fusion_strategy); - struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { ProfilingGraphExecutorImpl( const std::shared_ptr& graph, From f8a2efc190fa512b17d044a41d0a7896b835e291 Mon Sep 17 00:00:00 2001 From: Elias Ellison Date: Tue, 15 Feb 2022 18:41:50 -0800 Subject: [PATCH 080/199] Make fusion strategy api public (#72639) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72639 Test Plan: Imported from OSS Reviewed By: soulitzer Differential Revision: D34159123 Pulled By: eellison fbshipit-source-id: 27e4d9694a83e8d6829009882715be4308c96a9f (cherry picked from commit 1cadcd2f756ae3b52bcc5c60ff266dd950acfc56) --- docs/source/jit.rst | 1 + torch/csrc/jit/api/module.h | 78 +++++++++++++++---------------------- torch/jit/__init__.py | 3 +- torch/jit/_fuser.py | 57 +++++++++++---------------- 4 files changed, 57 insertions(+), 82 deletions(-) diff --git a/docs/source/jit.rst b/docs/source/jit.rst index 8a80b6471e1..23426fb3d9e 100644 --- a/docs/source/jit.rst +++ b/docs/source/jit.rst @@ -61,6 +61,7 @@ Creating TorchScript Code ScriptFunction freeze optimize_for_inference + set_fusion_strategy save load ignore diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h index e16f8b6b998..a040b953be1 100644 --- a/torch/csrc/jit/api/module.h +++ b/torch/csrc/jit/api/module.h @@ -304,52 +304,38 @@ TORCH_API Module optimize_for_inference( enum class FusionBehavior { STATIC, DYNAMIC }; using FusionStrategy = std::vector>; -// FusionStrategy is used to control the type and number of specializations that -// can occur during fusion -// -// Usage: provide a list of pairs (type, depth) where type is one of "STATIC" or -// "DYNAMIC" and depth is an integer. -// -// Behavior - static vs dynamic: -// - in STATIC fusion, fused ops are compiled to have fixed input shapes. The -// input shapes are determined based on a number of initial profiling runs. -// The shape is determined based on some initial profiling runs. For example, -// if on the first run an input of shape [2, 4] is observed, then the compiled -// op will only work on shapes of size [2, 4]. -// - in DYNAMIC fusion, fused ops are compiled to have variable input shapes, so -// that multiple shapes are possible. 
Dynamic fusion uses "symbolic shapes", -// where any dimensions of the same value that are observed in profiling runs -// are assumed to have the same value. For example, if inputs of [2,3,4] and -// [3,4,5] are observed, then it is assumed that future inputs will have -// shapes [a,b,c] and [b,c,d] for some values of a,b,c,d. -// -// In both cases, we also recompile on new striding behavior, device, or -// dtype. -// -// Behavior - fallback functions & depth: -// When an input doesn't match the format required by the specialized compiled -// op, it will run a fallback function. -// Fallback functions can also recursively be compiled and specialized based -// on the input shape. Since compilation can be slow, the "depth" parameter is -// provided to limit the number of specializations that can be compiled, -// before JIT gives up on recompiling and falls back to a completely un-fused, -// un-specialized implementation. -// -// The list of (type, depth) pairs controls the type of specializations and the -// number of specializations. For example: [("STATIC", 2), ("DYNAMIC", 2)] -// indicates that the first two specializations will use static fusions, the -// following two specializations will use dynamic fusion, and any inputs that -// satisfy none of the 4 options will run an unfused implementation. -// Below an example of the fallback function structure is shown, if given a -// strategy of [("STATIC", 2), ("DYNAMIC", 2)] and if consecutive runs had -// these input shapes: -// [2, 2], [3, 3], [4, 4], [3, 5], ... -// -// + specialized: statically fused, shape [2, 2] -// \-> + fallback 1; statically fused, shape [3, 3] -// \-> + fallback 2; dynamically fused, shape [A, A] -// \-> + fallback 3: dynamically fused, shape [A, B] -// \-> final fallback: unspecialized, unfused +// clang-format off +/* +Sets the type and number of specializations that can occur during fusion. + +Usage: provide a list of pairs (type, depth) where type is one of STATIC or DYNAMIC +and depth is an integer. + +Behavior - static vs dynamic: + In STATIC fusion, fused ops are compiled to have fixed input shapes. The shape is determined + based on some initial profiling runs. + In DYNAMIC fusion, fused ops are compiled to have variable input shapes, so that multiple + shapes are possible. + +In both cases, we also recompile on new striding behavior, device, or dtype. + +Behavior - fallback functions & depth: + When an input doesn't match the format required by the specialized compiled op, it will run + a fallback function. Fallback functions are recursively compiled and specialized based + on the observed tensor shapes. Since compilation can be slow, the "depth" parameter is provided to + limit the number of specializations that can be compiled, before giving up on recompiling and + falling back to a completely un-fused, un-specialized implementation. + +The list of (type, depth) pairs controls the type of specializations and the number of +specializations. For example: [(STATIC, 2), (DYNAMIC, 2)] indicates that the first +two specializations will use static fusions, the following two specializations will use +dynamic fusion, and any inputs that satisfy none of the 4 options will run an +unfused implementation. + +NB: in the future, as more fusion backends are added there may be more granular +apis for specific fusers.
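A minimal usage sketch of the Python binding exposed later in this patch (torch.jit.set_fusion_strategy); the scripted function and the sequence of input shapes are illustrative assumptions, not part of the patch:

    import torch

    @torch.jit.script
    def pointwise(x, y):
        return (x + y).relu() * y

    # The first two distinct shapes get static specializations, the next two get
    # dynamic ones; anything beyond that runs the unfused, unspecialized fallback.
    torch.jit.set_fusion_strategy([("STATIC", 2), ("DYNAMIC", 2)])
    for n in (2, 3, 4, 5, 6):
        t = torch.randn(n, n)
        pointwise(t, t)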
+*/ +// clang-format on TORCH_API FusionStrategy getFusionStrategy(); // returns previous strategy TORCH_API FusionStrategy setFusionStrategy(FusionStrategy& fusion_strategy); diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 77770129a6d..46f99fe7e23 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -48,13 +48,14 @@ from torch.jit._trace import ( ) from torch.jit._async import fork, wait from torch.jit._serialization import save, load -from torch.jit._fuser import optimized_execution, fuser, last_executed_optimized_graph, _set_fusion_strategy +from torch.jit._fuser import optimized_execution, fuser, last_executed_optimized_graph, set_fusion_strategy from torch.jit._freeze import freeze, optimize_for_inference, run_frozen_optimizations # For backwards compatibility _fork = fork _wait = wait +_set_fusion_strategy = set_fusion_strategy def export_opnames(m): diff --git a/torch/jit/_fuser.py b/torch/jit/_fuser.py index c704a67c5a3..62167beaf13 100644 --- a/torch/jit/_fuser.py +++ b/torch/jit/_fuser.py @@ -106,48 +106,35 @@ def _script_method_graph_for(self, parent, *args, **kwargs): self(*args, **kwargs) return last_executed_optimized_graph() -def _set_fusion_strategy(strategy: List[Tuple[str, int]]): +def set_fusion_strategy(strategy: List[Tuple[str, int]]): """ - Sets the type and number of specializations that can occur during fusion + Sets the type and number of specializations that can occur during fusion. Usage: provide a list of pairs (type, depth) where type is one of "STATIC" or "DYNAMIC" - and depth is an integer. - // + and depth is an integer. + Behavior - static vs dynamic: - - in STATIC fusion, fused ops are compiled to have fixed input shapes. The input shapes - are determined based on a number of initial profiling runs. The shape is determined based - on some initial profiling runs. For example, if on the first run an input of shape - [2, 4] is observed, then the compiled op will only work on shapes of size [2, 4]. - - in DYNAMIC fusion, fused ops are compiled to have variable input shapes, so that multiple - shapes are possible. Dynamic fusion uses "symbolic shapes", where any dimensions of the - same value that are observed in profiling runs are assumed to have the same value. - For example, if inputs of [2,3,4] and [3,4,5] are observed, then it is assumed that future - inputs will have shapes [a,b,c] and [b,c,d] for some values of a,b,c,d. + In STATIC fusion, fused ops are compiled to have fixed input shapes. The shape is determined + based on some initial profiling runs. + In DYNAMIC fusion, fused ops are compiled to have variable input shapes, so that multiple + shapes are possible. - In both cases, we also recompile on new striding behavior, device, or dtype. + In both cases, we also recompile on new striding behavior, device, or dtype. - // Behavior - fallback functions & depth: - When an input doesn't match the format required by the specialized compiled op, it will run - a fallback function. - Fallback functions can also recursively be compiled and specialized based on the input shape - Since compilation can be slow, the "depth" parameter is provided to limit the number of - specializations that can be compiled, before JIT gives up on recompiling and falls back - to a completely un-fused, un-specialized implementation. - // + When an input doesn't match the format required by the specialized compiled op, it will run + a fallback function. Fallback functions are recursively compiled and specialized based + on the observed tensor shapes.
Since compilation can be slow, the "depth" parameter is provided to + limit the number of specializations that can be compiled, before giving up on recompiling and + falling back to a completely un-fused, un-specialized implementation. + The list of (type, depth) pairs controls the type of specializations and the number of - specializations. For example: [("STATIC", 2), ("DYNAMIC", 2)] indicates that the first - two specializations will use static fusions, the following two specializations will use - dynamic fusion, and any inputs that satisfy none of the 4 options will run an - unfused implementation. - Below an example of the fallback function structure is shown, if given a strategy of - [("STATIC", 2), ("DYNAMIC", 2)] and if consecutive runs had these input shapes: - [2, 2], [3, 3], [4, 4], [3, 5], ... - // - + specialized: statically fused, shape [2, 2] - |-> + fallback 1; statically fused, shape [3, 3] - |-> + fallback 2; dynamically fused, shape [A, A] - |-> + fallback 3: dynamically fused, shape [A, B] - |-> final fallback: unspecialized, unfused + specializations. For example: [("STATIC", 2), ("DYNAMIC", 2)] indicates that the first + two specializations will use static fusions, the following two specializations will use + dynamic fusion, and any inputs that satisfy none of the 4 options will run an + unfused implementation. + + NB: in the future, as more fusion backends are added there may be more granular + apis for specific fusers. """ return torch._C._jit_set_fusion_strategy(strategy) From 763ad1bf251ddde7515cf959f4936e6da0325584 Mon Sep 17 00:00:00 2001 From: Shunting Zhang Date: Tue, 15 Feb 2022 19:08:53 -0800 Subject: [PATCH 081/199] (2/2) Make TorchScript Preserve Fully Qualified Class Name for Python Exceptions: frontend change (#72899) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72899 Reland D33282878 (https://github.com/pytorch/pytorch/commit/911d527b870bb4371da39be0c18a1ce109acb1d5). This is the frontend change. ghstack-source-id: 149204031 Test Plan: Refer to D33282878 (https://github.com/pytorch/pytorch/commit/911d527b870bb4371da39be0c18a1ce109acb1d5).
Also check CI Reviewed By: gmagogsfm Differential Revision: D34252127 fbshipit-source-id: 27b17ddd4d05d904eb91fd9ee094d9121f00e388 (cherry picked from commit 1d276baca308110ac40111ccd622400b3bbdc864) --- BUILD.bazel | 3 + test/cpp/jit/test_exception.cpp | 159 ++++++++++++++++ test/jit/myexception.py | 8 + test/jit/test_exception.py | 176 ++++++++++++++++++ test/test_jit.py | 147 --------------- torch/_jit_internal.py | 19 +- torch/csrc/jit/frontend/ir_emitter.cpp | 8 +- torch/csrc/jit/frontend/sugared_value.h | 12 +- .../csrc/jit/python/python_sugared_value.cpp | 5 +- torch/csrc/jit/python/python_sugared_value.h | 10 +- 10 files changed, 388 insertions(+), 159 deletions(-) create mode 100644 test/cpp/jit/test_exception.cpp create mode 100644 test/jit/myexception.py create mode 100644 test/jit/test_exception.py diff --git a/BUILD.bazel b/BUILD.bazel index 6590a7b1c3c..d9780aa23c3 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -1880,6 +1880,9 @@ cc_test( "test/cpp/jit/*.h", "test/cpp/tensorexpr/*.cpp", "test/cpp/tensorexpr/*.h", + ], exclude=[ + # skip this since is not found in OSS build + "test/cpp/jit/test_exception.cpp", ]), linkstatic = True, tags = [ diff --git a/test/cpp/jit/test_exception.cpp b/test/cpp/jit/test_exception.cpp new file mode 100644 index 00000000000..b6b3cbcd679 --- /dev/null +++ b/test/cpp/jit/test_exception.cpp @@ -0,0 +1,159 @@ +/* + * We have a python unit test for exceptions in test/jit/test_exception.py . + * Add a CPP version here to verify that excepted exception types thrown from + * C++. This is hard to test in python code since C++ exceptions will be + * translated to python exceptions. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace jit { + +namespace py = pybind11; + +TEST(TestException, TestAssertion) { + std::string pythonCode = R"PY( + def foo(): + raise AssertionError("An assertion failed") + )PY"; + auto cu_ptr = torch::jit::compile(pythonCode); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu_ptr->get_function("foo"); + std::cerr << "Graph is\n" << *gf->graph() << std::endl; + + bool is_jit_exception = false; + std::string message; + c10::optional exception_class; + try { + cu_ptr->run_method("foo"); + } catch (JITException& e) { + is_jit_exception = true; + message = e.what(); + exception_class = e.getPythonClassName(); + } + EXPECT_TRUE(is_jit_exception); + EXPECT_FALSE(exception_class); + EXPECT_TRUE( + message.find("RuntimeError: AssertionError: An assertion failed") != + std::string::npos); +} + +struct MyPythonExceptionValue : public torch::jit::SugaredValue { + explicit MyPythonExceptionValue(const py::object& exception_class) { + qualified_name_ = + (py::str(py::getattr(exception_class, "__module__", py::str(""))) + + py::str(".") + + py::str(py::getattr(exception_class, "__name__", py::str("")))) + .cast(); + } + + std::string kind() const override { + return "My Python exception"; + } + + // Simplified from PythonExceptionValue::call + std::shared_ptr call( + const torch::jit::SourceRange& loc, + torch::jit::GraphFunction& caller, + at::ArrayRef args, + at::ArrayRef kwargs, + size_t n_binders) override { + TORCH_CHECK(args.size() == 1); + Value* error_message = args.at(0).value(*caller.graph()); + Value* qualified_class_name = + insertConstant(*caller.graph(), qualified_name_, loc); + return std::make_shared( + error_message, qualified_class_name); + } + + private: + std::string qualified_name_; +}; + +class SimpleResolver : public torch::jit::Resolver { 
+ public: + explicit SimpleResolver() {} + + std::shared_ptr resolveValue( + const std::string& name, + torch::jit::GraphFunction& m, + const torch::jit::SourceRange& loc) override { + // follows toSugaredValue (toSugaredValue is defined in caffe2:_C which is + // a python extension. We can not add that as a cpp_binary's dep) + if (name == "SimpleValueError") { + py::object obj = py::globals()["SimpleValueError"]; + return std::make_shared(obj); + } + TORCH_CHECK(false, "resolveValue: can not resolve '", name, "{}'"); + } + + torch::jit::TypePtr resolveType( + const std::string& name, + const torch::jit::SourceRange& loc) override { + return nullptr; + } +}; + +/* + * - The python source code parsing for TorchScript here is learned from + * torch::jit::compile. + * - The code only parses one Def. If there are multiple in the code, those + * except the first one are skipped. + */ +TEST(TestException, TestCustomException) { + py::scoped_interpreter guard{}; + py::exec(R"PY( + class SimpleValueError(ValueError): + def __init__(self, message): + super(SimpleValueError, self).__init__(message) + )PY"); + + std::string pythonCode = R"PY( + def foo(): + raise SimpleValueError("An assertion failed") + )PY"; + + torch::jit::Parser p( + std::make_shared(pythonCode, "", 1)); + auto def = torch::jit::Def(p.parseFunction(/*is_method=*/false)); + std::cerr << "Def is:\n" << def << std::endl; + auto cu = std::make_shared(); + (void)cu->define( + c10::nullopt, + {}, + {}, + {def}, + // class PythonResolver is defined in + // torch/csrc/jit/python/script_init.cpp. It's not in a header file so I + // can not use it. Create a SimpleResolver instead + {std::make_shared()}, + nullptr); + torch::jit::GraphFunction* gf = + (torch::jit::GraphFunction*)&cu->get_function("foo"); + std::cerr << "Graph is\n" << *gf->graph() << std::endl; + bool is_jit_exception = false; + c10::optional exception_class; + std::string message; + try { + cu->run_method("foo"); + } catch (JITException& e) { + is_jit_exception = true; + exception_class = e.getPythonClassName(); + message = e.what(); + } + EXPECT_TRUE(is_jit_exception); + EXPECT_EQ("__main__.SimpleValueError", *exception_class); + EXPECT_TRUE( + message.find("__main__.SimpleValueError: An assertion failed") != + std::string::npos); +} + +} // namespace jit +} // namespace torch diff --git a/test/jit/myexception.py b/test/jit/myexception.py new file mode 100644 index 00000000000..5937bd3c91b --- /dev/null +++ b/test/jit/myexception.py @@ -0,0 +1,8 @@ +r""" +Define exceptions used in test_exception.py. We define them in a +separate file on purpose to make sure the fully qualified exception class name +is captured correctly in such cases. +""" +class MyKeyError(KeyError): + def __init__(self, msg): + super(KeyError, self).__init__(msg) diff --git a/test/jit/test_exception.py b/test/jit/test_exception.py new file mode 100644 index 00000000000..dce38e3be89 --- /dev/null +++ b/test/jit/test_exception.py @@ -0,0 +1,176 @@ +# Owner(s): ["oncall: jit"] +from torch.testing._internal.common_utils import TestCase +import torch +from torch import nn + +r""" +Test TorchScript exception handling.
+""" +class TestException(TestCase): + def test_pyop_exception_message(self): + class Foo(torch.jit.ScriptModule): + def __init__(self): + super(Foo, self).__init__() + self.conv = nn.Conv2d(1, 10, kernel_size=5) + + @torch.jit.script_method + def forward(self, x): + return self.conv(x) + foo = Foo() + # testing that the correct error message propagates + with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): + foo(torch.ones([123])) # wrong size + + def test_builtin_error_messsage(self): + with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): + @torch.jit.script + def close_match(x): + return x.masked_fill(True) + + with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " + "supported in TorchScript"): + @torch.jit.script + def unknown_op(x): + torch.set_anomaly_enabled(True) + return x + + def test_exceptions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + if bool(cond): + raise ValueError(3) + return 1 + ''') + + cu.foo(torch.tensor(0)) + with self.assertRaisesRegex(torch.jit.Error, "3"): + cu.foo(torch.tensor(1)) + + def foo(cond): + a = 3 + if bool(cond): + raise ArbitraryError(a, "hi") + if 1 == 2: + raise ArbitraryError + return a + + with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): + torch.jit.script(foo) + + def exception_as_value(): + a = Exception() + print(a) + + with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): + torch.jit.script(exception_as_value) + + @torch.jit.script + def foo_no_decl_always_throws(): + raise RuntimeError("Hi") + + # function that has no declared type but always throws set to None + output_type = next(foo_no_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "NoneType") + + @torch.jit.script + def foo_decl_always_throws(): + # type: () -> Tensor + raise Exception("Hi") + + output_type = next(foo_decl_always_throws.graph.outputs()).type() + self.assertTrue(str(output_type) == "Tensor") + + def foo(): + raise 3 + 4 + + with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): + torch.jit.script(foo) + + # a escapes scope + @torch.jit.script + def foo(): + if 1 == 1: + a = 1 + else: + if 1 == 1: + raise Exception("Hi") + else: + raise Exception("Hi") + return a + self.assertEqual(foo(), 1) + + @torch.jit.script + def tuple_fn(): + raise RuntimeError("hello", "goodbye") + + with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): + tuple_fn() + + @torch.jit.script + def no_message(): + raise RuntimeError + + with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): + no_message() + + def test_assertions(self): + cu = torch.jit.CompilationUnit(''' + def foo(cond): + assert bool(cond), "hi" + return 0 + ''') + + cu.foo(torch.tensor(1)) + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + cu.foo(torch.tensor(0)) + + @torch.jit.script + def foo(cond): + assert bool(cond), "hi" + + foo(torch.tensor(1)) + # we don't currently validate the name of the exception + with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): + foo(torch.tensor(0)) + + def test_python_op_exception(self): + @torch.jit.ignore + def python_op(x): + raise Exception("bad!") + + @torch.jit.script + def fn(x): + return python_op(x) + + with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): + fn(torch.tensor(4)) + + def test_dict_expansion_raises_error(self): + def fn(self): + d = {"foo": 1, "bar": 
2, "baz": 3} + return {**d} + + with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, + "Dict expansion "): + torch.jit.script(fn) + + def test_custom_python_exception(self): + class MyValueError(ValueError): + def __init__(self, msg): + super(MyValueError, self).__init__(msg) + + @torch.jit.script + def fn(): + raise MyValueError("test custom exception") + + with self.assertRaisesRegex(torch.jit.Error, "jit.test_exception.MyValueError: test custom exception"): + fn() + + def test_custom_python_exception_defined_elsewhere(self): + from jit.myexception import MyKeyError + + @torch.jit.script + def fn(): + raise MyKeyError("This is a user defined key error") + with self.assertRaisesRegex(torch.jit.Error, "jit.myexception.MyKeyError: This is a user defined key error"): + fn() diff --git a/test/test_jit.py b/test/test_jit.py index 37cd9b5d53c..2527fbf941b 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -13013,153 +13013,6 @@ dedent """ self.checkScript(dedent(code), (101,)) - def test_pyop_exception_message(self): - class Foo(torch.jit.ScriptModule): - def __init__(self): - super(Foo, self).__init__() - self.conv = nn.Conv2d(1, 10, kernel_size=5) - - @torch.jit.script_method - def forward(self, x): - return self.conv(x) - foo = Foo() - # testing that the correct error message propagates - with self.assertRaisesRegex(RuntimeError, r"Expected 3D \(unbatched\) or 4D \(batched\) input to conv2d"): - foo(torch.ones([123])) # wrong size - - def test_builtin_error_messsage(self): - with self.assertRaisesRegex(RuntimeError, "Arguments for call are not valid"): - @torch.jit.script - def close_match(x): - return x.masked_fill(True) - - with self.assertRaisesRegex(RuntimeError, "This op may not exist or may not be currently " - "supported in TorchScript"): - @torch.jit.script - def unknown_op(x): - torch.set_anomaly_enabled(True) - return x - - def test_exceptions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - if bool(cond): - raise ValueError(3) - return 1 - ''') - - cu.foo(torch.tensor(0)) - with self.assertRaisesRegex(torch.jit.Error, "3"): - cu.foo(torch.tensor(1)) - - def foo(cond): - a = 3 - if bool(cond): - raise ArbitraryError(a, "hi") - if 1 == 2: - raise ArbitraryError - return a - - with self.assertRaisesRegex(RuntimeError, "undefined value ArbitraryError"): - torch.jit.script(foo) - - def exception_as_value(): - a = Exception() - print(a) - - with self.assertRaisesRegex(RuntimeError, "cannot be used as a value"): - torch.jit.script(exception_as_value) - - @torch.jit.script - def foo_no_decl_always_throws(): - raise RuntimeError("Hi") - - # function that has no declared type but always throws set to None - output_type = next(foo_no_decl_always_throws.graph.outputs()).type() - self.assertTrue(str(output_type) == "NoneType") - - @torch.jit.script - def foo_decl_always_throws(): - # type: () -> Tensor - raise Exception("Hi") - - output_type = next(foo_decl_always_throws.graph.outputs()).type() - self.assertTrue(str(output_type) == "Tensor") - - def foo(): - raise 3 + 4 - - with self.assertRaisesRegex(RuntimeError, "must derive from BaseException"): - torch.jit.script(foo) - - # a escapes scope - @torch.jit.script - def foo(): - if 1 == 1: - a = 1 - else: - if 1 == 1: - raise Exception("Hi") - else: - raise Exception("Hi") - return a - self.assertEqual(foo(), 1) - - @torch.jit.script - def tuple_fn(): - raise RuntimeError("hello", "goodbye") - - with self.assertRaisesRegex(torch.jit.Error, "hello, goodbye"): - tuple_fn() - - @torch.jit.script - def no_message(): 
- raise RuntimeError - - with self.assertRaisesRegex(torch.jit.Error, "RuntimeError"): - no_message() - - def test_assertions(self): - cu = torch.jit.CompilationUnit(''' - def foo(cond): - assert bool(cond), "hi" - return 0 - ''') - - cu.foo(torch.tensor(1)) - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - cu.foo(torch.tensor(0)) - - @torch.jit.script - def foo(cond): - assert bool(cond), "hi" - - foo(torch.tensor(1)) - # we don't currently validate the name of the exception - with self.assertRaisesRegex(torch.jit.Error, "AssertionError: hi"): - foo(torch.tensor(0)) - - def test_python_op_exception(self): - @torch.jit.ignore - def python_op(x): - raise Exception("bad!") - - @torch.jit.script - def fn(x): - return python_op(x) - - with self.assertRaisesRegex(RuntimeError, "operation failed in the TorchScript interpreter"): - fn(torch.tensor(4)) - - def test_dict_expansion_raises_error(self): - def fn(self): - d = {"foo": 1, "bar": 2, "baz": 3} - return {**d} - - with self.assertRaisesRegex(torch.jit.frontend.NotSupportedError, - "Dict expansion "): - torch.jit.script(fn) - def test_module_parameters_and_buffers(self): weights = torch.randn(10, 10) bias = torch.randn(10) diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 20616a978d4..ba570b35391 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -977,7 +977,7 @@ def is_scripting() -> bool: # Retrieves a fully-qualified name (module hierarchy + classname) for a given obj. -def _qualified_name(obj) -> str: +def _qualified_name(obj, mangle_name=True) -> str: # This special case allows us to override the qualified name on a type. # It's currently used in conjunction with tracing, where we create a # fake module to filter only supported attributes. However, since this @@ -1026,13 +1026,16 @@ def _qualified_name(obj) -> str: module_name = module_name.replace("<", "_") module_name = module_name.replace(">", "_") - # __main__ is a builtin module, so rewrite it to "__torch__". - if module_name == "__main__": - module_name = "__torch__" - else: - # Everything else gets a "__torch__" prefix to avoid name collisions - # with the names of user values. - module_name = "__torch__." + module_name + # The PythonExceptionValue C++ class in torch/csrc/jit/python/python_sugared_value.h + # does not need mangle the python class name. + if mangle_name: + # __main__ is a builtin module, so rewrite it to "__torch__". + if module_name == "__main__": + module_name = "__torch__" + else: + # Everything else gets a "__torch__" prefix to avoid name collisions + # with the names of user values. + module_name = "__torch__." + module_name if "." 
in name: raise RuntimeError(f"Could not get qualified name for class '{name}': " diff --git a/torch/csrc/jit/frontend/ir_emitter.cpp b/torch/csrc/jit/frontend/ir_emitter.cpp index 20cab7c7499..eac6161c923 100644 --- a/torch/csrc/jit/frontend/ir_emitter.cpp +++ b/torch/csrc/jit/frontend/ir_emitter.cpp @@ -2478,12 +2478,14 @@ struct to_ir { void emitRaise(const Raise& raise) { auto sv = emitSugaredExpr(raise.expr(), 1); Value* error_message = nullptr; + Value* qualified_class_name = nullptr; if (auto exception_instance = std::dynamic_pointer_cast(sv)) { // The typical case, an instance of the exception class was thrown: // raise RuntimeError("error") error_message = exception_instance->getValue(); + qualified_class_name = exception_instance->getQualifiedClassName(); } else if ( auto exception_class = std::dynamic_pointer_cast(sv)) { // A bare exception was thrown so add an empty message. e.g. @@ -2500,7 +2502,11 @@ struct to_ir { error_message = graph->insert(aten::str, {error_message}); } - graph->insert(prim::RaiseException, {error_message}, {}, raise.range()); + graph->insert( + prim::RaiseException, + {error_message, qualified_class_name}, + {}, + raise.range()); exit_blocks.insert(environment_stack->block()); } diff --git a/torch/csrc/jit/frontend/sugared_value.h b/torch/csrc/jit/frontend/sugared_value.h index f6a3f72a59d..6ddd9bed753 100644 --- a/torch/csrc/jit/frontend/sugared_value.h +++ b/torch/csrc/jit/frontend/sugared_value.h @@ -744,7 +744,10 @@ struct SimpleSelf : public Self { // This is not a SimpleValue so it can not pass through the code paths that // expect a SimpleValue as a sugared value. struct TORCH_API ExceptionMessageValue : public SugaredValue { - explicit ExceptionMessageValue(Value* value) : value_(value) {} + explicit ExceptionMessageValue( + Value* value, + Value* qualified_class_name = nullptr) + : value_(value), qualified_class_name_(qualified_class_name) {} std::string kind() const override { return "exception message"; @@ -754,7 +757,14 @@ struct TORCH_API ExceptionMessageValue : public SugaredValue { return value_; } + // qualified python class name + Value* getQualifiedClassName() { + return qualified_class_name_; + } + + private: Value* value_; + Value* qualified_class_name_; }; struct TORCH_API ExceptionValue : public SugaredValue { diff --git a/torch/csrc/jit/python/python_sugared_value.cpp b/torch/csrc/jit/python/python_sugared_value.cpp index 87ab27a5552..f014150d8a2 100644 --- a/torch/csrc/jit/python/python_sugared_value.cpp +++ b/torch/csrc/jit/python/python_sugared_value.cpp @@ -914,8 +914,11 @@ std::shared_ptr PythonExceptionValue::call( ->insertNode(caller.graph()->createTuple(message_values)) ->output(); } + Value* qualified_class_name = + insertConstant(*caller.graph(), exception_class_qualified_name_, loc); - return std::make_shared(error_message); + return std::make_shared( + error_message, qualified_class_name); } bool isNamedTupleClass(const py::object& obj) { diff --git a/torch/csrc/jit/python/python_sugared_value.h b/torch/csrc/jit/python/python_sugared_value.h index d3559abda5c..5fef124cf2b 100644 --- a/torch/csrc/jit/python/python_sugared_value.h +++ b/torch/csrc/jit/python/python_sugared_value.h @@ -328,7 +328,12 @@ struct VISIBILITY_HIDDEN PythonClassValue : public ClassValue { struct VISIBILITY_HIDDEN PythonExceptionValue : public ExceptionValue { explicit PythonExceptionValue(const py::object& exception_class) : ExceptionValue( - py::str(py::getattr(exception_class, "__name__", py::str("")))) {} + py::str(py::getattr(exception_class, 
"__name__", py::str("")))), + exception_class_qualified_name_( + py::str(py::module::import("torch._jit_internal") + .attr("_qualified_name")( + exception_class, + /*mangle_name=*/false))) {} std::string kind() const override { return "Python exception"; @@ -340,6 +345,9 @@ struct VISIBILITY_HIDDEN PythonExceptionValue : public ExceptionValue { at::ArrayRef args, at::ArrayRef kwargs, size_t n_binders) override; + + private: + std::string exception_class_qualified_name_; }; // Python Slice class. From 84cb810b3f5ec0541878329abd3759de74944930 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Tue, 15 Feb 2022 19:34:32 -0800 Subject: [PATCH 082/199] Revert D34106940: [ZeRO] Add ctor support for multiple param groups Test Plan: revert-hammer Differential Revision: D34106940 (https://github.com/pytorch/pytorch/commit/5dd07324578f5110a2ec5c213fb559bc49004c7a) Original commit changeset: 7e70fc0b3cec Original Phabricator Diff: D34106940 (https://github.com/pytorch/pytorch/commit/5dd07324578f5110a2ec5c213fb559bc49004c7a) fbshipit-source-id: 08f846c9c02be8756475f4e0b57eb381f10c27bd (cherry picked from commit 7675497d8358cb289549539dae98579353d85834) --- .../optim/test_zero_redundancy_optimizer.py | 146 +++--------------- .../optim/zero_redundancy_optimizer.py | 72 +++------ 2 files changed, 44 insertions(+), 174 deletions(-) diff --git a/test/distributed/optim/test_zero_redundancy_optimizer.py b/test/distributed/optim/test_zero_redundancy_optimizer.py index bd819cf2a5c..de8ea511b63 100644 --- a/test/distributed/optim/test_zero_redundancy_optimizer.py +++ b/test/distributed/optim/test_zero_redundancy_optimizer.py @@ -33,7 +33,7 @@ from torch.distributed.algorithms.join import Join, Joinable, JoinHook from torch.distributed.optim import ZeroRedundancyOptimizer from torch.distributed.optim.zero_redundancy_optimizer import _broadcast_object from torch.nn.parallel import DistributedDataParallel as DDP -from torch.optim import SGD, AdamW +from torch.optim import SGD from torch.testing._internal import common_distributed, common_utils from torch.testing._internal.common_utils import ( TEST_WITH_ASAN, @@ -247,6 +247,30 @@ class TestZeroRedundancyOptimizerSingleRank(TestZeroRedundancyOptimizer): self.assertFalse(m.weight.grad) self.assertFalse(m.bias.grad) + def test_constructor(self): + """Check the robustness of the ZeroRedundancyOptimizer constructor by + passing different values for `params`""" + self.dist_init(self.rank) + + m = torch.nn.Linear(1, 1) + # (input, expected error) + inputs = [ + ([], ValueError), # empty parameter list + (torch.randn(1), TypeError), # non-iterable: `torch.Tensor` + (1.2, TypeError), # non-iterable: `float` + ([{"params": m.parameters()}], TypeError), # iterable of dict + (list(m.parameters()) + [42], TypeError), # iterable containing non-`torch.Tensor` + (m.parameters(), None), # `params` as a generator + (list(m.parameters()), None) # `params` as a list + ] + + for input, error in inputs: + if (error): + with self.assertRaises(error): + ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) + else: + ZeroRedundancyOptimizer(input, optimizer_class=SGD, lr=0.1) + def test_same_dense_param_type(self): """Check that ZeroRedundancyOptimizer raises an exception if the input parameters include sparse tensors or different dense types. 
@@ -272,58 +296,6 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer): def world_size(self): return min(4, max(2, torch.cuda.device_count())) - def test_constructor(self): - """Check the robustness of the ZeroRedundancyOptimizer constructor by - passing different values for the ``params`` argument.""" - self.dist_init(self.rank) - - m = torch.nn.Sequential( - torch.nn.Linear(5, 10), - torch.nn.Linear(10, 10), - torch.nn.Linear(10, 10), - ).to(self.device) - - # Test various constructor inputs in the form: (input, expected error) - ctor_inputs = [ - ([], ValueError), # empty parameter list - (torch.randn(1), TypeError), # non-iterable: `torch.Tensor` - (1.2, TypeError), # non-iterable: `float` - ([ - {"params": [l.weight for l in m]}, - {"params": [l.bias for l in m]}, - ], None), # iterable of dict - (list(m.parameters()) + [42], TypeError), # iterable containing invalid type - (m.parameters(), None), # `params` as a generator - (list(m.parameters()), None) # `params` as a list - ] - - for ctor_input, error in ctor_inputs: - if error: - with self.assertRaises(error): - ZeroRedundancyOptimizer(ctor_input, optimizer_class=SGD, lr=0.01) - else: - ZeroRedundancyOptimizer(ctor_input, optimizer_class=SGD, lr=0.01) - - # Test constructing with multiple parameter groups more thoroughly - weight_decay = 0.01 - lr = 0.01 - betas = (0.9, 0.999) - eps = 1e-8 - params = [ - {"params": [l.weight for l in m], "weight_decay": 0.}, - {"params": [l.bias for l in m], "weight_decay": weight_decay}, - ] - o = ZeroRedundancyOptimizer( - params, optimizer_class=AdamW, - lr=lr, betas=betas, eps=eps, - ) - assert len(o.param_groups) == 2, \ - f"Expected 2 ZeRO param groups, but got {len(o.param_groups)}" - assert len(o.optim.param_groups) == 2, \ - "Expected 2 local optimizer param groups, but got " \ - f"{len(o.optim.param_groups)}" - - @common_distributed.skip_if_rocm def test_step(self): """ Check that the ZeroRedundancyOptimizer wrapper properly exposes the `.step()` interface""" @@ -487,75 +459,7 @@ class TestZeroRedundancyOptimizerDistributed(TestZeroRedundancyOptimizer): all_trainable() some_trainable() - def test_multiple_param_groups(self): - """ - Tests parity between constructing ZeRO with multiple parameter groups - upfront versus adding parameter groups to ZeRO after construction - versus a non-sharded optimizer. 
- """ - self.dist_init(self.rank) - - model1 = torch.nn.Sequential( - torch.nn.Linear(5, 10), - torch.nn.Linear(10, 10), - torch.nn.Linear(10, 5), - ) - model2 = copy.deepcopy(model1) - model3 = copy.deepcopy(model1) - model1 = model1.to(self.device) - model2 = model2.to(self.device) - model3 = model3.to(self.device) - - batch_size = 8 - num_iters = 3 - inputs = [ - torch.randn(batch_size, 5).to(self.device) for _ in range(num_iters) - ] - wd = 0.01 - lr = 0.01 - # Construct `optim1` with both parameter groups upfront - optim1 = ZeroRedundancyOptimizer( - [ - {"params": [l.weight for l in model1], "weight_decay": 0.}, - {"params": [l.bias for l in model1], "weight_decay": wd}, - ], - optimizer_class=AdamW, lr=lr, - ) - # Construct `optim2` by adding the second parameter after - optim2 = ZeroRedundancyOptimizer( - [l.weight for l in model2], - optimizer_class=AdamW, lr=lr, weight_decay=0., - ) - optim2.add_param_group( - {"params": [l.bias for l in model2], "weight_decay": wd} - ) - # Construct `optim3` as a non-sharded optimizer - optim3 = AdamW( - [ - {"params": [l.weight for l in model3], "weight_decay": 0.}, - {"params": [l.bias for l in model3], "weight_decay": wd}, - ], lr=lr, - ) - - # Check parity over a few iterations - for iter in range(num_iters): - for model, optim in ( - (model1, optim1), (model2, optim2), (model3, optim3), - ): - optim.zero_grad() - out = model(inputs[iter]) - loss = out.sum() - loss.backward() - optim.step() - - for layer1, layer2, layer3 in zip(model1, model2, model3): - assert torch.allclose(layer1.weight, layer2.weight) - assert torch.allclose(layer1.weight, layer3.weight) - assert torch.allclose(layer1.bias, layer2.bias) - assert torch.allclose(layer1.bias, layer3.bias) - @common_distributed.skip_if_lt_x_gpu(2) - @common_distributed.skip_if_rocm def test_collect_shards(self): """ Check the state consolidation mechanism, and the state dict exposed by ZeroRedundancyOptimizer""" self.dist_init(self.rank) diff --git a/torch/distributed/optim/zero_redundancy_optimizer.py b/torch/distributed/optim/zero_redundancy_optimizer.py index a87bfdaf5fd..70779eac3f1 100644 --- a/torch/distributed/optim/zero_redundancy_optimizer.py +++ b/torch/distributed/optim/zero_redundancy_optimizer.py @@ -10,16 +10,7 @@ import inspect import io import logging from itertools import chain -from typing import ( - Any, - Callable, - Dict, - List, - Optional, - Set, - Type, - Union, -) +from typing import Any, Callable, Dict, List, Optional, Set, Type import torch import torch.distributed as dist @@ -296,8 +287,7 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable): Arguments: params (``Iterable``): an ``Iterable`` of :class:`torch.Tensor` s - or :class:`dict` s giving all parameters, which will be sharded - across ranks. + giving all parameters, which will be sharded across ranks. Keyword Args: optimizer_class (:class:`torch.nn.Optimizer`): the class of the local @@ -374,7 +364,7 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable): **defaults: Any, ): # Perform type and assumption checks on the input parameters - params = self._verify_and_init_params(params) + self._verify_and_init_params(params) self._verify_same_dense_param_type() # NOTE: The parent constructor uses `add_param_group()` which is @@ -383,7 +373,7 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable): # between the parent and child. 
self.initialized = False - Optimizer.__init__(self, params, defaults) + Optimizer.__init__(self, self._all_params, defaults) Joinable.__init__(self) # Now, all parameters are held in both `self._all_params` and # `self.param_groups` @@ -1299,60 +1289,36 @@ class ZeroRedundancyOptimizer(Optimizer, Joinable): offset = offset_next bucket_assignment.tensor = tensor - def _verify_and_init_params( - self, params: Any, - ) -> Union[List[torch.Tensor], List[dict]]: + def _verify_and_init_params(self, params: Any) -> None: r""" Verifies the type of ``params`` and initializes ``self._all_params`` - as a :class:`list` of all parameters if ``params`` is valid. + if ``params`` is valid. - Arguments: - params (Any): Candidate parameter list or parameter groups to - verify. + While :class:`optim.Optimizer ` allows + ``params`` to be an iterable of :class:`dict` s, currently + ``ZeroRedundancyOptimizer`` strictly requires ``params`` to be an + iterable of :class:`torch.Tensor` s. Raises: TypeError: ``params`` has an invalid type. ValueError: ``params`` is empty. - - Returns: - The persistent form of ``params`` to be passed into the parent - :class:`Optimizer` constructor -- i.e. returns ``params`` as a - :class:`list` to ensure that it can be iterated over again. """ if isinstance(params, torch.Tensor): - raise TypeError("`params` argument should be an iterable of " + raise TypeError("params argument should be an iterable of " f"Tensors, but got {torch.typename(params)}") try: - all_params = list(params) + self._all_params = list(params) except TypeError: - raise TypeError("`params` argument should be an iterable of " + raise TypeError("params argument should be an iterable of " f"Tensors, but got {torch.typename(params)}") - if len(all_params) == 0: + if len(self._all_params) == 0: raise ValueError("ZeroRedundancyOptimizer got an empty parameter " "list") - all_tensors = True - all_dicts = True - for param in all_params: - all_tensors &= isinstance(param, torch.Tensor) - all_dicts &= isinstance(param, dict) - if not all_tensors and not all_dicts: - raise TypeError("`params` argument should be an iterable of " - "Tensors or dicts") - # Ensure that `self._all_params` contains a list of all parameters - if all_tensors: - self._all_params = all_params - elif all_dicts: - self._all_params = [] - # `all_params` contains parameter groups (not parameters) - for param_group in all_params: - if "params" not in param_group: - raise ValueError( - "Each parameter group passed-in via `params` must " - "have a 'params' key mapping to the parameters in " - "the group" - ) - self._all_params.extend(param_group["params"]) - return all_params + for param in self._all_params: + if not isinstance(param, torch.Tensor): + raise TypeError("params argument should be an iterable of " + "Tensors, but got an iterable containing " + f"{torch.typename(param)}") def _verify_same_dense_param_type(self) -> None: r""" From 85d7e73a8aa2dd74970017d11c7411b36b89dfc4 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Tue, 15 Feb 2022 19:36:09 -0800 Subject: [PATCH 083/199] [Perf] Reduce unnecessary ref count bumps (#72523) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72523 `toTuple()` returns a new intrusive pointer that bumps its underlying ref count. Whereas, `toTupeRef` returns a reference. We can save an unnecessary ref count bump. 
ghstack-source-id: 149173308 Test Plan: Sandcastle CI Reviewed By: swolchok Differential Revision: D34047666 fbshipit-source-id: 8c821e45f7af4f3f1d098871926b9df288e329fb (cherry picked from commit 34797e508d533c578a40f74ffc82b34e1c3ea40e) --- torch/csrc/jit/mobile/import.cpp | 56 +++++++++++++++----------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index a48f4a298e5..b1c707974dc 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -248,9 +248,8 @@ void BytecodeDeserializer::parseFunctionSchema( auto parseArgList = [this, function](c10::ivalue::TupleElements&& argTables) { std::vector args; - for (auto&& argTable : std::move(argTables)) { - auto argTableElements = - std::move(*std::move(argTable).toTuple()).elements(); + for (auto& argTable : argTables) { + auto argTableElements = std::move(argTable.toTupleRef()).elements(); auto name = expect_field(argTableElements, "name", BYTECODE_INDEX_ARGUMENT_NAME) .toStringRef(); @@ -271,19 +270,18 @@ void BytecodeDeserializer::parseFunctionSchema( tryRegisterMethod(args, *function); return args; }; - auto schemaTableElements = - std::move(*std::move(*schemaTable).toTuple()).elements(); - auto arg_list = std::move(*expect_field( - schemaTableElements, - "arguments", - BYTECODE_INDEX_SCHEMA_ARGUMENTS) - .toTuple()) + auto schemaTableElements = std::move(schemaTable->toTupleRef()).elements(); + auto arg_list = std::move(expect_field( + schemaTableElements, + "arguments", + BYTECODE_INDEX_SCHEMA_ARGUMENTS) + .toTupleRef()) .elements(); auto ret_list = std::move( - *expect_field( - schemaTableElements, "returns", BYTECODE_INDEX_SCHEMA_RETURNS) - .toTuple()) + expect_field( + schemaTableElements, "returns", BYTECODE_INDEX_SCHEMA_RETURNS) + .toTupleRef()) .elements(); c10::FunctionSchema schema( function_name, @@ -338,10 +336,10 @@ void BytecodeDeserializer::parseMethods( // Process all methods in this mobile module. for (const auto i : c10::irange(method_i_start, vals.size())) { auto element = std::move(vals[i]); - auto m_tuple = std::move(*element.toTuple()).elements(); + auto m_tuple = std::move(element.toTupleRef()).elements(); const std::string& function_name = m_tuple[0].toStringRef(); auto codeTableElements = - std::move(*std::move(m_tuple[1]).toTuple()).elements(); + std::move(std::move(m_tuple[1]).toTupleRef()).elements(); IValue* schemaTable = // older files do not store function schema (model_version > 0x4L || (model_version == 0x4L && m_tuple.size() >= 3)) ? 
&m_tuple[2] @@ -351,23 +349,23 @@ void BytecodeDeserializer::parseMethods( auto ins_list = std::move( - *expect_field( - codeTableElements, "instructions", BYTECODE_INDEX_INSTRUCTION) - .toTuple()) + expect_field( + codeTableElements, "instructions", BYTECODE_INDEX_INSTRUCTION) + .toTupleRef()) .elements(); auto ops_list = - std::move(*expect_field( - codeTableElements, "operators", BYTECODE_INDEX_OPERATOR) - .toTuple()) + std::move(expect_field( + codeTableElements, "operators", BYTECODE_INDEX_OPERATOR) + .toTupleRef()) .elements(); auto consts_list = - std::move(*expect_field( - codeTableElements, "constants", BYTECODE_INDEX_CONSTANT) - .toTuple()) + std::move(expect_field( + codeTableElements, "constants", BYTECODE_INDEX_CONSTANT) + .toTupleRef()) .elements(); auto types_list = - std::move(*expect_field(codeTableElements, "types", BYTECODE_INDEX_TYPE) - .toTuple()) + std::move(expect_field(codeTableElements, "types", BYTECODE_INDEX_TYPE) + .toTupleRef()) .elements(); int64_t register_size = expect_field( @@ -377,7 +375,7 @@ void BytecodeDeserializer::parseMethods( c10::ivalue::TupleElements debug_handles_m_tuple; if (debug_handles) { debug_handles_m_tuple = - std::move(*std::move((*debug_handles)[i]).toTuple()).elements(); + std::move(std::move((*debug_handles)[i]).toTupleRef()).elements(); } init_upgrader(function.get()); // 1. First pass all operators from models @@ -454,13 +452,13 @@ mobile::Module BytecodeDeserializer::deserialize( // being a Tuple (int, table), and the integer stands for the bytecode version // number. The rest of the elements are the same as before. // - auto bvals = std::move(*readArchive("bytecode", mcu).toTuple()).elements(); + auto bvals = std::move(readArchive("bytecode", mcu).toTupleRef()).elements(); c10::optional debug_handles; bool has_debug_handles{false}; if (reader_->hasRecord("mobile_debug_handles.pkl")) { debug_handles = - std::move(*readArchive("mobile_debug_handles", mcu).toTuple()) + std::move(readArchive("mobile_debug_handles", mcu).toTupleRef()) .elements(); has_debug_handles = true; } From e4214929c5755b97ff728bcc0bb55aa983991547 Mon Sep 17 00:00:00 2001 From: Ansley Ussery Date: Tue, 15 Feb 2022 19:42:25 -0800 Subject: [PATCH 084/199] Port `amax` to structured kernel (#72124) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72124 Reviewed By: bdhirsh Differential Revision: D34215708 Pulled By: ansley fbshipit-source-id: fee887e331cb8bd9fab3d9d958ff13ac8d07be27 (cherry picked from commit 94dbb5b7e7e14a663dc02ecf5013fad10b8701b3) --- aten/src/ATen/TensorIterator.cpp | 2 +- aten/src/ATen/native/BatchLinearAlgebra.cpp | 30 ++++++------- .../ATen/native/BatchLinearAlgebraKernel.cpp | 6 +-- aten/src/ATen/native/ComplexHelper.h | 2 +- aten/src/ATen/native/LinearAlgebra.cpp | 34 +++++++-------- aten/src/ATen/native/Loss.cpp | 2 +- aten/src/ATen/native/Normalization.cpp | 2 +- aten/src/ATen/native/ReduceOps.cpp | 42 ++++++++++--------- aten/src/ATen/native/UnaryOps.cpp | 6 +-- aten/src/ATen/native/cuda/SpectralOps.cpp | 4 +- .../native/cuda/linalg/BatchLinearAlgebra.cpp | 4 +- aten/src/ATen/native/mkl/SpectralOps.cpp | 8 ++-- aten/src/ATen/native/native_functions.yaml | 4 +- c10/core/ScalarType.h | 2 +- torch/csrc/TypeInfo.cpp | 2 +- torch/csrc/autograd/FunctionsManual.cpp | 4 +- torch/csrc/jit/passes/shape_analysis.cpp | 2 +- 17 files changed, 80 insertions(+), 76 deletions(-) diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index f978456754d..050db7810a9 100644 --- 
a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -1501,7 +1501,7 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { view_offsets_ = DimVector(ndim_offsets, 0); } -// This is the structured kernels implementation of set_output. It is +// This is the structured kernels' implementation of set_output. It is // NEVER actually called directly; instead, a subclass of TensorIteratorBase // will override set_output to actually do the operation, and then call // set_output on the TensorIteratorBase to setup TI's metadata. diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 8c10269eeca..5fc486c44f5 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -289,7 +289,7 @@ TORCH_META_FUNC(_linalg_svd)(const Tensor& A, // Prepare sizes for S. S is always real, even when A is complex. sizes.pop_back(); sizes.end()[-1] = k; - set_output(1, sizes, {}, A.options().dtype(c10::toValueType(A.scalar_type())), {}); + set_output(1, sizes, {}, A.options().dtype(c10::toRealValueType(A.scalar_type())), {}); } } // namespace meta @@ -2307,7 +2307,7 @@ void linalg_eigh_out_info( // eigenvalues are always real-valued // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(values.scalar_type() == real_dtype); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.scalar_type() == vectors.scalar_type()); @@ -2354,7 +2354,7 @@ void linalg_eigh_out_info( std::tuple linalg_eigh(const Tensor& input, c10::string_view uplo) { squareCheckInputs(input, "linalg.eigh"); checkUplo(uplo); - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); Tensor values = at::empty({0}, input.options().dtype(real_dtype)); Tensor vectors = at::empty({0}, input.options()); Tensor infos = at::zeros({std::max(1, batchCount(input))}, input.options().dtype(kInt)); @@ -2370,7 +2370,7 @@ std::tuple linalg_eigh_out(const Tensor& input, c10::string_vi checkLinalgCompatibleDtype("torch.linalg.eigh", eigvecs, input, "eigenvectors"); // eigenvalues are always real-valued here - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); checkLinalgCompatibleDtype("torch.linalg.eigh", eigvals.scalar_type(), real_dtype, "eigenvalues"); Tensor eigvals_tmp, eigvecs_tmp; @@ -2393,14 +2393,14 @@ Tensor linalg_eigvalsh(const Tensor& input, c10::string_view uplo) { return values; } - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); Tensor values = at::empty({0}, input.options().dtype(real_dtype)); values = at::linalg_eigvalsh_outf(input, uplo, values); return values; } Tensor& linalg_eigvalsh_out(const Tensor& input, c10::string_view uplo, Tensor& result) { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); checkLinalgCompatibleDtype("torch.linalg.eigvalsh", result.scalar_type(), real_dtype); squareCheckInputs(input, "linalg.eigvalsh"); @@ -2461,7 +2461,7 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool value_t* rwork_data = nullptr; if (isComplexType(at::typeMetaToScalarType(self.dtype()))) { int64_t lrwork = std::max(int64_t(1), 3 * n - 2); - ScalarType dtype = 
toValueType(typeMetaToScalarType(self.dtype())); + ScalarType dtype = toRealValueType(typeMetaToScalarType(self.dtype())); rwork = at::empty({lrwork}, self.options().dtype(dtype)); rwork_data = rwork.data_ptr(); } @@ -2489,7 +2489,7 @@ std::tuple _symeig_helper_cpu(const Tensor& self, bool eigenvect auto self_sizes = self.sizes().vec(); self_sizes.pop_back(); - ScalarType dtype = toValueType(typeMetaToScalarType(self.dtype())); + ScalarType dtype = toRealValueType(typeMetaToScalarType(self.dtype())); auto eigvals = at::empty(self_sizes, self.options().dtype(dtype)); if (self.numel() == 0) { @@ -2549,7 +2549,7 @@ std::tuple symeig_out(const Tensor& self, bool eigenvectors, b checkSameDevice("symeig", vecs, self, "eigenvectors"); checkLinalgCompatibleDtype("symeig", vecs, self, "eigenvectors"); // eigenvalues are always real-valued here - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); checkLinalgCompatibleDtype("symeig", vals.scalar_type(), real_dtype, "eigenvalues"); Tensor vals_tmp, vecs_tmp; @@ -3199,7 +3199,7 @@ static void linalg_lstsq_out_info( TORCH_INTERNAL_ASSERT(rank.scalar_type() == at::kLong); TORCH_INTERNAL_ASSERT(rank.device() == input.device()); - auto real_dtype = toValueType(input.scalar_type()); + auto real_dtype = toRealValueType(input.scalar_type()); TORCH_INTERNAL_ASSERT(singular_values.scalar_type() == real_dtype); TORCH_INTERNAL_ASSERT(singular_values.device() == input.device()); @@ -3397,7 +3397,7 @@ std::tuple linalg_lstsq_out( checkLinalgCompatibleDtype("torch.linalg.lstsq", solution, input, "solution"); // 'residuals' is expected to have real float dtype - ScalarType real_dtype = c10::toValueType(input.scalar_type()); + ScalarType real_dtype = c10::toRealValueType(input.scalar_type()); checkLinalgCompatibleDtype("torch.linalg.lstsq", residuals.scalar_type(), real_dtype, "solution"); // 'rank' is expected to have integer dtype @@ -3414,7 +3414,7 @@ std::tuple linalg_lstsq_out( // set default rcond value double rcond_value = rcond.has_value() ? 
rcond.value() - : _get_epsilon(c10::toValueType(input.scalar_type())) * std::max(input.size(-2), input.size(-1)); + : _get_epsilon(c10::toRealValueType(input.scalar_type())) * std::max(input.size(-2), input.size(-1)); auto infos = at::zeros({std::max(1, batchCount(input))}, input.options().dtype(kInt)); @@ -3528,9 +3528,9 @@ std::tuple linalg_lstsq( c10::optional rcond, c10::optional driver) { Tensor solution = at::empty({0}, input.options()); - Tensor residuals = at::empty({0}, input.options().dtype(toValueType(input.scalar_type()))); + Tensor residuals = at::empty({0}, input.options().dtype(toRealValueType(input.scalar_type()))); Tensor rank = at::empty({0}, input.options().dtype(at::kLong)); - Tensor singular_values = at::empty({0}, input.options().dtype(toValueType(input.scalar_type()))); + Tensor singular_values = at::empty({0}, input.options().dtype(toRealValueType(input.scalar_type()))); std::tie(solution, residuals, rank, singular_values) = at::linalg_lstsq_outf(input, other, rcond, driver, solution, residuals, rank, singular_values); return std::make_tuple(solution, residuals, rank, singular_values); @@ -3704,7 +3704,7 @@ Tensor _det_lu_based_helper_backward_helper( const Tensor& lu, const Tensor& pivs ) { - auto eps = at::native::_get_epsilon(c10::toValueType(self.scalar_type())); + auto eps = at::native::_get_epsilon(c10::toRealValueType(self.scalar_type())); auto n = self.size(-1); auto eps_tensor = at::tensor(eps, self.options()); auto condition_diagonal = [&](const Tensor& x) { diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 2bfac093f22..117bbdb9093 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -149,7 +149,7 @@ void apply_eig(const Tensor& self, bool eigenvectors, Tensor& vals_, Tensor& vec Tensor rwork; value_t* rwork_data = nullptr; if (self.is_complex()) { - ScalarType real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + ScalarType real_dtype = toRealValueType(typeMetaToScalarType(self.dtype())); rwork = at::empty({n*2}, self.options().dtype(real_dtype)); rwork_data = rwork.data_ptr(); } @@ -242,7 +242,7 @@ void apply_linalg_eig(Tensor& values, Tensor& vectors, Tensor& input, Tensor& in Tensor rwork; value_t* rwork_data = nullptr; if (input.is_complex()) { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); rwork = at::empty({lda * 2}, input.options().dtype(real_dtype)); rwork_data = rwork.data_ptr(); } @@ -647,7 +647,7 @@ void apply_lstsq(const Tensor& A, Tensor& B, Tensor& rank, Tensor& singular_valu default: rwork_len = std::max(1, rwork_opt); } - rwork = at::empty({rwork_len}, A.options().dtype(c10::toValueType(A.scalar_type()))); + rwork = at::empty({rwork_len}, A.options().dtype(c10::toRealValueType(A.scalar_type()))); rwork_data = rwork.data_ptr(); } diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h index e9efd4b7c88..88668d13145 100644 --- a/aten/src/ATen/native/ComplexHelper.h +++ b/aten/src/ATen/native/ComplexHelper.h @@ -40,7 +40,7 @@ Tensor _view_as_real_physical(const Tensor& self) { new_sizes.back() = 2; auto new_strides = computeStrideForViewAsReal(self.strides()); auto new_storage_offset = 2 * self.storage_offset(); - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); auto real_tensor = view_tensor(self, 
float_type, new_storage_offset, new_sizes, new_strides); return real_tensor; } diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index f43c52576d0..4c7ff90e011 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -209,7 +209,7 @@ std::tuple linalg_slogdet_out(const Tensor& input, Tensor& sig checkSameDevice("linalg.slogdet", sign, input, "sign"); checkSameDevice("linalg.slogdet", logabsdet, input, "logabsdet"); checkLinalgCompatibleDtype("linalg.slogdet", sign, input, "sign"); - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); // logabsdet is always real-valued here checkLinalgCompatibleDtype("linalg.slogdet", logabsdet.scalar_type(), real_dtype, "logabsdet"); @@ -248,7 +248,7 @@ std::tuple get_atol_rtol( rtol = rtol_opt.value(); checkNotComplexTolerance(rtol, function_name, "rtol"); } else { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); auto default_rtol = at::full({}, _get_epsilon(real_dtype) * std::max(input.size(-1), input.size(-2)), options); rtol = atol_opt.has_value() ? at::where(atol_opt.value() > 0, at::zeros({}, options), default_rtol) @@ -266,7 +266,7 @@ std::tuple get_atol_rtol( if (rtol_opt.has_value()) { rtol = rtol_opt.value(); } else { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); auto default_rtol = _get_epsilon(real_dtype) * std::max(input.size(-1), input.size(-2)); rtol = (atol_opt.has_value() && atol_opt.value() > 0.0) ? 0.0 @@ -1847,7 +1847,7 @@ inline Tensor _blob_to_Tensor( // we also insert a fake dimension so that the result could directly // be used in _compute_linear_combination auto tensor = at::from_blob((void*)blob.begin(), blob.size(), - c10::toValueType(in.scalar_type())).unsqueeze(0); + c10::toRealValueType(in.scalar_type())).unsqueeze(0); return _move_memory_if_cuda_input(tensor, in); } @@ -1980,7 +1980,7 @@ Tensor compute_T12(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - c10::toValueType(A.scalar_type()) + c10::toRealValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -2052,7 +2052,7 @@ Tensor compute_T18(const Tensor& A) { reinterpret_cast(&b), {num_prods, num_prods}, {num_prods, 1}, - c10::toValueType(A.scalar_type()) + c10::toRealValueType(A.scalar_type()) ); bs = _move_memory_if_cuda_input(bs, A); @@ -2287,7 +2287,7 @@ Tensor frobenius_norm(const Tensor& self) { Tensor frobenius_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { // NOTE: As frobenius_norm_out is currently implemented, it will always produce a // strided tensor result, even if the input is sparse. 
- auto options = self.options().layout(c10::Layout::Strided).dtype(toValueType(self.scalar_type())); + auto options = self.options().layout(c10::Layout::Strided).dtype(toRealValueType(self.scalar_type())); Tensor result = at::empty({0}, options); return at::native::frobenius_norm_out(self, dim, keepdim, result); } @@ -2339,7 +2339,7 @@ Tensor &nuclear_norm_out(const Tensor& self, bool keepdim, Tensor& result) { } Tensor nuclear_norm(const Tensor& self, IntArrayRef dim, bool keepdim) { - Tensor result = at::empty({0}, self.options().dtype(toValueType(self.scalar_type()))); + Tensor result = at::empty({0}, self.options().dtype(toRealValueType(self.scalar_type()))); return at::native::nuclear_norm_out(self, dim, keepdim, result); } @@ -2557,11 +2557,11 @@ static Tensor& linalg_vector_norm_impl(const Tensor& self, const Scalar& scalar_ // linalg_vector_norm_stub. See issue: // https://github.com/pytorch/pytorch/issues/52648 self_ = self.to(in_dtype).abs(); - in_dtype = toValueType(in_dtype); + in_dtype = toRealValueType(in_dtype); } else { self_ = self; } - ScalarType out_dtype = opt_dtype.value_or(toValueType(self.scalar_type())); + ScalarType out_dtype = opt_dtype.value_or(toRealValueType(self.scalar_type())); TORCH_CHECK(!result.defined() || out_dtype == result.scalar_type(), "linalg.vector_norm expected out tensor dtype ", out_dtype, " but got: ", result.scalar_type()); @@ -2575,7 +2575,7 @@ static Tensor& linalg_vector_norm_impl(const Tensor& self, const Scalar& scalar_ } Tensor linalg_vector_norm(const Tensor& self, const Scalar& ord, optional opt_dim, bool keepdim, optional opt_dtype) { - ScalarType out_dtype = opt_dtype.value_or(toValueType(self.scalar_type())); + ScalarType out_dtype = opt_dtype.value_or(toRealValueType(self.scalar_type())); Tensor result = create_reduction_result(self, opt_dim.value_or(IntArrayRef{}), keepdim, out_dtype); return at::native::linalg_vector_norm_impl(self, ord, opt_dim, keepdim, opt_dtype, result); } @@ -2650,7 +2650,7 @@ Tensor& linalg_matrix_norm_out( // Numerical or None norms Tensor linalg_norm(const Tensor& self, const optional& opt_ord, optional opt_dim, bool keepdim, optional opt_dtype) { - auto options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : toValueType(self.scalar_type())).device(self.device()); + auto options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : toRealValueType(self.scalar_type())).device(self.device()); Tensor result = at::empty({0}, options); return at::native::linalg_norm_out( self, opt_ord, opt_dim, keepdim, opt_dtype, result); @@ -2658,7 +2658,7 @@ Tensor linalg_norm(const Tensor& self, const optional& opt_ord, optional // Frobenius and nuclear norms Tensor linalg_norm(const Tensor& self, c10::string_view ord, optional opt_dim, bool keepdim, optional opt_dtype) { - auto options = TensorOptions().dtype(opt_dtype.has_value() ? opt_dtype.value() : toValueType(self.scalar_type())).device(self.device()); + auto options = TensorOptions().dtype(opt_dtype.has_value() ? 
opt_dtype.value() : toRealValueType(self.scalar_type())).device(self.device()); Tensor result = at::empty({0}, options); return at::native::linalg_norm_out( self, ord, opt_dim, keepdim, opt_dtype, result); @@ -2694,7 +2694,7 @@ Tensor _linalg_cond_helper(const Tensor& self, c10::variant& opt_ord) { // NumPy doesn't define the condition number for 0x0 matrices, we return 0.0 for such input if (self.numel() == 0) { - auto real_dtype = toValueType(typeMetaToScalarType(self.dtype())); + auto real_dtype = toRealValueType(typeMetaToScalarType(self.dtype())); return _linalg_cond_empty_matrix(self, real_dtype); } @@ -2757,7 +2757,7 @@ Tensor linalg_cond(const Tensor& self, const optional& opt_ord) { Tensor& linalg_cond_out(const Tensor& self, const optional& opt_ord, Tensor& result) { checkSameDevice("linalg.cond", result, self); - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); checkLinalgCompatibleDtype("linalg.cond", result.scalar_type(), real_dtype); Tensor result_tmp = at::linalg_cond(self, opt_ord); @@ -2791,7 +2791,7 @@ Tensor linalg_cond(const Tensor& self, c10::string_view ord) { // TODO: implement _out variant avoiding copy and using already allocated storage directly Tensor& linalg_cond_out(const Tensor& self, c10::string_view ord, Tensor& result) { checkSameDevice("linalg.cond", result, self); - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); checkLinalgCompatibleDtype("linalg.cond", result.scalar_type(), real_dtype); Tensor result_tmp = at::linalg_cond(self, ord); diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 1812e61febc..414a2bcb9fd 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -497,7 +497,7 @@ Tensor& mse_loss_backward_out(const Tensor& grad_output, } Tensor l1_loss(const Tensor& input, const Tensor& target, int64_t reduction) { - const auto float_type = c10::toValueType(input.scalar_type()); + const auto float_type = c10::toRealValueType(input.scalar_type()); Tensor result = at::empty({0}, input.options().dtype(float_type)); return at::l1_loss_out(result, input, target, reduction); } diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index fdce903c080..981e568b6b9 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -692,7 +692,7 @@ TORCH_IMPL_FUNC(renorm_out)(const Tensor& self, const Scalar& p, int64_t dim, /*keepdim=*/true); } - auto factor = (acc_type == c10::toValueType(dtype)) ? + auto factor = (acc_type == c10::toRealValueType(dtype)) ? 
norm : at::empty(norm.sizes(), self.options()); auto iter = TensorIteratorConfig() .add_output(factor) diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 38eafedbeeb..cce0f1a3d3b 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -221,7 +221,7 @@ ScalarType get_result_or_self_value_dtype( if (result.defined()) { return result.scalar_type(); } else { - return dtype.value_or(toValueType(self.scalar_type())); + return dtype.value_or(toRealValueType(self.scalar_type())); } } @@ -266,6 +266,20 @@ TORCH_META_FUNC(aminmax) this->set_output(1, shape, options); } +TORCH_META_FUNC(amax) +(const Tensor& self, IntArrayRef dims, bool keepdim) { + auto maybe_result = maybe_get_output(); + if (maybe_result.defined()) { + TORCH_CHECK(self.scalar_type() == maybe_result.scalar_type(), "Expected the dtype for input and out to match, but got ", + self.scalar_type(), " for input's dtype and ", maybe_result.scalar_type(), " for out's dtype."); + } + if (self.numel() == 0) { + at::native::zero_numel_check_dims(self, dims, "amax()"); + } + const ScalarType& out_dtype = maybe_result.defined() ? maybe_result.scalar_type() : self.scalar_type(); + resize_reduction(*this, self, dims, keepdim, out_dtype); +} + } // namespace meta namespace native { @@ -1434,23 +1448,13 @@ Tensor amin(const Tensor& self, IntArrayRef dim, bool keepdim) { return at::amin_out(result, self, dim, keepdim); } -Tensor &amax_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tensor& result) { - TORCH_CHECK(self.scalar_type() == result.scalar_type(), "Expected the dtype for input and out to match, but got ", - self.scalar_type(), " for input's dtype and ", result.scalar_type(), " for out's dtype."); - if (self.numel() == 0) { - zero_numel_check_dims(self, dim, "amax()"); - } - - auto iter = make_reduction("amax", result, self, dim, keepdim, self.scalar_type()); +TORCH_IMPL_FUNC(amax_out) (const Tensor& self, IntArrayRef dim, bool keepdim, const Tensor& result) { + c10::MaybeOwned in = c10::MaybeOwned::borrowed(self); + auto iter = + meta::make_reduction(*in, result, dim, keepdim, self.scalar_type()); if (iter.numel() != 0) { max_values_stub(iter.device_type(), iter); } - return result; -} - -Tensor amax(const Tensor& self, IntArrayRef dim, bool keepdim) { - Tensor result = at::empty({0}, self.options()); - return at::amax_out(result, self, dim, keepdim); } template @@ -1569,7 +1573,7 @@ static Tensor& std_var_out( if (at::isComplexType(self.scalar_type())) { // For complex, calculate variance of real and imaginary components // seperately then add to get overall variance. - ScalarType dtype = c10::toValueType(get_dtype_from_result(result, {})); + ScalarType dtype = c10::toRealValueType(get_dtype_from_result(result, {})); Tensor real_in = at::real(self); Tensor real_out = at::empty({0}, self.options().dtype(dtype)); std_var_out( @@ -1634,7 +1638,7 @@ static std::tuple std_var_mean_out( fname, " only supports strided layout, got: ", self.layout()); TORCH_CHECK(at::isFloatingType(self.scalar_type()) || at::isComplexType(self.scalar_type()), fname, " only support floating point and complex dtypes"); - TORCH_CHECK(result1.scalar_type() == c10::toValueType(result2.scalar_type()), + TORCH_CHECK(result1.scalar_type() == c10::toRealValueType(result2.scalar_type()), fname, " expected result1 to be real and match the precision of result2. 
Got ", result1.scalar_type(), " and ", result2.scalar_type(), "."); @@ -1642,7 +1646,7 @@ static std::tuple std_var_mean_out( // For complex, calculate for real and imaginary components seperately then combine as: // variance = var_real + var_imag // mean = mean_real + j * mean_imag - ScalarType dtype = c10::toValueType(get_dtype_from_result(result1, {})); + ScalarType dtype = c10::toRealValueType(get_dtype_from_result(result1, {})); Tensor real_in = at::real(self); Tensor real_out_var = at::empty({0}, self.options().dtype(dtype)); Tensor real_out_mean = at::empty({0}, self.options().dtype(dtype)); @@ -1724,7 +1728,7 @@ std::tuple var_mean_out( static TensorOptions options_to_value_type(TensorOptions opts) { auto scalar_type = typeMetaToScalarType(opts.dtype()); - return opts.dtype(c10::toValueType(scalar_type)); + return opts.dtype(c10::toRealValueType(scalar_type)); } std::tuple var_mean( diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index e8cfeba2df0..64e17dd9dd0 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -250,7 +250,7 @@ template static inline Tensor& unary_op_impl_with_complex_to_float_out(Tensor& result, const Tensor& self, Stub& stub, bool promotes_integer_to_float) { if (self.is_complex() && !result.is_complex()) { // Checks if the corresponding float type can be cast to the desired dtype - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); TORCH_CHECK(canCast(float_type, result.scalar_type()), "result type ", float_type, " can't be cast to the desired output type ", result.scalar_type()); @@ -288,7 +288,7 @@ static inline Tensor unary_op_impl(const Tensor& self, OutImpl& out_impl) { template static inline Tensor unary_op_impl_with_complex_to_float(const Tensor& self, OutImpl& out_impl) { if (self.is_complex()) { - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); Tensor result = at::empty({0}, self.options().dtype(float_type)); return out_impl(result, self); } @@ -385,7 +385,7 @@ Tensor& angle_out(const Tensor& self, Tensor& result) { } Tensor angle(const Tensor& self) { if (self.is_complex()) { - const auto float_type = c10::toValueType(self.scalar_type()); + const auto float_type = c10::toRealValueType(self.scalar_type()); Tensor result = at::empty({0}, self.options().dtype(float_type)); return at::angle_out(result, self); } diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp index 95fef7d0915..f431e1e31cb 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cpp +++ b/aten/src/ATen/native/cuda/SpectralOps.cpp @@ -248,7 +248,7 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ out.resize_(batched_out_sizes, MemoryFormat::Contiguous); // Create the transform plan (either from cache or locally) - const auto value_type = c10::toValueType(input.scalar_type()); + const auto value_type = c10::toRealValueType(input.scalar_type()); auto fft_type = GetCuFFTTransformType(input.is_complex(), out.is_complex()); CuFFTParams Params(input.strides(), out.strides(), signal_size, fft_type, value_type); CuFFTParamsLRUCache& plan_cache = cufft_get_plan_cache(input.device().index()); @@ -445,7 +445,7 @@ Tensor _fft_c2r_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization DimVector out_sizes(in_sizes.begin(), in_sizes.end()); out_sizes[dim.back()] = lastdim; - auto output = 
at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + auto output = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); if (use_optimized_cufft_path(dim)) { Tensor temp; diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index 9910859d8b8..4c9df8c6196 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -2417,7 +2417,7 @@ std::tuple _symeig_helper_cuda(const Tensor& self, bool eigenvec Tensor infos = at::zeros({std::max(1, batchCount(self))}, self.options().dtype(kInt).device(at::kCPU)); auto eigvals_shape = IntArrayRef(self.sizes().data(), self.dim()-1); // self.shape[:-1] - ScalarType real_dtype = toValueType(self.scalar_type()); + ScalarType real_dtype = toRealValueType(self.scalar_type()); // magmaSyevd uses a hybrid CPU-GPU algorithm to compute the eigenvalues and eigenvectors. // The driver routine magma_(d/s)syev_gpu accepts a tensor on the CPU for eigvalenvalues. @@ -2635,7 +2635,7 @@ TORCH_CHECK(false, "Calling torch.linalg.eig on a CUDA tensor requires compiling Tensor rwork; value_t* rwork_data = nullptr; if (input.is_complex()) { - ScalarType real_dtype = toValueType(input.scalar_type()); + ScalarType real_dtype = toRealValueType(input.scalar_type()); rwork = at::empty({lda * 2}, input.options().dtype(real_dtype)); rwork_data = rwork.data_ptr(); } diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index bcf8afe2a37..470c3a48e5e 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -250,7 +250,7 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, auto in_sizes = self.sizes(); DimVector out_sizes(in_sizes.begin(), in_sizes.end()); out_sizes[dim.back()] = last_dim_size; - auto out = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); pocketfft::shape_t axes(dim.begin(), dim.end()); if (self.scalar_type() == kComplexFloat) { pocketfft::c2r(shape_from_tensor(out), stride_from_tensor(self), stride_from_tensor(out), axes, false, @@ -347,7 +347,7 @@ static DftiDescriptor _plan_mkl_fft( // precision const DFTI_CONFIG_VALUE prec = [&]{ - switch (c10::toValueType(dtype)) { + switch (c10::toRealValueType(dtype)) { case ScalarType::Float: return DFTI_SINGLE; case ScalarType::Double: return DFTI_DOUBLE; default: TORCH_CHECK(false, "MKL FFT doesn't support tensors of type: ", dtype); @@ -466,7 +466,7 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, batched_out_sizes[i + 1] = out_sizes[dim[i]]; } - const auto value_type = c10::toValueType(input.scalar_type()); + const auto value_type = c10::toRealValueType(input.scalar_type()); out.resize_(batched_out_sizes, MemoryFormat::Contiguous); auto descriptor = _plan_mkl_fft( @@ -523,7 +523,7 @@ Tensor _fft_c2r_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, auto in_sizes = input.sizes(); DimVector out_sizes(in_sizes.begin(), in_sizes.end()); out_sizes[dim.back()] = last_dim_size; - auto out = at::empty(out_sizes, self.options().dtype(c10::toValueType(self.scalar_type()))); + auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); return _exec_fft(out, input, out_sizes, dim, normalization, 
/*forward=*/false); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 824ff73091e..525edd98ea7 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2944,10 +2944,10 @@ - func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor variants: function, method - dispatch: - CompositeExplicitAutograd: amax + structured_delegate: amax.out - func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) + structured: True dispatch: CPU, CUDA: amax_out diff --git a/c10/core/ScalarType.h b/c10/core/ScalarType.h index a32d4aa4215..d805623efe6 100644 --- a/c10/core/ScalarType.h +++ b/c10/core/ScalarType.h @@ -307,7 +307,7 @@ static inline bool isUnderlying(ScalarType type, ScalarType qtype) { return type == toUnderlying(qtype); } -static inline ScalarType toValueType(ScalarType t) { +static inline ScalarType toRealValueType(ScalarType t) { switch (t) { case ScalarType::ComplexHalf: return ScalarType::Half; diff --git a/torch/csrc/TypeInfo.cpp b/torch/csrc/TypeInfo.cpp index b75f4fee423..08fd0323642 100644 --- a/torch/csrc/TypeInfo.cpp +++ b/torch/csrc/TypeInfo.cpp @@ -21,7 +21,7 @@ PyObject* THPFInfo_New(const at::ScalarType& type) { if (!self) throw python_error(); auto self_ = reinterpret_cast(self.get()); - self_->type = c10::toValueType(type); + self_->type = c10::toRealValueType(type); return self.release(); } diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 951b5eeca96..685e4876e88 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -908,7 +908,7 @@ Tensor renorm_backward(const Tensor & grad, const Tensor & self, const Scalar& p self, p, reduce_dims, /*keepdim=*/true); } - const auto real_acc_type = c10::toValueType(acc_type); + const auto real_acc_type = c10::toRealValueType(acc_type); auto grad_output = (self.conj() * grad); // vector_norm output is real, so grad_output must also be real if (real_acc_type != acc_type) { @@ -3281,7 +3281,7 @@ Tensor det_backward(const Tensor & grad, const Tensor& self, const Tensor& det) return svd_backward(u_grad, s_grad, vh_grad, u, s, vh); }; - auto eps = at::native::_get_epsilon(c10::toValueType(self.scalar_type())); + auto eps = at::native::_get_epsilon(c10::toRealValueType(self.scalar_type())); auto singular_det_cutoff = eps * at::linalg_matrix_norm(self); if (self.dim() == 2) { diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 0f79d01104a..201a208d8b9 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -917,7 +917,7 @@ class ShapePropagator : public PropertyPropBase { if (input_type->scalarType()) { const auto scalar_type = *(input_type->scalarType()); if (isComplexType(scalar_type)) { - const auto out_type = c10::toValueType(scalar_type); + const auto out_type = c10::toRealValueType(scalar_type); return type_vec_t{ input_type->dimensionedOnly()->withScalarType(out_type)}; } From 1f29b3130af218847a043e58fdc64511bbe072fe Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Tue, 15 Feb 2022 23:07:53 -0800 Subject: [PATCH 085/199] [FSDP] Implement apply() (#72600) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72600 Implements `apply()` which applies a `callable` of signature `f(m: Module) -> None` recursively to every submodule. 
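A minimal usage sketch, adapted from the test added in this patch (`MyModel` is a placeholder for any wrapped module, and a default process group is assumed to be initialized already):

```
import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

@torch.no_grad()
def init_linear_weights(m: nn.Module) -> None:
    # fn runs on every submodule; only nn.Linear layers are touched here.
    if isinstance(m, nn.Linear):
        m.weight.fill_(1.0)
        m.bias.fill_(1.0)

fsdp_model = FSDP(MyModel().cuda())    # MyModel is a placeholder module
fsdp_model.apply(init_linear_weights)  # full params are summoned before fn is applied
```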
The main difference from `nn.module.apply` is that this version summons the full parameters before apply() so it works appropriately with FSDP. ghstack-source-id: 149217423 Test Plan: CI Reviewed By: zhaojuanmao Differential Revision: D34111109 fbshipit-source-id: 60d9d3f5c4d6c27763f5d68728dfb0bae3d9f644 (cherry picked from commit b20c65e06070f27fda0e5260f5cbbb41e3e33f46) --- test/distributed/fsdp/test_fsdp_apply.py | 108 ++++++++++++++++++ .../fsdp/fully_sharded_data_parallel.py | 39 +++++++ 2 files changed, 147 insertions(+) create mode 100644 test/distributed/fsdp/test_fsdp_apply.py diff --git a/test/distributed/fsdp/test_fsdp_apply.py b/test/distributed/fsdp/test_fsdp_apply.py new file mode 100644 index 00000000000..d45fcada027 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_apply.py @@ -0,0 +1,108 @@ +import sys + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_fsdp import ( + FSDPTest, + NestedWrappedModule, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) +from torch.testing._internal.common_distributed import ( + skip_if_lt_x_gpu, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestApply(FSDPTest): + @property + def world_size(self): + return 2 + + @torch.no_grad() + def _init_linear_weights(self, m): + if type(m) == nn.Linear: + m.weight.fill_(1.0) + m.bias.fill_(1.0) + + @property + def process_group(self): + return dist.distributed_c10d._get_default_group() + + def check_weights(self, fsdp, expected_tensor_fn, check): + with fsdp._summon_full_params(recurse=True): + linear_modules = [ + module for module in fsdp.modules() if type(module) == nn.Linear + ] + for module in linear_modules: + for param in module.parameters(): + expected = expected_tensor_fn(param) + check(param, expected) + + def _check_apply(self, fsdp): + # Assert linear weights are not all 1.0 + self.check_weights( + fsdp, lambda param: torch.ones_like(param), self.assertNotEqual + ) + + fsdp.apply(self._init_linear_weights) + + # Ensure all weights are 1.0 + self.check_weights(fsdp, lambda param: torch.ones_like(param), self.assertEqual) + + @skip_if_lt_x_gpu(2) + def test_nested_module_apply(self): + """ + Checks apply() modifies weights appropriately on a nested FSDP instance. + """ + nested_module = NestedWrappedModule( + self.process_group, wrap_fsdp=True, wrap_everything=True + ) + fsdp_module = FSDP(nested_module, self.process_group).cuda(self.rank) + self._check_apply(fsdp_module) + + @skip_if_lt_x_gpu(2) + def test_transformer_module_apply(self): + """ + Checks apply() modifiees weights appropriately on a wrapped Transformer + module. 
+ """ + transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) + # Assert linear weights are not all 1.0 + self.check_weights( + transformer, lambda param: torch.ones_like(param), self.assertNotEqual + ) + transformer.apply(self._init_linear_weights) + # Assert all weights are 1.0 + self.check_weights( + transformer, lambda param: torch.ones_like(param), self.assertEqual + ) + + @skip_if_lt_x_gpu(2) + def test_apply_in_summon_raises_error(self): + """ + Ensures that if user calls apply() on FSDP instance within full param + summon context, appropriate error is raised. + """ + transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) + with transformer._summon_full_params(recurse=True): + with self.assertRaisesRegex(ValueError, "expected to be in states"): + transformer.apply(self._init_linear_weights) + + +if __name__ == "__main__": + run_tests() diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index d270230eba1..fe61684b69d 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -290,6 +290,45 @@ class FullyShardedDataParallel(nn.Module): assert isinstance(self._fsdp_wrapped_module, FlattenParamsWrapper) return self._fsdp_wrapped_module + def fsdp_modules(self) -> List["FullyShardedDataParallel"]: + """ + Helper function to return all nested FSDP instances, including self. + """ + fsdp_modules = [] + for module in self.modules(): + if isinstance(module, FullyShardedDataParallel): + fsdp_modules.append(module) + + return fsdp_modules + + def apply(self, fn: Callable[[nn.Module], None]) -> "FullyShardedDataParallel": + r"""Applies ``fn`` recursively to every submodule (as returned by ``.children()``) + as well as self. Typical use includes initializing the parameters of a model + (see also :ref:`nn-init-doc`). + + Compared to ``torch.nn.Module.apply``, this version additionally gathers + the full parameters before applying ``fn``. It should not be called from + within another ``summon_full_params`` context. + + Args: + fn (:class:`Module` -> None): function to be applied to each submodule + + Returns: + Module: self + """ + uninitialized = self._is_root is None + self._assert_state(TrainingState_.IDLE) + with self._summon_full_params(recurse=False): + ret = super().apply(fn) + + # Reset lazy init that might be called by summon_full_params, since + # it could have set is_root incorrectly for non-root FSDP instances. + if uninitialized and self._is_root: + for module in self.fsdp_modules(): + module._reset_lazy_init() + + return ret + # setting two factors 'self.gradient_predivide_factor' # and 'self.gradient_postdivide_factor' to avoid underflow and overflow def _get_gradient_predivide_factor(self, world_size: int) -> float: From 59dd84cab6ede977173cd48d64abf1bcf6b2fabb Mon Sep 17 00:00:00 2001 From: Andrew Gu Date: Wed, 16 Feb 2022 06:56:48 -0800 Subject: [PATCH 086/199] [Join][BE] Fix typo; remove obsolete method (#72886) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72886 **Test Plan** Searching for `_schedule_shadow_all_reduce_for_fwd_pass` shows that it is defined but never used. 
Test Plan: Imported from OSS Reviewed By: mrshenli Differential Revision: D34255651 Pulled By: awgu fbshipit-source-id: 205a0325c2cdc05e127a183cb86fa2fc2e0db99d (cherry picked from commit 4492f03a3f37c01efa281a6d09a7e3b673cb1139) --- torch/distributed/algorithms/join.py | 2 +- torch/nn/parallel/distributed.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/torch/distributed/algorithms/join.py b/torch/distributed/algorithms/join.py index 45d3eee938a..17fe5cce8c6 100644 --- a/torch/distributed/algorithms/join.py +++ b/torch/distributed/algorithms/join.py @@ -259,7 +259,7 @@ class Join(): f"{self._rank} has at least {WARN_THRESHOLD} " f"fewer inputs than other currently-active ranks. " "This level of skew could lead to performance " - "degradataion during training." + "degradation during training." ) # Shadow the all-reduce in non-joined processes num_nonjoined_procs = self._get_num_nonjoined_procs() diff --git a/torch/nn/parallel/distributed.py b/torch/nn/parallel/distributed.py index 73815305ca7..f1ffa4ea405 100644 --- a/torch/nn/parallel/distributed.py +++ b/torch/nn/parallel/distributed.py @@ -1105,14 +1105,6 @@ class DistributedDataParallel(Module, Joinable): super(DistributedDataParallel, self).train(mode) return self - # When running in join mode, schedules an allreduce to match the one in the - # forward pass to determine the no. of currently active processes and whether - # all processes have joined. - def _schedule_shadow_all_reduce_for_fwd_pass(self): - all_active_procs = torch.zeros(1, device=self.device) - dist.all_reduce(all_active_procs, group=self.process_group) - return all_active_procs.item() - # When running in join mode, schedules an allreduce to notify joined ranks # of whether backwards pass synchronization will run this iteraton or not. def _check_global_requires_backward_grad_sync(self, is_joined_rank): From 17b3ba148de5bca4da66b4045e451b245f916179 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 16 Feb 2022 10:14:27 -0500 Subject: [PATCH 087/199] Set `BLAS_LIBRARIES` to `${MKL_LIBRARIES}` for MKL case (#72806) This reverts [suggestion](https://github.com/pytorch/pytorch/pull/49647#discussion_r677737470) proposed to https://github.com/pytorch/pytorch/pull/49647 Which is somehow sufficient to workaround symptoms of https://github.com/pytorch/pytorch/issue/72653 I.e. before this change, `BLAS_LIBRARIES` were set to `caffe2::mkl` which is an interface library with link property set as follows: https://github.com/pytorch/pytorch/blob/59dd84cab6ede977173cd48d64abf1bcf6b2fabb/cmake/public/mkl.cmake#L10-L12 --- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a87131a992c..f3081cb48a1 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -216,7 +216,7 @@ elseif(BLAS STREQUAL "MKL") set(CAFFE2_USE_MKL ON) set(BLAS_INFO "mkl") set(BLAS_FOUND 1) - set(BLAS_LIBRARIES caffe2::mkl) + set(BLAS_LIBRARIES ${MKL_LIBRARIES}) else() message(WARNING "MKL could not be found. 
Defaulting to Eigen") set(CAFFE2_USE_EIGEN_FOR_BLAS ON) From ccdff4c4802c966f1b5d06a0ab2bb302f7643af0 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 16 Feb 2022 07:25:34 -0800 Subject: [PATCH 088/199] Revert D34111109: [FSDP] Implement apply() Test Plan: revert-hammer Differential Revision: D34111109 (https://github.com/pytorch/pytorch/commit/1f29b3130af218847a043e58fdc64511bbe072fe) Original commit changeset: 60d9d3f5c4d6 Original Phabricator Diff: D34111109 (https://github.com/pytorch/pytorch/commit/1f29b3130af218847a043e58fdc64511bbe072fe) fbshipit-source-id: d959533f656a1fa69b2af7c029130f674fdd6023 (cherry picked from commit b0d3e2b1c368dea84b94cfa2a06c9e02c5a66906) --- test/distributed/fsdp/test_fsdp_apply.py | 108 ------------------ .../fsdp/fully_sharded_data_parallel.py | 39 ------- 2 files changed, 147 deletions(-) delete mode 100644 test/distributed/fsdp/test_fsdp_apply.py diff --git a/test/distributed/fsdp/test_fsdp_apply.py b/test/distributed/fsdp/test_fsdp_apply.py deleted file mode 100644 index d45fcada027..00000000000 --- a/test/distributed/fsdp/test_fsdp_apply.py +++ /dev/null @@ -1,108 +0,0 @@ -import sys - -import torch -import torch.distributed as dist -import torch.nn as nn -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.testing._internal.common_fsdp import ( - FSDPTest, - NestedWrappedModule, -) -from torch.testing._internal.common_utils import ( - TEST_WITH_DEV_DBG_ASAN, - run_tests, -) -from torch.testing._internal.common_distributed import ( - skip_if_lt_x_gpu, -) - -if not dist.is_available(): - print("Distributed not available, skipping tests", file=sys.stderr) - sys.exit(0) - -if TEST_WITH_DEV_DBG_ASAN: - print( - "Skip dev-asan as torch + multiprocessing spawn have known issues", - file=sys.stderr, - ) - sys.exit(0) - - -class TestApply(FSDPTest): - @property - def world_size(self): - return 2 - - @torch.no_grad() - def _init_linear_weights(self, m): - if type(m) == nn.Linear: - m.weight.fill_(1.0) - m.bias.fill_(1.0) - - @property - def process_group(self): - return dist.distributed_c10d._get_default_group() - - def check_weights(self, fsdp, expected_tensor_fn, check): - with fsdp._summon_full_params(recurse=True): - linear_modules = [ - module for module in fsdp.modules() if type(module) == nn.Linear - ] - for module in linear_modules: - for param in module.parameters(): - expected = expected_tensor_fn(param) - check(param, expected) - - def _check_apply(self, fsdp): - # Assert linear weights are not all 1.0 - self.check_weights( - fsdp, lambda param: torch.ones_like(param), self.assertNotEqual - ) - - fsdp.apply(self._init_linear_weights) - - # Ensure all weights are 1.0 - self.check_weights(fsdp, lambda param: torch.ones_like(param), self.assertEqual) - - @skip_if_lt_x_gpu(2) - def test_nested_module_apply(self): - """ - Checks apply() modifies weights appropriately on a nested FSDP instance. - """ - nested_module = NestedWrappedModule( - self.process_group, wrap_fsdp=True, wrap_everything=True - ) - fsdp_module = FSDP(nested_module, self.process_group).cuda(self.rank) - self._check_apply(fsdp_module) - - @skip_if_lt_x_gpu(2) - def test_transformer_module_apply(self): - """ - Checks apply() modifiees weights appropriately on a wrapped Transformer - module. 
- """ - transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) - # Assert linear weights are not all 1.0 - self.check_weights( - transformer, lambda param: torch.ones_like(param), self.assertNotEqual - ) - transformer.apply(self._init_linear_weights) - # Assert all weights are 1.0 - self.check_weights( - transformer, lambda param: torch.ones_like(param), self.assertEqual - ) - - @skip_if_lt_x_gpu(2) - def test_apply_in_summon_raises_error(self): - """ - Ensures that if user calls apply() on FSDP instance within full param - summon context, appropriate error is raised. - """ - transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) - with transformer._summon_full_params(recurse=True): - with self.assertRaisesRegex(ValueError, "expected to be in states"): - transformer.apply(self._init_linear_weights) - - -if __name__ == "__main__": - run_tests() diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index fe61684b69d..d270230eba1 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -290,45 +290,6 @@ class FullyShardedDataParallel(nn.Module): assert isinstance(self._fsdp_wrapped_module, FlattenParamsWrapper) return self._fsdp_wrapped_module - def fsdp_modules(self) -> List["FullyShardedDataParallel"]: - """ - Helper function to return all nested FSDP instances, including self. - """ - fsdp_modules = [] - for module in self.modules(): - if isinstance(module, FullyShardedDataParallel): - fsdp_modules.append(module) - - return fsdp_modules - - def apply(self, fn: Callable[[nn.Module], None]) -> "FullyShardedDataParallel": - r"""Applies ``fn`` recursively to every submodule (as returned by ``.children()``) - as well as self. Typical use includes initializing the parameters of a model - (see also :ref:`nn-init-doc`). - - Compared to ``torch.nn.Module.apply``, this version additionally gathers - the full parameters before applying ``fn``. It should not be called from - within another ``summon_full_params`` context. - - Args: - fn (:class:`Module` -> None): function to be applied to each submodule - - Returns: - Module: self - """ - uninitialized = self._is_root is None - self._assert_state(TrainingState_.IDLE) - with self._summon_full_params(recurse=False): - ret = super().apply(fn) - - # Reset lazy init that might be called by summon_full_params, since - # it could have set is_root incorrectly for non-root FSDP instances. 
- if uninitialized and self._is_root: - for module in self.fsdp_modules(): - module._reset_lazy_init() - - return ret - # setting two factors 'self.gradient_predivide_factor' # and 'self.gradient_postdivide_factor' to avoid underflow and overflow def _get_gradient_predivide_factor(self, world_size: int) -> float: From f395a75c671a53f65f9b69cd4cc5a5dc8202e94e Mon Sep 17 00:00:00 2001 From: CodemodService FBSourceClangFormatLinterBot <> Date: Wed, 16 Feb 2022 07:32:33 -0800 Subject: [PATCH 089/199] [AutoAccept][Codemod][FBSourceClangFormatLinter] Daily `arc lint --take CLANGFORMAT` Reviewed By: zertosh Differential Revision: D34263670 fbshipit-source-id: 9479899031c817ad8cbefba30db7d0203804fd99 (cherry picked from commit c13e2138f40a3ee3262a1b63f57772f0783f5352) --- c10/core/TensorImpl.cpp | 22 +++++++++++-------- c10/core/TensorImpl.h | 6 +++-- c10/core/impl/LocalDispatchKeySet.h | 4 ++-- .../runtime/profiling_graph_executor_impl.h | 2 +- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index e3bbed52aa5..fad9dcb6fc3 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -120,11 +120,11 @@ TensorImpl::TensorImpl( // [Note: Python key removal] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// In most constructors for TensorImpl, you will see Python and PythonTLSSnapshot -// keys are removed from the passed in DispatchKeySet. Why? +// In most constructors for TensorImpl, you will see Python and +// PythonTLSSnapshot keys are removed from the passed in DispatchKeySet. Why? // -// INVARIANT: Python and PythonTLSSnapshot dispatch keys are set iff PyObject for -// the Tensor has a nontrivial __torch_dispatch__ implementation. +// INVARIANT: Python and PythonTLSSnapshot dispatch keys are set iff PyObject +// for the Tensor has a nontrivial __torch_dispatch__ implementation. // // When a fresh TensorImpl is created, there is *no* PyObject (this only gets // initialized lazily at the first point in time the Tensor passes into Python). @@ -148,9 +148,10 @@ TensorImpl::TensorImpl( numel_(0), data_type_(data_type), device_opt_(storage_.device()), - key_set_(key_set.remove( - DispatchKey::Python).remove( - DispatchKey::PythonTLSSnapshot)) { // See [Note: Python key removal] + key_set_( + key_set.remove(DispatchKey::Python) + .remove(DispatchKey::PythonTLSSnapshot)) { // See [Note: Python + // key removal] init_bitfields(); // Inference tensor doesn't have version counter. if (!is_inference()) { @@ -196,7 +197,9 @@ TensorImpl::TensorImpl( key_set = key_set | getAutocastRelatedKeySetFromBackend(k); key_set = - key_set.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); // See [Note: Python key removal] + key_set.remove(DispatchKey::Python) + .remove( + DispatchKey::PythonTLSSnapshot); // See [Note: Python key removal] // Inference tensor doesn't have autograd related keys. 
if (inference_mode) { @@ -554,7 +557,8 @@ void TensorImpl::copy_tensor_metadata_except_version_counter( dest_impl->storage_offset_ = src_impl->storage_offset_; dest_impl->data_type_ = src_impl->data_type_; dest_impl->device_opt_ = src_impl->device_opt_; - dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); + dest_impl->key_set_ = src_impl->key_set_.remove(DispatchKey::Python) + .remove(DispatchKey::PythonTLSSnapshot); dest_impl->is_contiguous_ = src_impl->is_contiguous_; dest_impl->has_contiguity_ = src_impl->has_contiguity_; dest_impl->is_channels_last_contiguous_ = diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 8ec099c2eab..4f6019a5ec3 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -1476,9 +1476,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { void set_python_dispatch(bool k) { if (k) { - key_set_ = key_set_.add(DispatchKey::Python).add(DispatchKey::PythonTLSSnapshot); + key_set_ = + key_set_.add(DispatchKey::Python).add(DispatchKey::PythonTLSSnapshot); } else { - key_set_ = key_set_.remove(DispatchKey::Python).remove(DispatchKey::PythonTLSSnapshot); + key_set_ = key_set_.remove(DispatchKey::Python) + .remove(DispatchKey::PythonTLSSnapshot); } } diff --git a/c10/core/impl/LocalDispatchKeySet.h b/c10/core/impl/LocalDispatchKeySet.h index 5ee622d433a..70af58b9571 100644 --- a/c10/core/impl/LocalDispatchKeySet.h +++ b/c10/core/impl/LocalDispatchKeySet.h @@ -119,8 +119,8 @@ class C10_API ExcludeDispatchKeyGuard { struct C10_API ForceDispatchKeyGuard { public: - ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) : - saved_keyset_(c10::impl::tls_local_dispatch_key_set()) { + ForceDispatchKeyGuard(c10::impl::LocalDispatchKeySet key_set) + : saved_keyset_(c10::impl::tls_local_dispatch_key_set()) { c10::impl::_force_tls_local_dispatch_key_set(key_set); } ~ForceDispatchKeyGuard() { diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h index 5ae3241d6f3..560eaca2cc3 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h @@ -1,6 +1,6 @@ #pragma once -#include #include +#include namespace torch { namespace jit { From 87975d895c3f52cb33f48757fabefbb3d3f276f2 Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Wed, 16 Feb 2022 08:04:21 -0800 Subject: [PATCH 090/199] [DataPipe] Improve .pyi generation (#72829) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72829 Make two functions more flexible and usable from a different repo. 
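A hedged sketch of how an external repo could now reuse the generator after this change (the argument values and paths below are illustrative, not taken from this patch, and assume the module is importable in that environment):

```
from torch.utils.data.gen_pyi import get_method_definitions

# Illustrative values; an external package would point these at its own DataPipe sources.
method_definitions = get_method_definitions(
    file_path="datapipes/iter",          # now accepts a single directory or a list of them
    files_to_exclude={"__init__.py"},
    deprecated_files=set(),
    default_output_type="IterDataPipe",
    method_to_special_output_type={},
    root="/path/to/external/repo",       # new argument: resolve paths from this root instead of torch/utils/data
)
```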
Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D34227912 Pulled By: NivekT fbshipit-source-id: 873934ed33caf485de7f56e9c4a1d3f3fa1a92ef (cherry picked from commit b990c5e4c7244e1d0352b24c475df2c0968ee1c0) --- torch/utils/data/gen_pyi.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/torch/utils/data/gen_pyi.py b/torch/utils/data/gen_pyi.py index f248da42574..11aa3333888 100644 --- a/torch/utils/data/gen_pyi.py +++ b/torch/utils/data/gen_pyi.py @@ -1,6 +1,6 @@ import os import pathlib -from typing import Dict, List, Set, Tuple +from typing import Dict, List, Set, Tuple, Union from tools.codegen.gen import FileManager @@ -136,19 +136,24 @@ def process_signature(line: str) -> str: return line -def get_method_definitions(file_path: str, +def get_method_definitions(file_path: Union[str, List[str]], files_to_exclude: Set[str], deprecated_files: Set[str], default_output_type: str, - method_to_special_output_type: Dict[str, str]) -> List[str]: + method_to_special_output_type: Dict[str, str], + root: str = "") -> List[str]: """ .pyi generation for functional DataPipes Process # 1. Find files that we want to process (exclude the ones who don't) # 2. Parse method name and signature # 3. Remove first argument after self (unless it is "*datapipes"), default args, and spaces """ - os.chdir(str(pathlib.Path(__file__).parent.resolve())) - file_paths = find_file_paths([file_path], + if root == "": + os.chdir(str(pathlib.Path(__file__).parent.resolve())) + else: + os.chdir(root) + file_path = [file_path] if isinstance(file_path, str) else file_path + file_paths = find_file_paths(file_path, files_to_exclude=files_to_exclude.union(deprecated_files)) methods_and_signatures, methods_and_class_names, methods_w_special_output_types = parse_datapipe_files(file_paths) @@ -165,24 +170,26 @@ def get_method_definitions(file_path: str, return method_definitions +# Defined outside of main() so they can be imported by TorchData +iterDP_file_path: str = "datapipes/iter" +iterDP_files_to_exclude: Set[str] = {"__init__.py", "utils.py"} +iterDP_deprecated_files: Set[str] = set() +iterDP_method_to_special_output_type: Dict[str, str] = {"demux": "List[IterDataPipe]", "fork": "List[IterDataPipe]"} + +mapDP_file_path: str = "datapipes/map" +mapDP_files_to_exclude: Set[str] = {"__init__.py", "utils.py"} +mapDP_deprecated_files: Set[str] = set() +mapDP_method_to_special_output_type: Dict[str, str] = {} + + def main() -> None: """ # Inject file into template dataset.pyi.in TODO: The current implementation of this script only generates interfaces for built-in methods. To generate interface for user-defined DataPipes, consider changing `IterDataPipe.register_datapipe_as_function`. 
""" - - iterDP_file_path: str = "datapipes/iter" - iterDP_files_to_exclude: Set[str] = {"__init__.py", "utils.py"} - iterDP_deprecated_files: Set[str] = set() - iterDP_method_to_special_output_type: Dict[str, str] = {"demux": "List[IterDataPipe]", "fork": "List[IterDataPipe]"} - iter_method_definitions = get_method_definitions(iterDP_file_path, iterDP_files_to_exclude, iterDP_deprecated_files, "IterDataPipe", iterDP_method_to_special_output_type) - mapDP_file_path: str = "datapipes/map" - mapDP_files_to_exclude: Set[str] = {"__init__.py", "utils.py"} - mapDP_deprecated_files: Set[str] = set() - mapDP_method_to_special_output_type: Dict[str, str] = {} map_method_definitions = get_method_definitions(mapDP_file_path, mapDP_files_to_exclude, mapDP_deprecated_files, "MapDataPipe", mapDP_method_to_special_output_type) From 5343cfe9491b3c489e423f2be55086ad2f8e40c5 Mon Sep 17 00:00:00 2001 From: Kim Juhyeong Date: Wed, 16 Feb 2022 08:33:43 -0800 Subject: [PATCH 091/199] Improve numerical stability of `torch.distributions.wishart.Wishart` (#72059) Summary: Maintanance of https://github.com/pytorch/pytorch/issues/70377 Multiple modifications of the merged initial implementation of Wishart distribution. cc neerajprad Key modifications: - `torch/distributions/wishart.py`: Clamp (Clip) float type values to calculate reciprocal in numerically stable manner, by using the `eps` value paired to each `torch.dtype` - `test/distributions/test_distributions.py`: Test Wishart distribution implementation in numerically unstable zones, i.e `df` values are at `ndim - 1 < df < ndim` where `ndim` is the one dimenstion of the Wishart parameter & sample matrix. Pull Request resolved: https://github.com/pytorch/pytorch/pull/72059 Reviewed By: H-Huang Differential Revision: D34245091 Pulled By: neerajprad fbshipit-source-id: 1cd653c1d5c663346433e84fd0bbe2e590790908 (cherry picked from commit ef1da3ba465247f5777c3c40a90b96955c4281d0) --- test/distributions/test_distributions.py | 107 ++++++++++++++--------- torch/distributions/exp_family.py | 2 +- torch/distributions/wishart.py | 53 ++++++----- 3 files changed, 98 insertions(+), 64 deletions(-) diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py index 1855c8434be..3f3d0168b71 100644 --- a/test/distributions/test_distributions.py +++ b/test/distributions/test_distributions.py @@ -34,6 +34,7 @@ import unittest from collections import namedtuple from itertools import product from random import shuffle +from packaging import version import torch @@ -2214,39 +2215,41 @@ class TestDistributions(TestCase): # We applied same tests in Multivariate Normal distribution for Wishart distribution def test_wishart_shape(self): - df = (torch.rand(5, requires_grad=True) + 1) * 10 - df_no_batch = (torch.rand([], requires_grad=True) + 1) * 10 - df_multi_batch = (torch.rand(6, 5, requires_grad=True) + 1) * 10 + ndim = 3 + + df = torch.rand(5, requires_grad=True) + ndim + df_no_batch = torch.rand([], requires_grad=True) + ndim + df_multi_batch = torch.rand(6, 5, requires_grad=True) + ndim # construct PSD covariance - tmp = torch.randn(3, 10) + tmp = torch.randn(ndim, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() # construct batch of PSD covariances - tmp = torch.randn(6, 5, 3, 10) + tmp = torch.randn(6, 5, ndim, 10) cov_batched = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() prec_batched = 
cov_batched.inverse() scale_tril_batched = torch.linalg.cholesky(cov_batched) # ensure that sample, batch, event shapes all handled correctly - self.assertEqual(Wishart(df, cov).sample().size(), (5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov).sample().size(), (3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov).sample().size(), (6, 5, 3, 3)) - self.assertEqual(Wishart(df, cov).sample((2,)).size(), (2, 5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov).sample((2,)).size(), (2, 3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov).sample((2,)).size(), (2, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, cov).sample((2, 7)).size(), (2, 7, 5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov).sample((2, 7)).size(), (2, 7, 3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df_no_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df_multi_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, precision_matrix=prec).sample((2, 7)).size(), (2, 7, 5, 3, 3)) - self.assertEqual(Wishart(df, precision_matrix=prec_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) - self.assertEqual(Wishart(df, scale_tril=scale_tril).sample((2, 7)).size(), (2, 7, 5, 3, 3)) - self.assertEqual(Wishart(df, scale_tril=scale_tril_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) + self.assertEqual(Wishart(df, cov).sample().size(), (5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov).sample().size(), (ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov).sample().size(), (6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov).sample((2,)).size(), (2, 5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov).sample((2,)).size(), (2, ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov).sample((2,)).size(), (2, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov).sample((2, 7)).size(), (2, 7, ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df_no_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df_multi_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, precision_matrix=prec).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) + self.assertEqual(Wishart(df, precision_matrix=prec_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, scale_tril=scale_tril).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) + self.assertEqual(Wishart(df, scale_tril=scale_tril_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) # check gradients # Modified and applied the same tests for multivariate_normal @@ -2272,14 +2275,18 @@ class TestDistributions(TestCase): wishart_log_prob_gradcheck(df_no_batch, None, None, scale_tril_batched) def test_wishart_stable_with_precision_matrix(self): - x = torch.randn(10) + ndim = 10 + x = torch.randn(ndim) P = torch.exp(-(x - x.unsqueeze(-1)) ** 2) # RBF kernel - Wishart(torch.tensor(10), precision_matrix=P) + Wishart(torch.tensor(ndim), precision_matrix=P) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_wishart_log_prob(self): - 
df = (torch.rand([], requires_grad=True) + 1) * 10 - tmp = torch.randn(3, 10) + ndim = 3 + df = torch.rand([], requires_grad=True) + ndim - 1 + if version.parse(scipy.__version__) < version.parse("1.7.0"): + df += 1. + tmp = torch.randn(ndim, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() @@ -2291,7 +2298,7 @@ class TestDistributions(TestCase): dist3 = Wishart(df, scale_tril=scale_tril) ref_dist = scipy.stats.wishart(df.item(), cov.detach().numpy()) - x = dist1.sample((10,)) + x = dist1.sample((1000,)) expected = ref_dist.logpdf(x.transpose(0, 2).numpy()) self.assertEqual(0.0, np.mean((dist1.log_prob(x).detach().numpy() - expected)**2), atol=1e-3, rtol=0) @@ -2299,14 +2306,14 @@ class TestDistributions(TestCase): self.assertEqual(0.0, np.mean((dist3.log_prob(x).detach().numpy() - expected)**2), atol=1e-3, rtol=0) # Double-check that batched versions behave the same as unbatched - df = (torch.rand(5, requires_grad=True) + 1) * 3 - tmp = torch.randn(5, 3, 10) + df = torch.rand(5, requires_grad=True) + ndim - 1 + tmp = torch.randn(5, ndim, 10) cov = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() dist_batched = Wishart(df, cov) dist_unbatched = [Wishart(df[i], cov[i]) for i in range(df.size(0))] - x = dist_batched.sample((10,)) + x = dist_batched.sample((1000,)) batched_prob = dist_batched.log_prob(x) unbatched_prob = torch.stack([dist_unbatched[i].log_prob(x[:, i]) for i in range(5)]).t() @@ -2316,28 +2323,34 @@ class TestDistributions(TestCase): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_wishart_sample(self): set_rng_seed(0) # see Note [Randomized statistical tests] - df = (torch.rand([], requires_grad=True) + 1) * 3 - tmp = torch.randn(3, 10) + ndim = 3 + df = torch.rand([], requires_grad=True) + ndim - 1 + if version.parse(scipy.__version__) < version.parse("1.7.0"): + df += 1. 
+ tmp = torch.randn(ndim, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() + ref_dist = scipy.stats.wishart(df.item(), cov.detach().numpy()) + self._check_sampler_sampler(Wishart(df, cov), - scipy.stats.wishart(df.item(), cov.detach().numpy()), + ref_dist, 'Wishart(df={}, covariance_matrix={})'.format(df, cov), multivariate=True) self._check_sampler_sampler(Wishart(df, precision_matrix=prec), - scipy.stats.wishart(df.item(), cov.detach().numpy()), + ref_dist, 'Wishart(df={}, precision_matrix={})'.format(df, prec), multivariate=True) self._check_sampler_sampler(Wishart(df, scale_tril=scale_tril), - scipy.stats.wishart(df.item(), cov.detach().numpy()), + ref_dist, 'Wishart(df={}, scale_tril={})'.format(df, scale_tril), multivariate=True) def test_wishart_properties(self): - df = (torch.rand([]) + 1) * 5 - scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(5, 5)) + ndim = 5 + df = torch.rand([]) + ndim - 1 + scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(ndim, ndim)) m = Wishart(df=df, scale_tril=scale_tril) self.assertEqual(m.covariance_matrix, m.scale_tril.mm(m.scale_tril.t())) self.assertEqual(m.covariance_matrix.mm(m.precision_matrix), torch.eye(m.event_shape[0])) @@ -2345,14 +2358,15 @@ class TestDistributions(TestCase): def test_wishart_moments(self): set_rng_seed(0) # see Note [Randomized statistical tests] - df = (torch.rand([]) + 1) * 3 - scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(3, 3)) + ndim = 3 + df = torch.rand([]) + ndim - 1 + scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(ndim, ndim)) d = Wishart(df=df, scale_tril=scale_tril) - samples = d.rsample((100000,)) + samples = d.rsample((ndim * ndim * 100000,)) empirical_mean = samples.mean(0) - self.assertEqual(d.mean, empirical_mean, atol=5, rtol=0) + self.assertEqual(d.mean, empirical_mean, atol=0.5, rtol=0) empirical_var = samples.var(0) - self.assertEqual(d.variance, empirical_var, atol=5, rtol=0) + self.assertEqual(d.variance, empirical_var, atol=0.5, rtol=0) def test_exponential(self): rate = torch.randn(5, 5).abs().requires_grad_() @@ -4617,8 +4631,15 @@ class TestAgainstScipy(TestCase): scipy.stats.weibull_min(c=positive_var2[0], scale=positive_var[0]) ), ( - Wishart(20 + positive_var[0], cov_tensor), # scipy var for Wishart only supports scalars - scipy.stats.wishart(20 + positive_var[0].item(), cov_tensor), + # scipy var for Wishart only supports scalars + Wishart( + (20 if version.parse(scipy.__version__) < version.parse("1.7.0") else 19) + positive_var[0], + cov_tensor, + ), + scipy.stats.wishart( + (20 if version.parse(scipy.__version__) < version.parse("1.7.0") else 19) + positive_var[0].item(), + cov_tensor, + ), ), ] diff --git a/torch/distributions/exp_family.py b/torch/distributions/exp_family.py index 669619d9db1..7084714ee3d 100644 --- a/torch/distributions/exp_family.py +++ b/torch/distributions/exp_family.py @@ -56,5 +56,5 @@ class ExponentialFamily(Distribution): gradients = torch.autograd.grad(lg_normal.sum(), nparams, create_graph=True) result += lg_normal for np, g in zip(nparams, gradients): - result -= np * g + result -= (np * g).reshape(self._batch_shape + (-1,)).sum(-1) return result diff --git a/torch/distributions/wishart.py b/torch/distributions/wishart.py index 0dd431a0f7b..04156915a4d 100644 --- a/torch/distributions/wishart.py +++ b/torch/distributions/wishart.py @@ -20,6 +20,10 @@ def _mvdigamma(x: 
torch.Tensor, p: int) -> torch.Tensor: - torch.arange(p, dtype=x.dtype, device=x.device).div(2).expand(x.shape + (-1,)) ).sum(-1) +def _clamp_with_eps(x: torch.Tensor) -> torch.Tensor: + # We assume positive input for this function + return x.clamp(min=torch.finfo(x.dtype).eps) + class Wishart(ExponentialFamily): r""" Creates a Wishart distribution parameterized by a symmetric positive definite matrix :math:`\Sigma`, @@ -27,8 +31,9 @@ class Wishart(ExponentialFamily): Example: >>> m = Wishart(torch.eye(2), torch.Tensor([2])) - >>> m.sample() #Wishart distributed with mean=`df * I` and - #variance(x_ij)=`df` for i != j and variance(x_ij)=`2 * df` for i == j + >>> m.sample() # Wishart distributed with mean=`df * I` and + # variance(x_ij)=`df` for i != j and variance(x_ij)=`2 * df` for i == j + Args: covariance_matrix (Tensor): positive-definite covariance matrix precision_matrix (Tensor): positive-definite precision matrix @@ -56,6 +61,7 @@ class Wishart(ExponentialFamily): } support = constraints.positive_definite has_rsample = True + _mean_carrier_measure = 0 def __init__(self, df: Union[torch.Tensor, Number], @@ -80,7 +86,7 @@ class Wishart(ExponentialFamily): event_shape = param.shape[-2:] if self.df.le(event_shape[-1] - 1).any(): - raise ValueError(f"Value of df={df} expected to be greater than ndim={event_shape[-1]-1}.") + raise ValueError(f"Value of df={df} expected to be greater than ndim - 1 = {event_shape[-1]-1}.") if scale_tril is not None: self.scale_tril = param.expand(batch_shape + (-1, -1)) @@ -119,9 +125,8 @@ class Wishart(ExponentialFamily): new = self._get_checked_instance(Wishart, _instance) batch_shape = torch.Size(batch_shape) cov_shape = batch_shape + self.event_shape - df_shape = batch_shape new._unbroadcasted_scale_tril = self._unbroadcasted_scale_tril.expand(cov_shape) - new.df = self.df.expand(df_shape) + new.df = self.df.expand(batch_shape) new._batch_dims = [-(x + 1) for x in range(len(batch_shape))] @@ -172,22 +177,25 @@ class Wishart(ExponentialFamily): @property def mean(self): - return self.df.view(self._batch_shape + (1, 1,)) * self.covariance_matrix + return self.df.view(self._batch_shape + (1, 1)) * self.covariance_matrix @property def variance(self): V = self.covariance_matrix # has shape (batch_shape x event_shape) diag_V = V.diagonal(dim1=-2, dim2=-1) - return self.df.view(self._batch_shape + (1, 1,)) * (V.pow(2) + torch.einsum("...i,...j->...ij", diag_V, diag_V)) + return self.df.view(self._batch_shape + (1, 1)) * (V.pow(2) + torch.einsum("...i,...j->...ij", diag_V, diag_V)) def _bartlett_sampling(self, sample_shape=torch.Size()): p = self._event_shape[-1] # has singleton shape # Implemented Sampling using Bartlett decomposition - noise = self._dist_chi2.rsample(sample_shape).sqrt().diag_embed(dim1=-2, dim2=-1) + noise = _clamp_with_eps( + self._dist_chi2.rsample(sample_shape).sqrt() + ).diag_embed(dim1=-2, dim2=-1) + i, j = torch.tril_indices(p, p, offset=-1) noise[..., i, j] = torch.randn( - torch.Size(sample_shape) + self._batch_shape + (int(p * (p - 1) / 2),), + torch.Size(sample_shape) + self._batch_shape + (int(0.5 * p * (p - 1)),), dtype=noise.dtype, device=noise.device, ) @@ -250,11 +258,11 @@ class Wishart(ExponentialFamily): nu = self.df # has shape (batch_shape) p = self._event_shape[-1] # has singleton shape return ( - - nu * p * _log_2 / 2 + - 0.5 * nu * p * _log_2 - nu * self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1) - - torch.mvlgamma(nu / 2, p=p) - + (nu - p - 1) / 2 * torch.linalg.slogdet(value).logabsdet - - 
torch.cholesky_solve(value, self._unbroadcasted_scale_tril).diagonal(dim1=-2, dim2=-1).sum(dim=-1) / 2 + - torch.mvlgamma(0.5 * nu, p=p) + + 0.5 * (nu - p - 1) * torch.linalg.slogdet(value).logabsdet + - 0.5 * torch.cholesky_solve(value, self._unbroadcasted_scale_tril).diagonal(dim1=-2, dim2=-1).sum(dim=-1) ) def entropy(self): @@ -263,19 +271,24 @@ class Wishart(ExponentialFamily): V = self.covariance_matrix # has shape (batch_shape x event_shape) return ( (p + 1) * self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1) - + p * (p + 1) * _log_2 / 2 - + torch.mvlgamma(nu / 2, p=p) - - (nu - p - 1) / 2 * _mvdigamma(nu / 2, p=p) - + nu * p / 2 + + 0.5 * p * (p + 1) * _log_2 + + torch.mvlgamma(0.5 * nu, p=p) + - 0.5 * (nu - p - 1) * _mvdigamma(0.5 * nu, p=p) + + 0.5 * nu * p ) @property def _natural_params(self): + nu = self.df # has shape (batch_shape) + p = self._event_shape[-1] # has singleton shape return ( - 0.5 * self.df, - 0.5 * self.precision_matrix, + 0.5 * (nu - p - 1), ) def _log_normalizer(self, x, y): - p = y.shape[-1] - return x * (- torch.linalg.slogdet(-2 * y).logabsdet + _log_2 * p) + _mvdigamma(x, p=p) + p = self._event_shape[-1] + return ( + (y + 0.5 * (p + 1)) * (- torch.linalg.slogdet(-2 * x).logabsdet + _log_2 * p) + + torch.mvlgamma(y + 0.5 * (p + 1), p=p) + ) From d79aec91f759acb00d9af4bf92f57ec752dd65b7 Mon Sep 17 00:00:00 2001 From: Pavithran Ramachandran Date: Wed, 16 Feb 2022 08:54:24 -0800 Subject: [PATCH 092/199] [easy][PTE] Reduce unnecessary ref count bumps in callstack debug (#72547) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72547 toTuple() returns a new intrusive pointer that bumps its underlying ref count. Whereas, toTupeRef returns a reference. We can save an unnecessary ref count bump. Based on https://fb.workplace.com/groups/pytorch.edge.team/permalink/1021780808376658/ similar to D34047666 (https://github.com/pytorch/pytorch/commit/85d7e73a8aa2dd74970017d11c7411b36b89dfc4) ghstack-source-id: 148665193 Test Plan: ``` > Executing task: buck: buck test //xplat/caffe2:test_lite_interpreter --config client.id=nuclide < Executing in directory: /data/users/pavithran/fbsource buck test //xplat/caffe2:test_lite_interpreter --config client.id=nuclide clang-9: warning: argument unused during compilation: '-pthread' [-Wunused-command-line-argument] Parsing buck files: finished in 2.1 sec Creating action graph: finished in 0.5 sec [RE] Metadata: Session ID=[reSessionID-66858379-0761-4966-a933-bc7f0d0add95] [RE] Waiting on 0 remote actions. Completed 523 actions remotely, action cache hit rate: 0.00%. Downloaded 3947/5089 artifacts, 20.92 Mbytes, 12.5% cache miss (for updated rules) Building: finished in 01:04.0 min (100%) 5438/5438 jobs, 5192/5438 updated Total time: 01:06.6 min Testing: finished in 06:53.7 min (71 PASS/0 FAIL) BUILD SUCCEEDED RESULTS FOR //xplat/caffe2:test_lite_interpreter PASS 406.0s 71 Passed 0 Skipped 0 Failed //xplat/caffe2:test_lite_interpreter TESTS PASSED Terminal will be reused by tasks, press any key to close it. 
```
Reviewed By: kimishpatel
Differential Revision: D34082609
fbshipit-source-id: 4bcbdb2d11dd4c3bc392010487dccd2270278222
(cherry picked from commit dd64eb386d02335e566fb6496f2ff00a8879ccc3)
---
 .../jit/serialization/callstack_debug_info_serialization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp
index b5aa951aa12..46384f49f67 100644
--- a/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp
+++ b/torch/csrc/jit/serialization/callstack_debug_info_serialization.cpp
@@ -222,7 +222,7 @@ ska::flat_hash_map CallStackDebugInfoUnpickler::
       {}, c10::parseType);
   ska::flat_hash_map callstack_ptrs;
-  auto ivalues = std::move(*std::move(ival).toTuple()).elements();
+  const auto& ivalues = ival.toTupleRef().elements();
   for (auto& val : ivalues) {
     const auto& tup_elems = val.toTupleRef().elements();
     TORCH_CHECK(

From 76df91215f4bb4dff38c89329a5122f5ca44f44e Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka
Date: Wed, 16 Feb 2022 09:17:22 -0800
Subject: [PATCH 093/199] [Pytorch Edge] Caffe2 Serialize files into independent target. Clean up function.cpp deps

Summary: It seemed strange to me that min_runtime_lib was dependent on the serialization headers but didn't have a dependency on their .cc. This puts them into their own target that contains both and then updates deps.

(Note: this ignores all push blocking failures!)

Test Plan: ci
Reviewed By: iseeyuan
Differential Revision: D34159900
fbshipit-source-id: 57102414be2439f5f4e3ed8ccd2b0c375b9de9b2
(cherry picked from commit c9ff2d2d9df72ad6a990986340b7934c8929861b)
---
 torch/csrc/jit/mobile/function.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/torch/csrc/jit/mobile/function.cpp b/torch/csrc/jit/mobile/function.cpp
index 7da992f6b70..3a7c11c0da0 100644
--- a/torch/csrc/jit/mobile/function.cpp
+++ b/torch/csrc/jit/mobile/function.cpp
@@ -1,5 +1,4 @@
 #include
-#include
 #include
 #include
 #include
@@ -7,7 +6,6 @@
 #include
 #include
 #include
-#include

 namespace torch {
 namespace jit {

From c1032bf0d155a45646d720f798f863b1b840b620 Mon Sep 17 00:00:00 2001
From: Nikita Shulga
Date: Wed, 16 Feb 2022 17:30:26 +0000
Subject: [PATCH 094/199] [.github] Fix typo in job name

Also add spaces, job name does not have to be a single word

Pull Request resolved: https://github.com/pytorch/pytorch/pull/72917
---
 .github/templates/android_ci_full_workflow.yml.j2 | 2 +-
 ...d-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/templates/android_ci_full_workflow.yml.j2 b/.github/templates/android_ci_full_workflow.yml.j2
index b89ae9fd94a..9736bee5c4e 100644
--- a/.github/templates/android_ci_full_workflow.yml.j2
+++ b/.github/templates/android_ci_full_workflow.yml.j2
@@ -61,7 +61,7 @@ on:
       !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a-build", "arm-v8a") }}
       !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32-build", "x86_32") }}
       !{{ common_android.build_android("pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64-build", "x86_64") }}
-      - name: Build-Final-Artifcact
+      - name: Build final artifact
        env:
          BRANCH: ${{ steps.parse-ref.outputs.branch }}
        run: |
diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml 
b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml index bccd46728c3..92da9655394 100644 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml +++ b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml @@ -361,7 +361,7 @@ jobs: docker cp "${container_name}:/var/lib/jenkins/workspace/dist" "${GITHUB_WORKSPACE}/." || echo "Dist folder not found" docker commit "${container_name}" "${COMMIT_DOCKER_IMAGE}" time docker push "${COMMIT_DOCKER_IMAGE}" - - name: Build-Final-Artifcact + - name: Build final artifact env: BRANCH: ${{ steps.parse-ref.outputs.branch }} run: | From 3493646f7636046c603921ef9a8b5c3fc635f39f Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 16 Feb 2022 17:38:17 +0000 Subject: [PATCH 095/199] [CircleCI] Re-enable nightly android builds A stop-gap measure to re-enable publishing of Android maven packages by CI, see https://github.com/pytorch/pytorch/issues/72902 Pull Request resolved: https://github.com/pytorch/pytorch/pull/72903 --- .../data/simple/android_definitions.py | 103 +++++ .../cimodel/data/simple/nightly_android.py | 77 ++++ .circleci/config.yml | 383 ++++++++++++++++++ .circleci/generate_config_yml.py | 5 + .../job-specs/binary-job-specs.yml | 1 - .../job-specs/pytorch-job-specs.yml | 229 +++++++++++ 6 files changed, 797 insertions(+), 1 deletion(-) create mode 100644 .circleci/cimodel/data/simple/android_definitions.py create mode 100644 .circleci/cimodel/data/simple/nightly_android.py create mode 100644 .circleci/verbatim-sources/job-specs/pytorch-job-specs.yml diff --git a/.circleci/cimodel/data/simple/android_definitions.py b/.circleci/cimodel/data/simple/android_definitions.py new file mode 100644 index 00000000000..fb6d6f5661b --- /dev/null +++ b/.circleci/cimodel/data/simple/android_definitions.py @@ -0,0 +1,103 @@ +import cimodel.data.simple.util.branch_filters as branch_filters +from cimodel.data.simple.util.docker_constants import ( + DOCKER_IMAGE_NDK, DOCKER_REQUIREMENT_NDK +) + + +class AndroidJob: + def __init__(self, + variant, + template_name, + is_master_only=True): + + self.variant = variant + self.template_name = template_name + self.is_master_only = is_master_only + + def gen_tree(self): + + base_name_parts = [ + "pytorch", + "linux", + "xenial", + "py3", + "clang5", + "android", + "ndk", + "r19c", + ] + self.variant + [ + "build", + ] + + full_job_name = "_".join(base_name_parts) + build_env_name = "-".join(base_name_parts) + + props_dict = { + "name": full_job_name, + "build_environment": "\"{}\"".format(build_env_name), + "docker_image": "\"{}\"".format(DOCKER_IMAGE_NDK), + "requires": [DOCKER_REQUIREMENT_NDK] + } + + if self.is_master_only: + props_dict["filters"] = branch_filters.gen_filter_dict(branch_filters.NON_PR_BRANCH_LIST) + + return [{self.template_name: props_dict}] + + +class AndroidGradleJob: + def __init__(self, + job_name, + template_name, + dependencies, + is_master_only=True, + is_pr_only=False, + extra_props=tuple()): + + self.job_name = job_name + self.template_name = template_name + self.dependencies = dependencies + self.is_master_only = is_master_only + self.is_pr_only = is_pr_only + self.extra_props = dict(extra_props) + + def gen_tree(self): + + props_dict = { + "name": self.job_name, + "requires": self.dependencies, + } + + if self.is_master_only: + props_dict["filters"] = branch_filters.gen_filter_dict(branch_filters.NON_PR_BRANCH_LIST) + elif self.is_pr_only: + props_dict["filters"] = 
branch_filters.gen_filter_dict(branch_filters.PR_BRANCH_LIST) + if self.extra_props: + props_dict.update(self.extra_props) + + return [{self.template_name: props_dict}] + + +WORKFLOW_DATA = [ + AndroidJob(["x86_32"], "pytorch_linux_build", is_master_only=False), + AndroidJob(["x86_64"], "pytorch_linux_build"), + AndroidJob(["arm", "v7a"], "pytorch_linux_build"), + AndroidJob(["arm", "v8a"], "pytorch_linux_build"), + AndroidGradleJob( + "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32", + "pytorch_android_gradle_build-x86_32", + ["pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build"], + is_master_only=False, + is_pr_only=True), + AndroidGradleJob( + "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build", + "pytorch_android_gradle_build", + ["pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build", + "pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build", + "pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build", + "pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build"]), +] + + +def get_workflow_jobs(): + return [item.gen_tree() for item in WORKFLOW_DATA] diff --git a/.circleci/cimodel/data/simple/nightly_android.py b/.circleci/cimodel/data/simple/nightly_android.py new file mode 100644 index 00000000000..c6da5bbc4c7 --- /dev/null +++ b/.circleci/cimodel/data/simple/nightly_android.py @@ -0,0 +1,77 @@ +from cimodel.data.simple.util.docker_constants import ( + DOCKER_IMAGE_NDK, + DOCKER_REQUIREMENT_NDK +) + + +class AndroidNightlyJob: + def __init__(self, + variant, + template_name, + extra_props=None, + with_docker=True, + requires=None, + no_build_suffix=False): + + self.variant = variant + self.template_name = template_name + self.extra_props = extra_props or {} + self.with_docker = with_docker + self.requires = requires + self.no_build_suffix = no_build_suffix + + def gen_tree(self): + + base_name_parts = [ + "pytorch", + "linux", + "xenial", + "py3", + "clang5", + "android", + "ndk", + "r19c", + ] + self.variant + + build_suffix = [] if self.no_build_suffix else ["build"] + full_job_name = "_".join(["nightly"] + base_name_parts + build_suffix) + build_env_name = "-".join(base_name_parts) + + props_dict = { + "name": full_job_name, + "requires": self.requires, + "filters": {"branches": {"only": "nightly"}}, + } + + props_dict.update(self.extra_props) + + if self.with_docker: + props_dict["docker_image"] = DOCKER_IMAGE_NDK + props_dict["build_environment"] = build_env_name + + return [{self.template_name: props_dict}] + +BASE_REQUIRES = [DOCKER_REQUIREMENT_NDK] + +WORKFLOW_DATA = [ + AndroidNightlyJob(["x86_32"], "pytorch_linux_build", requires=BASE_REQUIRES), + AndroidNightlyJob(["x86_64"], "pytorch_linux_build", requires=BASE_REQUIRES), + AndroidNightlyJob(["arm", "v7a"], "pytorch_linux_build", requires=BASE_REQUIRES), + AndroidNightlyJob(["arm", "v8a"], "pytorch_linux_build", requires=BASE_REQUIRES), + AndroidNightlyJob(["android_gradle"], "pytorch_android_gradle_build", + with_docker=False, + requires=[ + "nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build", + "nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build", + "nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build", + "nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build"]), + AndroidNightlyJob(["x86_32_android_publish_snapshot"], "pytorch_android_publish_snapshot", + extra_props={"context": "org-member"}, + with_docker=False, + 
requires=["nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_android_gradle_build"], + no_build_suffix=True), +] + + +def get_workflow_jobs(): + return [item.gen_tree() for item in WORKFLOW_DATA] diff --git a/.circleci/config.yml b/.circleci/config.yml index 1a4bfd3418e..7310a4f853c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -456,6 +456,234 @@ promote_common: &promote_common # Job specs ############################################################################## jobs: + pytorch_linux_build: + <<: *pytorch_params + machine: + image: ubuntu-2004:202104-01 + steps: + # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml + - checkout + - calculate_docker_image_tag + - setup_linux_system_environment + - optional_merge_target_branch + - setup_ci_environment + - run: + name: Build + no_output_timeout: "1h" + command: | + set -e + if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then + echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}" + fi + if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then + echo 'ATEN_THREADING=TBB' >> "${BASH_ENV}" + echo 'USE_TBB=1' >> "${BASH_ENV}" + elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then + echo 'ATEN_THREADING=NATIVE' >> "${BASH_ENV}" + fi + echo "Parallel backend flags: "${PARALLEL_FLAGS} + # Pull Docker image and run build + echo "DOCKER_IMAGE: "${DOCKER_IMAGE}:${DOCKER_TAG} + time docker pull ${DOCKER_IMAGE}:${DOCKER_TAG} >/dev/null + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}:${DOCKER_TAG}) + + git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 + + docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace + + export COMMAND='((echo "sudo chown -R jenkins workspace && export JOB_BASE_NAME="$CIRCLE_JOB" && cd workspace && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "$id" bash) 2>&1' + + echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts + + # Copy dist folder back + docker cp $id:/var/lib/jenkins/workspace/dist /home/circleci/project/. || echo "Dist folder not found" + + # Push intermediate Docker image for next phase to use + if [ -z "${BUILD_ONLY}" ]; then + # Note [Special build images] + # The xla build uses the same docker image as + # pytorch_linux_bionic_py3_6_clang9_build. In the push step, we have to + # distinguish between them so the test can pick up the correct image. 
+ output_image=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1} + if [[ ${BUILD_ENVIRONMENT} == *"xla"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-xla + elif [[ ${BUILD_ENVIRONMENT} == *"libtorch"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-libtorch + elif [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-paralleltbb + elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-parallelnative + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-x86_64"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-x86_64 + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-arm-v7a"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-arm-v7a + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-arm-v8a"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-arm-v8a + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-x86_32"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-x86_32 + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-vulkan-x86_32"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-vulkan-x86_32 + elif [[ ${BUILD_ENVIRONMENT} == *"vulkan-linux"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-vulkan + else + export COMMIT_DOCKER_IMAGE=$output_image + fi + docker commit "$id" ${COMMIT_DOCKER_IMAGE} + time docker push ${COMMIT_DOCKER_IMAGE} + fi + - run: + name: upload build & binary data + no_output_timeout: "5m" + command: | + cd /pytorch && export COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + python3 -mpip install requests && \ + SCRIBE_GRAPHQL_ACCESS_TOKEN=${SCRIBE_GRAPHQL_ACCESS_TOKEN} \ + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + - store_artifacts: + path: /home/circleci/project/dist + + pytorch_linux_test: + <<: *pytorch_params + machine: + image: ubuntu-2004:202104-01 + steps: + # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml + - checkout + - calculate_docker_image_tag + - setup_linux_system_environment + - setup_ci_environment + - run: + name: Download Docker image + no_output_timeout: "90m" + command: | + set -e + export PYTHONUNBUFFERED=1 + if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then + export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + fi + # See Note [Special build images] + output_image=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1} + if [[ ${BUILD_ENVIRONMENT} == *"xla"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-xla + elif [[ ${BUILD_ENVIRONMENT} == *"libtorch"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-libtorch + elif [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-paralleltbb + elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-parallelnative + elif [[ ${BUILD_ENVIRONMENT} == *"vulkan-linux"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-vulkan + else + export COMMIT_DOCKER_IMAGE=$output_image + fi + echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + + if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then + echo 'ATEN_THREADING=TBB' >> "${BASH_ENV}" + echo 'USE_TBB=1' >> "${BASH_ENV}" + elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then + echo 'ATEN_THREADING=NATIVE' >> "${BASH_ENV}" + fi + echo "Parallel backend flags: "${PARALLEL_FLAGS} + + time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null + + # TODO: Make this less painful + if [ -n "${USE_CUDA_DOCKER_RUNTIME}" ]; then + export id=$(docker run --env-file 
"${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --gpus all --shm-size=2g -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + elif [[ ${BUILD_ENVIRONMENT} == *"rocm"* ]]; then + hostname + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + else + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + fi + echo "id=${id}" >> "${BASH_ENV}" + + - run: + name: Check for no AVX instruction by default + no_output_timeout: "20m" + command: | + set -e + is_vanilla_build() { + if [ "${BUILD_ENVIRONMENT}" == "pytorch-linux-bionic-py3.7-clang9-test" ]; then + return 0 + fi + if [ "${BUILD_ENVIRONMENT}" == "pytorch-linux-xenial-py3.7-gcc5.4-test" ]; then + return 0 + fi + return 1 + } + + if is_vanilla_build; then + echo "apt-get update || apt-get install libgnutls30" | docker exec -u root -i "$id" bash + echo "apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash + echo "cd workspace/build; qemu-x86_64 -g 2345 -cpu Broadwell -E ATEN_CPU_CAPABILITY=default ./bin/basic --gtest_filter=BasicTest.BasicTestCPU & gdb ./bin/basic -ex 'set pagination off' -ex 'target remote :2345' -ex 'continue' -ex 'bt' -ex='set confirm off' -ex 'quit \$_isvoid(\$_exitcode)'" | docker exec -u jenkins -i "$id" bash + else + echo "Skipping for ${BUILD_ENVIRONMENT}" + fi + - run: + name: Test + no_output_timeout: "90m" + command: | + set -e + + cat >docker_commands.sh \<> docker_commands.sh + elif [[ ${BUILD_ENVIRONMENT} == *onnx* ]]; then + echo ".jenkins/caffe2/test.sh" >> docker_commands.sh + else + echo ".jenkins/pytorch/test.sh" >> docker_commands.sh + fi + echo "(cat docker_commands.sh | docker exec -u jenkins -i "$id" bash) 2>&1" > command.sh + unbuffer bash command.sh | ts + + - run: + name: Report results + no_output_timeout: "5m" + command: | + set -e + # Retrieving test results should be done as very first step as command never fails + # But is always executed if previous step fails for some reason + echo "Retrieving test reports" + docker cp $id:/var/lib/jenkins/workspace/test/test-reports ./ || echo 'No test reports found!' 
+ docker stats --all --no-stream + + cat >docker_commands.sh \<&1" > command.sh + unbuffer bash command.sh | ts + when: always + - store_test_results: + path: test-reports binary_linux_build: <<: *binary_linux_build_params steps: @@ -2269,6 +2497,70 @@ workflows: when: << pipeline.parameters.run_binary_tests >> build: jobs: + - pytorch_linux_build: + build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_linux_build: + build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_linux_build: + build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_linux_build: + build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_android_gradle_build-x86_32: + filters: + branches: + only: + - /gh\/.*\/head/ + - /pull\/.*/ + name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build-x86_32 + requires: + - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build + - pytorch_android_gradle_build: + filters: + branches: + only: + - master + - /ci-all\/.*/ + - /release\/.*/ + name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build + requires: + - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build + - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build + - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build + - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build - binary_linux_build: build_environment: manywheel 3.7m cu102 devtoolset7 docker_image: pytorch/manylinux-cuda102 @@ -2473,6 +2765,60 @@ workflows: requires: - pytorch_ios_full_jit_12_5_1_nightly_x86_64_build - pytorch_ios_full_jit_12_5_1_nightly_arm64_build + - pytorch_linux_build: + build_environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32 + docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c + filters: + branches: + only: nightly + name: nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_linux_build: + build_environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64 + docker_image: 
308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c + filters: + branches: + only: nightly + name: nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_linux_build: + build_environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a + docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c + filters: + branches: + only: nightly + name: nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_linux_build: + build_environment: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a + docker_image: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c + filters: + branches: + only: nightly + name: nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_android_gradle_build: + filters: + branches: + only: nightly + name: nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_android_gradle_build + requires: + - nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build + - nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build + - nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build + - nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build + - pytorch_android_publish_snapshot: + context: org-member + filters: + branches: + only: nightly + name: nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_android_publish_snapshot + requires: + - nightly_pytorch_linux_xenial_py3_clang5_android_ndk_r19c_android_gradle_build - anaconda_prune: name: anaconda-prune-pytorch-nightly context: "org-member" @@ -2654,9 +3000,43 @@ workflows: only: - postnightly executor: windows-with-nvidia-gpu + - docker_build_job: + name: "docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + image_name: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c" when: << pipeline.parameters.run_build >> master_build: jobs: + - pytorch_linux_build: + build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_32-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_linux_build: + build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-x86_64-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_linux_build: + build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v7a-build" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_linux_build: + build_environment: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-arm-v8a-build" + docker_image: 
"308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + name: pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build + requires: + - docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c + - pytorch_android_gradle_build: + name: pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-build + requires: + - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_32_build + - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_x86_64_build + - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v7a_build + - pytorch_linux_xenial_py3_clang5_android_ndk_r19c_arm_v8a_build - binary_linux_build: build_environment: manywheel 3.7m cu102 devtoolset7 docker_image: pytorch/manylinux-cuda102 @@ -2719,6 +3099,9 @@ workflows: name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_test requires: - binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_shared-with-deps_build + - docker_build_job: + name: "docker-pytorch-linux-xenial-py3-clang5-android-ndk-r19c" + image_name: "pytorch-linux-xenial-py3-clang5-android-ndk-r19c" when: << pipeline.parameters.run_master_build >> # Promotion workflow promote: diff --git a/.circleci/generate_config_yml.py b/.circleci/generate_config_yml.py index a801aa97848..581a32cd485 100755 --- a/.circleci/generate_config_yml.py +++ b/.circleci/generate_config_yml.py @@ -11,9 +11,11 @@ import sys from collections import namedtuple import cimodel.data.binary_build_definitions as binary_build_definitions +import cimodel.data.simple.android_definitions import cimodel.data.simple.binary_smoketest import cimodel.data.simple.docker_definitions import cimodel.data.simple.mobile_definitions +import cimodel.data.simple.nightly_android import cimodel.data.simple.nightly_ios import cimodel.data.simple.anaconda_prune_defintions import cimodel.lib.miniutils as miniutils @@ -135,9 +137,11 @@ def generate_required_docker_images(items): def gen_build_workflows_tree(): build_workflows_functions = [ + cimodel.data.simple.android_definitions.get_workflow_jobs, cimodel.data.simple.mobile_definitions.get_workflow_jobs, cimodel.data.simple.binary_smoketest.get_workflow_jobs, cimodel.data.simple.nightly_ios.get_workflow_jobs, + cimodel.data.simple.nightly_android.get_workflow_jobs, cimodel.data.simple.anaconda_prune_defintions.get_workflow_jobs, binary_build_definitions.get_post_upload_jobs, binary_build_definitions.get_binary_smoke_test_jobs, @@ -185,6 +189,7 @@ YAML_SOURCES = [ File("build-parameters/binary-build-params.yml"), File("build-parameters/promote-build-params.yml"), Header("Job specs"), + File("job-specs/pytorch-job-specs.yml"), File("job-specs/binary-job-specs.yml"), File("job-specs/job-specs-custom.yml"), File("job-specs/job-specs-promote.yml"), diff --git a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml index 581b76c8f94..5dd8dab85c9 100644 --- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml @@ -1,4 +1,3 @@ -jobs: binary_linux_build: <<: *binary_linux_build_params steps: diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml new file mode 100644 index 00000000000..79f879a13f0 --- /dev/null +++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml @@ -0,0 +1,229 @@ +jobs: + pytorch_linux_build: + <<: *pytorch_params + machine: + image: ubuntu-2004:202104-01 + steps: + # See Note [Workspace for 
CircleCI scripts] in job-specs-setup.yml + - checkout + - calculate_docker_image_tag + - setup_linux_system_environment + - optional_merge_target_branch + - setup_ci_environment + - run: + name: Build + no_output_timeout: "1h" + command: | + set -e + if [[ ${BUILD_ENVIRONMENT} == *"pure_torch"* ]]; then + echo 'BUILD_CAFFE2=OFF' >> "${BASH_ENV}" + fi + if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then + echo 'ATEN_THREADING=TBB' >> "${BASH_ENV}" + echo 'USE_TBB=1' >> "${BASH_ENV}" + elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then + echo 'ATEN_THREADING=NATIVE' >> "${BASH_ENV}" + fi + echo "Parallel backend flags: "${PARALLEL_FLAGS} + # Pull Docker image and run build + echo "DOCKER_IMAGE: "${DOCKER_IMAGE}:${DOCKER_TAG} + time docker pull ${DOCKER_IMAGE}:${DOCKER_TAG} >/dev/null + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -t -d -w /var/lib/jenkins ${DOCKER_IMAGE}:${DOCKER_TAG}) + + git submodule sync && git submodule update -q --init --recursive --depth 1 --jobs 0 + + docker cp /home/circleci/project/. $id:/var/lib/jenkins/workspace + + export COMMAND='((echo "sudo chown -R jenkins workspace && export JOB_BASE_NAME="$CIRCLE_JOB" && cd workspace && .jenkins/pytorch/build.sh && find ${BUILD_ROOT} -type f -name "*.a" -or -name "*.o" -delete") | docker exec -u jenkins -i "$id" bash) 2>&1' + + echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts + + # Copy dist folder back + docker cp $id:/var/lib/jenkins/workspace/dist /home/circleci/project/. || echo "Dist folder not found" + + # Push intermediate Docker image for next phase to use + if [ -z "${BUILD_ONLY}" ]; then + # Note [Special build images] + # The xla build uses the same docker image as + # pytorch_linux_bionic_py3_6_clang9_build. In the push step, we have to + # distinguish between them so the test can pick up the correct image. 
+ output_image=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1} + if [[ ${BUILD_ENVIRONMENT} == *"xla"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-xla + elif [[ ${BUILD_ENVIRONMENT} == *"libtorch"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-libtorch + elif [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-paralleltbb + elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-parallelnative + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-x86_64"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-x86_64 + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-arm-v7a"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-arm-v7a + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-arm-v8a"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-arm-v8a + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-x86_32"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-x86_32 + elif [[ ${BUILD_ENVIRONMENT} == *"android-ndk-r19c-vulkan-x86_32"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-android-vulkan-x86_32 + elif [[ ${BUILD_ENVIRONMENT} == *"vulkan-linux"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-vulkan + else + export COMMIT_DOCKER_IMAGE=$output_image + fi + docker commit "$id" ${COMMIT_DOCKER_IMAGE} + time docker push ${COMMIT_DOCKER_IMAGE} + fi + - run: + name: upload build & binary data + no_output_timeout: "5m" + command: | + cd /pytorch && export COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + python3 -mpip install requests && \ + SCRIBE_GRAPHQL_ACCESS_TOKEN=${SCRIBE_GRAPHQL_ACCESS_TOKEN} \ + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + - store_artifacts: + path: /home/circleci/project/dist + + pytorch_linux_test: + <<: *pytorch_params + machine: + image: ubuntu-2004:202104-01 + steps: + # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml + - checkout + - calculate_docker_image_tag + - setup_linux_system_environment + - setup_ci_environment + - run: + name: Download Docker image + no_output_timeout: "90m" + command: | + set -e + export PYTHONUNBUFFERED=1 + if [[ "${DOCKER_IMAGE}" == *rocm3.9* ]]; then + export DOCKER_TAG="f3d89a32912f62815e4feaeed47e564e887dffd6" + fi + # See Note [Special build images] + output_image=${DOCKER_IMAGE}:build-${DOCKER_TAG}-${CIRCLE_SHA1} + if [[ ${BUILD_ENVIRONMENT} == *"xla"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-xla + elif [[ ${BUILD_ENVIRONMENT} == *"libtorch"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-libtorch + elif [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-paralleltbb + elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-parallelnative + elif [[ ${BUILD_ENVIRONMENT} == *"vulkan-linux"* ]]; then + export COMMIT_DOCKER_IMAGE=$output_image-vulkan + else + export COMMIT_DOCKER_IMAGE=$output_image + fi + echo "DOCKER_IMAGE: "${COMMIT_DOCKER_IMAGE} + + if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then + echo 'ATEN_THREADING=TBB' >> "${BASH_ENV}" + echo 'USE_TBB=1' >> "${BASH_ENV}" + elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then + echo 'ATEN_THREADING=NATIVE' >> "${BASH_ENV}" + fi + echo "Parallel backend flags: "${PARALLEL_FLAGS} + + time docker pull ${COMMIT_DOCKER_IMAGE} >/dev/null + + # TODO: Make this less painful + if [ -n "${USE_CUDA_DOCKER_RUNTIME}" ]; then + export id=$(docker run --env-file 
"${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --gpus all --shm-size=2g -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + elif [[ ${BUILD_ENVIRONMENT} == *"rocm"* ]]; then + hostname + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=8g --ipc=host --device /dev/kfd --device /dev/dri --group-add video -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + else + export id=$(docker run --env-file "${BASH_ENV}" --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --shm-size=1g --ipc=host -t -d -w /var/lib/jenkins ${COMMIT_DOCKER_IMAGE}) + fi + echo "id=${id}" >> "${BASH_ENV}" + + - run: + name: Check for no AVX instruction by default + no_output_timeout: "20m" + command: | + set -e + is_vanilla_build() { + if [ "${BUILD_ENVIRONMENT}" == "pytorch-linux-bionic-py3.7-clang9-test" ]; then + return 0 + fi + if [ "${BUILD_ENVIRONMENT}" == "pytorch-linux-xenial-py3.7-gcc5.4-test" ]; then + return 0 + fi + return 1 + } + + if is_vanilla_build; then + echo "apt-get update || apt-get install libgnutls30" | docker exec -u root -i "$id" bash + echo "apt-get install -y qemu-user gdb" | docker exec -u root -i "$id" bash + echo "cd workspace/build; qemu-x86_64 -g 2345 -cpu Broadwell -E ATEN_CPU_CAPABILITY=default ./bin/basic --gtest_filter=BasicTest.BasicTestCPU & gdb ./bin/basic -ex 'set pagination off' -ex 'target remote :2345' -ex 'continue' -ex 'bt' -ex='set confirm off' -ex 'quit \$_isvoid(\$_exitcode)'" | docker exec -u jenkins -i "$id" bash + else + echo "Skipping for ${BUILD_ENVIRONMENT}" + fi + - run: + name: Test + no_output_timeout: "90m" + command: | + set -e + + cat >docker_commands.sh \<> docker_commands.sh + elif [[ ${BUILD_ENVIRONMENT} == *onnx* ]]; then + echo ".jenkins/caffe2/test.sh" >> docker_commands.sh + else + echo ".jenkins/pytorch/test.sh" >> docker_commands.sh + fi + echo "(cat docker_commands.sh | docker exec -u jenkins -i "$id" bash) 2>&1" > command.sh + unbuffer bash command.sh | ts + + - run: + name: Report results + no_output_timeout: "5m" + command: | + set -e + # Retrieving test results should be done as very first step as command never fails + # But is always executed if previous step fails for some reason + echo "Retrieving test reports" + docker cp $id:/var/lib/jenkins/workspace/test/test-reports ./ || echo 'No test reports found!' + docker stats --all --no-stream + + cat >docker_commands.sh \<&1" > command.sh + unbuffer bash command.sh | ts + when: always + - store_test_results: + path: test-reports From b02c514764c2384e8c3111f5c3dea8d51ba96a44 Mon Sep 17 00:00:00 2001 From: "Junjie Wang (PyTorch)" Date: Wed, 16 Feb 2022 09:36:08 -0800 Subject: [PATCH 096/199] [PT-D][Sharded Tensor] new init api for local tensor and sharding spec auto inference (#72733) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72733 To improve the perf cost due to communication in the process of init the sharded tensor. There are two changes in this PR/diff: 1. We create a new API named `_init_from_local_tensor` so that if we have only one local tensor, we can initiate a sharded tensor directly from it. (GH issue: https://github.com/pytorch/pytorch/issues/72092) 2. We create a new API to infer the sharding spec from global meta data, so we don't have to manually set the sharding spec when it's not `EnumerableShardingSpec`. 
(GH issue: https://github.com/pytorch/pytorch/issues/67244) ghstack-source-id: 149229259 Test Plan: CI Reviewed By: wanchaol Differential Revision: D34132739 fbshipit-source-id: 3a60135761bcc19d6020b6c45cb2979869645ce6 (cherry picked from commit af569325e2794309a4a86e51749642a062a25f6e) --- .../sharded_tensor/test_partial_tensor.py | 5 + .../sharded_tensor/test_sharded_tensor.py | 124 +++++++++++++ .../test_sharded_tensor_reshard.py | 5 + .../sharding_spec/test_sharding_spec.py | 79 +++++++++ torch/distributed/_shard/api.py | 7 +- .../sharded_tensor/_ops/elementwise_ops.py | 12 +- .../_shard/sharded_tensor/_ops/linear.py | 67 ++----- .../distributed/_shard/sharded_tensor/api.py | 163 +++++++++++++++--- .../_shard/sharding_spec/__init__.py | 1 + torch/distributed/_shard/sharding_spec/api.py | 68 ++++++++ 10 files changed, 432 insertions(+), 99 deletions(-) diff --git a/test/distributed/_shard/sharded_tensor/test_partial_tensor.py b/test/distributed/_shard/sharded_tensor/test_partial_tensor.py index 18418f8fb51..4eb5dc93c3d 100644 --- a/test/distributed/_shard/sharded_tensor/test_partial_tensor.py +++ b/test/distributed/_shard/sharded_tensor/test_partial_tensor.py @@ -17,6 +17,7 @@ from torch.testing._internal.common_distributed import ( ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, + run_tests, ) from torch.testing._internal.distributed._shard.sharded_tensor import ( ShardedTensorTestBase, @@ -106,3 +107,7 @@ class TestPartialTensorReshard(ShardedTensorTestBase): self._run_partial_tensor_n_reshard( spec, [13, 21], 3, dist.ReduceOp.SUM, dtype=torch.cfloat ) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py index cbad9458ae4..2aed32b3a85 100644 --- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py +++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor.py @@ -28,6 +28,10 @@ from torch.distributed._shard.sharding_spec import ( EnumerableShardingSpec, ShardMetadata, ) +from torch.distributed._shard.sharding_spec._internals import ( + get_split_size, + get_chunked_dim_size, +) from torch.distributed._shard.sharded_tensor.api import ( TensorProperties, _create_tensor_from_params, @@ -1781,6 +1785,126 @@ class TestShardedTensorEnumerable(ShardedTensorTestBase): self.assertEqual((5, 5), shard.tensor.size()) +class TestShardedTensorFromLocalTensor(ShardedTensorTestBase): + def _generate_st_from_chunk_local_tensor(self, st_size, sharding_spec): + chunk_sharding_dim = sharding_spec.dim + world_size = len(sharding_spec.placements) + split_size = get_split_size(st_size[chunk_sharding_dim], world_size) + # TODO: To use an easier way to generate the ShardedMetadatas + # for sharding spec after https://github.com/pytorch/pytorch/pull/72130. 
+ shards_metadata = [None] * world_size + local_shard_meta = None + for idx, placement in enumerate(sharding_spec.placements): + shard_size = copy.deepcopy(st_size) + offsets = [0] * len(st_size) + offsets[chunk_sharding_dim] = min( + split_size * idx, st_size[chunk_sharding_dim] + ) + shard_size[chunk_sharding_dim] = get_chunked_dim_size( + st_size[chunk_sharding_dim], split_size, idx + ) + shards_metadata[placement.rank()] = ShardMetadata( + shard_offsets=offsets, + shard_sizes=shard_size, + placement=placement, + ) + if placement.rank() == self.rank: + local_tensor_size = copy.deepcopy(shard_size) + local_shard_metadata = shards_metadata[placement.rank()] + + local_tensor = torch.rand(*local_tensor_size).cuda(self.rank) + st = ShardedTensor._init_from_local_tensor( + local_tensor, + sharding_spec, + st_size, + init_rrefs=True, + ) + self.assertEqual(tuple(st_size), st.size()) + self.assertEqual(1, len(st.local_shards())) + + # Verify local shard. + local_shard = st.local_shards()[0] + self.assertEqual(st.local_tensor(), local_tensor) + self.assertEqual(torch.device(f"cuda:{self.rank}"), local_shard.tensor.device) + + # Verify local shard metadata. + self.assertEqual( + local_shard_metadata.shard_offsets, local_shard.metadata.shard_offsets + ) + self.assertEqual( + local_shard_metadata.shard_sizes, local_shard.metadata.shard_sizes + ) + self.assertEqual(local_shard_metadata.placement, local_shard.metadata.placement) + + # Verify global metadata. + st_shards_metadata = st.metadata().shards_metadata + self.assertEqual(world_size, len(st_shards_metadata)) + self.assertEqual(shards_metadata, st_shards_metadata) + + # Validate remote shards. + remote_shards = st.remote_shards() + self.assertEqual(world_size - 1, len(remote_shards)) + for rpc_rank, shards in remote_shards.items(): + self.assertEqual(1, len(shards)) + for remote_shard in shards: + self.assertEqual(rpc_rank, remote_shard.owner().id) + # If remote shard does not exist, to_here() will throw exception. + if shards_metadata[rpc_rank].shard_sizes[chunk_sharding_dim]: + shard = remote_shard.to_here() + self.assertEqual( + shards_metadata[rpc_rank].shard_sizes, shard.tensor.size() + ) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_init_from_local_tensor(self): + chunk_specs = _chunk_sharding_specs_list_for_test([0, 1, 1, 0], seed=31) + for spec in chunk_specs: + self._generate_st_from_chunk_local_tensor([20, 5], spec) + self._generate_st_from_chunk_local_tensor([21, 7], spec) + self._generate_st_from_chunk_local_tensor([23, 16], spec) + self._generate_st_from_chunk_local_tensor([44, 6, 8], spec) + + @with_comms + @skip_if_lt_x_gpu(4) + @requires_nccl() + def test_init_from_local_tensor_errors(self): + enumerable_sharding_spec = EnumerableShardingSpec( + [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ] + ) + st_size = [24, 12] + local_tensor = torch.rand(*st_size).cuda(self.rank) + with self.assertRaisesRegex( + NotImplementedError, "Only ChunkShardingSpec is supported." + ): + ShardedTensor._init_from_local_tensor( + local_tensor, + enumerable_sharding_spec, + st_size, + ) + chunk_specs = _chunk_sharding_specs_list_for_test([0], seed=31) + with self.assertRaisesRegex( + ValueError, "local_tensor is not a contiguous Tensor." 
+ ): + ShardedTensor._init_from_local_tensor( + local_tensor.t(), + chunk_specs[0], + st_size, + ) + + class TestShardedTensorFromLocalShards(ShardedTensorTestBase): @with_comms(init_rpc=False) diff --git a/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py b/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py index 1a106772e67..b2276ade81b 100644 --- a/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py +++ b/test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py @@ -18,6 +18,7 @@ from torch.testing._internal.common_distributed import ( ) from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, + run_tests, ) from torch.testing._internal.distributed._shard.sharded_tensor import ( ShardedTensorTestBase, @@ -95,3 +96,7 @@ class TestReshard(ShardedTensorTestBase): NotImplementedError, "Only single local shard supported for reshard." ): st.reshard(reshard_spec) + + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/_shard/sharding_spec/test_sharding_spec.py b/test/distributed/_shard/sharding_spec/test_sharding_spec.py index d760b5499fd..5072f53c163 100644 --- a/test/distributed/_shard/sharding_spec/test_sharding_spec.py +++ b/test/distributed/_shard/sharding_spec/test_sharding_spec.py @@ -1,5 +1,6 @@ # Owner(s): ["oncall: distributed"] +import copy import torch from torch.testing._internal.common_utils import TestCase from torch.distributed._shard.sharding_spec import ( @@ -7,6 +8,7 @@ from torch.distributed._shard.sharding_spec import ( DevicePlacementSpec, EnumerableShardingSpec, ShardMetadata, + _infer_sharding_spec_from_shards_metadata, ) from torch.distributed._shard.sharding_spec._internals import ( check_tensor, @@ -19,6 +21,9 @@ from torch.testing._internal.common_utils import ( run_tests, sandcastle_skip_if, ) +from torch.testing._internal.distributed._shard.sharded_tensor._test_st_common import ( + _chunk_sharding_specs_list_for_test, +) class TestShardingSpec(TestCase): @@ -276,5 +281,79 @@ class TestShardingSpec(TestCase): self.assertEqual(0, result[0]) self.assertEqual(6, result[1]) + def _infer_enum_sharding_spec_case(self): + shards_metadata = [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[10, 5], + placement="cuda:1", + ) + ] + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, EnumerableShardingSpec)) + self.assertEqual(spec.shards, shards_metadata) + + shards_metadata = [ + ShardMetadata( + shard_offsets=[0, 0], + shard_sizes=[5, 5], + placement="rank:0/cuda:0", + ), + ShardMetadata( + shard_offsets=[5, 0], + shard_sizes=[5, 5], + placement="rank:1/cuda:1", + ), + ShardMetadata( + shard_offsets=[0, 5], + shard_sizes=[5, 5], + placement="rank:2/cuda:2", + ), + ShardMetadata( + shard_offsets=[5, 5], + shard_sizes=[5, 5], + placement="rank:3/cuda:3", + ), + ] + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, EnumerableShardingSpec)) + self.assertEqual(spec.shards, shards_metadata) + + def _infer_chunk_sharding_spec_case(self, placements, sharding_dim, st_size): + world_size = len(placements) + split_size = get_split_size(st_size[sharding_dim], world_size) + shards_metadata = [None] * world_size + for idx, placement in enumerate(placements): + shard_size = copy.deepcopy(st_size) + offsets = [0] * len(st_size) + offsets[sharding_dim] = split_size * idx + 
shard_size[sharding_dim] = get_chunked_dim_size(st_size[sharding_dim], split_size, idx) + shards_metadata[placement.rank()] = ShardMetadata( + shard_offsets=offsets, + shard_sizes=shard_size, + placement=placement, + ) + + spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) + self.assertTrue(isinstance(spec, ChunkShardingSpec)) + self.assertEqual(spec.dim, sharding_dim) + self.assertEqual(spec.placements, placements) + + def test_infer_sharding_spec_from_shards_metadata(self): + self._infer_enum_sharding_spec_case() + chunk_specs = _chunk_sharding_specs_list_for_test([0, 0, 1, 1], seed=31) + for spec in chunk_specs: + self._infer_chunk_sharding_spec_case(spec.placements, 0, [4, 16]) + self._infer_chunk_sharding_spec_case(spec.placements, 0, [5, 15, 16]) + self._infer_chunk_sharding_spec_case(spec.placements, 1, [12, 16]) + self._infer_chunk_sharding_spec_case(spec.placements, 2, [4, 18, 15]) + self._infer_chunk_sharding_spec_case(spec.placements, 3, [7, 12, 16, 37]) + self._infer_chunk_sharding_spec_case(spec.placements, 4, [50, 4, 18, 15, 77]) + if __name__ == '__main__': run_tests() diff --git a/torch/distributed/_shard/api.py b/torch/distributed/_shard/api.py index 18711d4dbdf..dceeb43143e 100644 --- a/torch/distributed/_shard/api.py +++ b/torch/distributed/_shard/api.py @@ -115,12 +115,7 @@ def _shard_tensor( ) ] - st = ShardedTensor._init_from_local_shards(local_shards, tensor.size(), process_group=pg) - - # Manually set sharding_spec - st._sharding_spec = sharding_spec - - return st + return ShardedTensor._init_from_local_shards(local_shards, tensor.size(), process_group=pg) def shard_parameter( module: torch.nn.Module, diff --git a/torch/distributed/_shard/sharded_tensor/_ops/elementwise_ops.py b/torch/distributed/_shard/sharded_tensor/_ops/elementwise_ops.py index 42bc29d3d42..988af0c55d0 100644 --- a/torch/distributed/_shard/sharded_tensor/_ops/elementwise_ops.py +++ b/torch/distributed/_shard/sharded_tensor/_ops/elementwise_ops.py @@ -1,5 +1,3 @@ -import copy - import torch from torch.distributed._shard.sharded_tensor import ( sharded_op_impl, @@ -23,16 +21,10 @@ def register_elementwise_op(op): local_shards_new = [] for local_shard in input.local_shards(): local_shards_new.append(Shard(op(local_shard.tensor), local_shard.metadata)) - # TODO: After a new API for sharded tensor creation, we need to replace this. 
- # https://github.com/pytorch/pytorch/issues/72092 - new_st = ShardedTensor._init_from_local_shards( - local_shards_new, input.size(), process_group=pg + return ShardedTensor._init_from_local_shards_and_global_metadata( + local_shards_new, input.metadata(), process_group=pg ) - # Manually set sharding_spec - new_st._sharding_spec = copy.deepcopy(input._sharding_spec) - return new_st - register_elementwise_op(torch.nn.functional.gelu) register_elementwise_op(torch.nn.functional.relu) diff --git a/torch/distributed/_shard/sharded_tensor/_ops/linear.py b/torch/distributed/_shard/sharded_tensor/_ops/linear.py index 772947a4c0a..6dc9e7374b4 100644 --- a/torch/distributed/_shard/sharded_tensor/_ops/linear.py +++ b/torch/distributed/_shard/sharded_tensor/_ops/linear.py @@ -1,4 +1,3 @@ -import copy from typing import List, cast import torch @@ -11,9 +10,7 @@ from torch.distributed.nn.functional import ( from torch.distributed._shard.sharded_tensor import ( sharded_op_impl, _PartialTensor, - Shard, ShardedTensor, - ShardMetadata, ) from torch.distributed._shard.sharding_spec import ChunkShardingSpec from torch.distributed._shard.sharding_spec._internals import ( @@ -218,8 +215,17 @@ def _handle_col_wise_sharding(input, world_size, weight, rank, local_shard_t, bi result = torch.stack(results) # type: ignore[arg-type] else: result = torch.cat(results) # type: ignore[arg-type] - return _init_sharded_tensor_from_local_result( - weight, result, 0, -1, world_size, pg # type: ignore[arg-type] + st_size = list(result.size()) + st_size[-1] = weight.size(0) + new_sharding_spec = ChunkShardingSpec( + dim=-1, + placements=weight.sharding_spec().placements + ) + return ShardedTensor._init_from_local_tensor( + result, + new_sharding_spec, + *st_size, # type: ignore[arg-type] + process_group=pg, ) @@ -340,57 +346,6 @@ def _handle_row_wise_sharding_sharded_tensor( return _PartialTensor(torch.cat(results), pg) -def _init_sharded_tensor_from_local_result( - sharded_tensor, - local_result, - tensor_shard_dim, - result_shard_dim, - world_size, - pg, -): - """ - Given a sharded tensor and local_result from an op on top of it. We want - to create a new sharded tensor from the local_result so that the the next - op can be performed on the basis of the new sharded tensor. This can seen - as the last step of the first phase of the Megatron-LM style model(tensor) - parallelism. - - Args: - sharded_tensor: Sharded tensor which the op was performed on. - local_result: A tensor which is from the op performed on the local_shard of - the sharded_tensor. - tensor_shard_dim: Dim which the tensor is sharded on. - result_shard_dim: Dim which the new sharded tensor will be sharded on. - world_size: number of ranks. - pg (ProcessGroup, optional): The process group to work on. If None, - the default process group will be used. - - Return: - A :class:`ShardedTensor` object which filled with local intermediate results. 
- """ - sharded_weight_metadata = copy.deepcopy(sharded_tensor.local_shards()[0].metadata) - current_offsets = [0] * local_result.dim() - current_offsets[result_shard_dim] = sharded_weight_metadata.shard_offsets[ - tensor_shard_dim - ] - global_size = list(local_result.size()) - global_size[result_shard_dim] = sharded_tensor.size(tensor_shard_dim) - local_shard_metadata = ShardMetadata( - shard_offsets=current_offsets, - shard_sizes=list(local_result.size()), - placement=sharded_weight_metadata.placement, - ) - local_shards = [Shard(local_result, local_shard_metadata)] - new_st = ShardedTensor._init_from_local_shards( - local_shards, tuple(global_size), process_group=pg - ) - - # Manually set sharding_spec - new_st._sharding_spec = copy.deepcopy(sharded_tensor._sharding_spec) - new_st._sharding_spec.dim = result_shard_dim - return new_st - - class _BiasTensorNarrow(Function): """ Since we now return the intermediate results in a col-wise sharding. We diff --git a/torch/distributed/_shard/sharded_tensor/api.py b/torch/distributed/_shard/sharded_tensor/api.py index 0e26ea9166c..670c5bd2443 100644 --- a/torch/distributed/_shard/sharded_tensor/api.py +++ b/torch/distributed/_shard/sharded_tensor/api.py @@ -5,6 +5,7 @@ from typing import ( Dict, List, Optional, + Sequence, Union ) import weakref @@ -20,6 +21,7 @@ from torch.distributed._shard.sharding_spec import ( EnumerableShardingSpec, ShardMetadata, ShardingSpec, + _infer_sharding_spec_from_shards_metadata, ) from torch.distributed._shard.sharding_spec._internals import ( check_tensor, @@ -357,15 +359,139 @@ class ShardedTensor(object): # add to metadata and local_shards sharded_tensor._metadata = global_sharded_tensor_metadata sharded_tensor._local_shards = local_shards - # make a EnumerableShardingSpec for sharded tensors that initialized from this API. - # TODO: make sharding spec a ChunkShardingSpec by inferring from the metadata list. - # see issue https://github.com/pytorch/pytorch/issues/67244 - sharded_tensor._sharding_spec = EnumerableShardingSpec(global_sharded_tensor_metadata.shards_metadata) + sharded_tensor._sharding_spec = _infer_sharding_spec_from_shards_metadata( + global_sharded_tensor_metadata.shards_metadata + ) # run post initialization, i.e. map registration, rpc initialization sharded_tensor._post_init() return sharded_tensor + @classmethod + def _init_from_local_tensor( + cls, + local_tensor: torch.Tensor, + sharding_spec: ShardingSpec, + *global_size: Sequence[int], + process_group: dist.ProcessGroup = None, + init_rrefs=False, + ) -> "ShardedTensor": + """ + Initialize a ShardedTensor given only one local tensor, global sharded tensor + size and sharding spec on each rank. + + Args: + local_tensor (Tensor): Single tensor of local shard stored in each rank. + sharding_spec (:class:`torch.distributed._shard.sharding_spec.ShardingSpec`): + The specification describing how to shard the Tensor. + global_size (Sequence[int]): Size of the sharded tensor. + process_group (ProcessGroup, optional): The process group to aggregate on. + Default: None + init_rrefs (bool, optional): Whether or not to initialize + :class:`torch.distributed.rpc.RRef`s pointing to remote shards. + Need to initialize the RPC Framework if specified as ``True``. + Default: ``False``. + + Returns: + A :class:`ShardedTensor` sharded based on the given sharding_spec with local + tensor stored in the current rank. + + Examples: + >>> # All tensors below are of torch.int64 type. + >>> # We have 2 process groups, 2 ranks. 
+ >>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank + >>> local_tensor = torch.unsqueeze(torch.cat([tensor, tensor + 2])) + >>> local_tensor + tensor([[1, 2, 3, 4]]) # Rank 0 + tensor([[3, 4, 5, 6]]) # Rank 1 + >>> sharding_dim = 0 + >>> sharding_spec = ChunkShardingSpec( + dim=sharding_dim, + placements=[ + "rank:0/cuda:0", + "rank:1/cuda:1", + ], + ) + >>> st = ShardedTensor._init_from_local_tensor(local_tensor, sharding_spec, [2, 4]) + >>> st + ShardedTensor( + ShardedTensorMetadata( + shards_metadata=[ + ShardMetadata(shard_offsets=[0, 0], shard_sizes=[1, 4], placement=rank:0/cuda:0), + ShardMetadata(shard_offsets=[1, 0], shard_sizes=[1, 4], placement=rank:1/cuda:1), + ], + size=torch.Size([2, 4]) + ) + >>> st.local_tensor() + tensor([1, 2, 3, 4]) # Rank 0 + tensor([3, 4, 5, 6]) # Rank 1 + + Warning: This API is experimental and subject to change. It lacks of a fully across + rank validations, and we only validate the local shard on the current rank. + We fully rely on the user to ensure local tensor is sharded based on the + sharding spec. + """ + if not isinstance(sharding_spec, ChunkShardingSpec): + raise NotImplementedError('Only ChunkShardingSpec is supported.') + if not local_tensor.is_contiguous(): + raise ValueError('local_tensor is not a contiguous Tensor.') + + process_group = ( + process_group + if process_group is not None + else distributed_c10d._get_default_group() + ) + current_rank = dist.get_rank(process_group) + world_size = dist.get_world_size(process_group) + + global_tensor_size = _flatten_tensor_size(global_size) + sharding_dim = sharding_spec.dim + split_size = get_split_size(global_tensor_size[sharding_dim], world_size) # type: ignore[index] + current_offsets = [0] * len(global_tensor_size) + gathered_metadatas = [None] * world_size + local_shards = [] + + for idx, placement in enumerate(sharding_spec.placements): + chunked_dim_size = get_chunked_dim_size( + global_tensor_size[sharding_dim], split_size, idx # type: ignore[index] + ) + shard_size = copy.deepcopy(global_tensor_size) + shard_size[sharding_spec.dim] = chunked_dim_size # type: ignore[index] + shard_metadata = ShardMetadata( + shard_offsets=copy.deepcopy(current_offsets), + shard_sizes=shard_size, + placement=placement, + ) + if current_rank == placement.rank(): # type: ignore[union-attr] + local_shard = local_tensor + else: + local_shard = torch.empty( + shard_size, + device=placement.device(), # type: ignore[union-attr] + requires_grad=local_tensor.requires_grad, + ) + shards = [ + Shard( + tensor=local_shard, + metadata=shard_metadata, # type: ignore[arg-type] + ) + ] + if current_rank == placement.rank(): # type: ignore[union-attr] + local_shards = shards + gathered_metadatas[placement.rank()] = build_metadata_from_local_shards( # type: ignore[call-overload, union-attr] + shards, global_tensor_size, placement.rank(), process_group # type: ignore[union-attr, arg-type] + ) + current_offsets[sharding_spec.dim] += chunked_dim_size # type: ignore[index] + + global_sharded_tensor_metadata = build_global_metadata(gathered_metadatas) + + return ShardedTensor._init_from_local_shards_and_global_metadata( + local_shards, + global_sharded_tensor_metadata, + process_group=process_group, + init_rrefs=init_rrefs, + ) + @classmethod def _init_from_local_shards_and_global_metadata( cls, @@ -455,10 +581,7 @@ class ShardedTensor(object): # done validation, add local_shards sharded_tensor._local_shards = local_shards - # make a EnumerableShardingSpec for sharded tensors that initialized from this API. 
- # TODO: make sharding spec a ChunkShardingSpec by inferring from the metadata list. - # see issue https://github.com/pytorch/pytorch/issues/67244 - sharded_tensor._sharding_spec = EnumerableShardingSpec(shards_metadata) + sharded_tensor._sharding_spec = _infer_sharding_spec_from_shards_metadata(shards_metadata) # run post initialization, i.e. map registration, rpc initialization sharded_tensor._post_init() @@ -953,23 +1076,9 @@ class _PartialTensor(object): ) sharded_tensor_size = self.local_shard.size() - current_offsets = [0] * len(local_result.size()) - shards = [] - rank = self.process_group.rank() - for idx, placement in enumerate(resharding_spec.placements): # type: ignore[attr-defined] - if rank == placement.rank(): # type: ignore[union-attr] - local_metadata = ShardMetadata( - shard_offsets=current_offsets, - shard_sizes=list(local_result.size()), - placement=placement, - ) - shards.append(Shard(local_result, local_metadata)) - break - current_offsets[sharding_dim] += local_result.size(sharding_dim) # type: ignore[index] - - st = ShardedTensor._init_from_local_shards( - shards, tuple(sharded_tensor_size), process_group=self.process_group + return ShardedTensor._init_from_local_tensor( + local_result, + resharding_spec, + sharded_tensor_size, + process_group=self.process_group, ) - st._sharding_spec = copy.deepcopy(resharding_spec) - - return st diff --git a/torch/distributed/_shard/sharding_spec/__init__.py b/torch/distributed/_shard/sharding_spec/__init__.py index f25c849559d..2fb2ff4fce9 100644 --- a/torch/distributed/_shard/sharding_spec/__init__.py +++ b/torch/distributed/_shard/sharding_spec/__init__.py @@ -5,4 +5,5 @@ from .api import ( PlacementSpec, ShardMetadata, ShardingSpec, + _infer_sharding_spec_from_shards_metadata, ) diff --git a/torch/distributed/_shard/sharding_spec/api.py b/torch/distributed/_shard/sharding_spec/api.py index 6f1d0b81ac4..813816171c8 100644 --- a/torch/distributed/_shard/sharding_spec/api.py +++ b/torch/distributed/_shard/sharding_spec/api.py @@ -110,3 +110,71 @@ class EnumerableShardingSpec(ShardingSpec): rank = len(shard.shard_offsets) validate_non_overlapping_shards_metadata(self.shards) + + +def _infer_sharding_spec_from_shards_metadata(shards_metadata): + """ + Infer the sharding spec from the metadata of each shard of a ShardedTensor. + If the tensor is sharded only on one dimension, we then assume it's a ChunkShardingSpec. + Otherwise, we assume it's enum sharded. + + Args: + shards_metadata (List[ShardMetadata]): List of Metadata of local shards. + + Returns: + A :class:`torch.distributed._shard.sharding_spec.ShardingSpec` object of sharding + spec for one sharded tensor. + """ + placements = [] + chunk_sharding_dim = None + chunk_offset_list = [] + shard_size_list = [] + # collect local shard metadatas from the global sharded_tensor_metadata + for shard_metadata in shards_metadata: # type: ignore[attr-defined] + placements.append(shard_metadata.placement) + local_offsets = shard_metadata.shard_offsets + chunk_offset_list.append(sum(local_offsets)) + shard_size_list.append(shard_metadata.shard_sizes) + shard_dims = [idx for idx, e in enumerate(local_offsets) if e != 0] + # If the offset is [0, 0, ..., 0] (all zeros), + # we cannot decide whether how the tensor is sharded. + if len(shard_dims) == 0: + continue + # If the offset is [0, N, .,0, M, 0, .., 0], + # we are sure it's sharded by more than one dimension. 
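    # Illustrative aside, not part of the original patch: for a global size
    # [4, 4] chunked across two ranks on dim 0, the per-shard offsets are
    # [0, 0] and [2, 0]; only dim 0 ever carries a non-zero offset, so
    # chunk_sharding_dim resolves to 0 and a ChunkShardingSpec(dim=0) is built
    # further down. A shard offset such as [2, 4] is non-zero in two dims, so
    # the check just below falls back to EnumerableShardingSpec instead.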
+ if len(shard_dims) != 1: + chunk_sharding_dim = None + break + # If the offset is [0, 0, .,0, M, 0, .., 0], aka, it's sharded by just + # one dimension, we need to make sure all ranks share the same dimension. + if not chunk_sharding_dim: + chunk_sharding_dim = shard_dims[0] + elif chunk_sharding_dim != shard_dims[0]: + chunk_sharding_dim = None + break + + if chunk_sharding_dim is not None: + # Ensure we infer the correct placement order from offsets + placements = [ + x for _, x in sorted(zip(chunk_offset_list, placements), key=lambda e: e[0]) + ] + chunk_spec = ChunkShardingSpec( + dim=chunk_sharding_dim, + placements=placements, + ) + shard_sizes = [ + x[chunk_sharding_dim] + for _, x in sorted(zip(chunk_offset_list, shard_size_list)) + ] + if len(shard_sizes) == 1 or ( + len(set(shard_sizes[:-1])) == 1 and shard_sizes[0] >= shard_sizes[-1] + ): + return chunk_spec + # Corner case when length = 5 and chunks = 4, local size is [2, 2, 1, 0] + if ( + len(set(shard_sizes[:-2])) == 1 + and shard_sizes[0] >= shard_sizes[-2] + and shard_sizes[-2] >= shard_sizes[-1] + ): + return chunk_spec + return EnumerableShardingSpec(shards_metadata) From 443a337e14bcb719206036fa2794ba54121994ff Mon Sep 17 00:00:00 2001 From: Yeounoh Chung Date: Wed, 16 Feb 2022 09:40:31 -0800 Subject: [PATCH 097/199] Create a CI workflow for XLA tests using the XLA test image (#72496) Summary: This PR resolves https://github.com/pytorch/pytorch/issues/72693 Pull Request resolved: https://github.com/pytorch/pytorch/pull/72496 Reviewed By: H-Huang Differential Revision: D34255441 Pulled By: seemethere fbshipit-source-id: fdfd54fbd59ef7266a78c9f729c1d5b6ed25e9d6 (cherry picked from commit ba14f0ee6cfa2fe248784d2dc5d54e427aef6bf7) --- .circleci/docker/build.sh | 6 + .github/generated-ciflow-ruleset.json | 8 +- .github/scripts/generate_ci_workflows.py | 21 +- .../scripts/generate_pytorch_test_matrix.py | 3 + .github/templates/linux_ci_workflow.yml.j2 | 23 + .github/workflows/generated-docker-builds.yml | 2 - .../generated-linux-bionic-py3.7-clang9.yml | 3 +- ...-pytorch-xla-linux-bionic-py3.7-clang8.yml | 507 ++++++++++++++++++ .jenkins/pytorch/build.sh | 9 +- 9 files changed, 570 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index bfab08d8bd0..f8767107435 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -40,6 +40,12 @@ function extract_all_from_image_name() { done } +# Use the same pre-built XLA test image from PyTorch/XLA +if [[ "$image" == *xla* ]]; then + echo "Using pre-built XLA test image..." 
+ exit 0 +fi + if [[ "$image" == *-xenial* ]]; then UBUNTU_VERSION=16.04 elif [[ "$image" == *-artful* ]]; then diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index 186441321a7..c13e357b645 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -44,6 +44,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", + "pytorch-xla-linux-bionic-py3.7-clang8", "win-vs2019-cpu-py3", "win-vs2019-cuda11.3-py3" ], @@ -105,6 +106,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", + "pytorch-xla-linux-bionic-py3.7-clang8", "win-vs2019-cpu-py3" ], "ciflow/cuda": [ @@ -201,7 +203,8 @@ "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", - "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit" + "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", + "pytorch-xla-linux-bionic-py3.7-clang8" ], "ciflow/macos": [ "ios-12-5-1-arm64", @@ -284,6 +287,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", + "pytorch-xla-linux-bionic-py3.7-clang8", "win-vs2019-cpu-py3", "win-vs2019-cuda11.3-py3" ], @@ -297,7 +301,7 @@ "win-vs2019-cuda11.3-py3" ], "ciflow/xla": [ - "linux-bionic-py3.7-clang9" + "pytorch-xla-linux-bionic-py3.7-clang8" ] }, "version": "v1" diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index f90690f2f95..4bcfed7c88d 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -625,9 +625,8 @@ LINUX_WORKFLOWS = [ num_test_shards=2, distributed_test=False, enable_noarch_test=1, - enable_xla_test=1, ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA, LABEL_CIFLOW_NOARCH}, + labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_NOARCH}, ), ), CIWorkflow( @@ -657,6 +656,22 @@ LINUX_WORKFLOWS = [ ), ] +XLA_WORKFLOWS = [ + CIWorkflow( + arch="linux", + build_environment="pytorch-xla-linux-bionic-py3.7-clang8", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/xla_base", + test_runner_type=LINUX_CPU_TEST_RUNNER, + num_test_shards=2, + distributed_test=False, + enable_xla_test=1, + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA}, + ), + ), + +] + ANDROID_SHORT_WORKFLOWS = [ CIWorkflow( arch="linux", @@ -832,7 +847,6 @@ MACOS_WORKFLOWS = [ ] DOCKER_IMAGES = { - f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.7-clang9", # for pytorch/xla f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.3.1-py3.7", # for rocm f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-rocm4.5-py3.7", # for rocm } @@ -1023,6 +1037,7 @@ def main() -> None: ) template_and_workflows = [ (jinja_env.get_template("linux_ci_workflow.yml.j2"), LINUX_WORKFLOWS), + 
(jinja_env.get_template("linux_ci_workflow.yml.j2"), XLA_WORKFLOWS), (jinja_env.get_template("windows_ci_workflow.yml.j2"), WINDOWS_WORKFLOWS), (jinja_env.get_template("bazel_ci_workflow.yml.j2"), BAZEL_WORKFLOWS), (jinja_env.get_template("ios_ci_workflow.yml.j2"), IOS_WORKFLOWS), diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py index 967f7222dd3..bd0f77d4aa4 100755 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ b/.github/scripts/generate_pytorch_test_matrix.py @@ -61,6 +61,7 @@ def run_as_if_on_trunk() -> bool: return current_workflow_triggered_by_label def main() -> None: + INCLUDE_DEFAULT_TEST = True TEST_RUNNER_TYPE = os.getenv('TEST_RUNNER_TYPE') assert TEST_RUNNER_TYPE is not None RUN_SMOKE_TESTS_ONLY_ON_PR = os.getenv('RUN_SMOKE_TESTS_ONLY_ON_PR') @@ -97,6 +98,7 @@ def main() -> None: configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} if os.getenv('ENABLE_XLA_TEST'): configs['xla'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} + INCLUDE_DEFAULT_TEST = False if os.getenv('ENABLE_NOARCH_TEST'): configs['noarch'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} if RUN_SMOKE_TESTS: @@ -110,6 +112,7 @@ def main() -> None: 'runner': TEST_RUNNER_TYPE, } for shard in range(1, NUM_TEST_SHARDS + 1) + if INCLUDE_DEFAULT_TEST ] + [ { 'config': name, diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 660c0a74ba5..2d978bab9dc 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -55,6 +55,11 @@ env: PR_NUMBER: ${{ github.event.pull_request.number }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} PYTORCH_RETRY_TEST_CASES: 1 +{%- if enable_xla_test == 1 %} + # This is used for XLA tests only + XLA_CUDA: 0 + XLA_IMAGE_TAG: v0.2 +{%- endif %} {%- if build_with_debug %} DEBUG: 1 {%- endif %} @@ -74,7 +79,19 @@ jobs: run: echo "${PR_LABELS}" !{{ common.setup_ec2_linux() }} !{{ common.checkout() }} + {%- if enable_xla_test == 1 %} + - name: Calculate docker image tag + id: calculate-tag + run: | + echo "XLA workflow uses pre-built test image at ${XLA_IMAGE_TAG}" + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" + echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${XLA_IMAGE_TAG}" >> "${GITHUB_ENV}" + echo "::set-output name=docker_tag::${DOCKER_TAG}" + echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${XLA_IMAGE_TAG}" + {%- else %} !{{ common.calculate_docker_image(false) }} + {%- endif %} - name: Pull Docker image run: | !{{ common.add_retry_to_env() }} @@ -96,6 +113,9 @@ jobs: -e BRANCH \ -e GITHUB_RUN_ID \ -e SCCACHE_BUCKET \ + {%- if enable_xla_test == 1 %} + -e XLA_CUDA \ + {%- endif %} -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ -e SKIP_SCCACHE_INITIALIZATION=1 \ @@ -307,6 +327,9 @@ jobs: -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ + {%- if enable_xla_test == 1 %} + -e XLA_CUDA \ + {%- endif %} -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ {%- if 'rocm' not in test_runner_type %} ${PROXY_ENV} \ diff --git a/.github/workflows/generated-docker-builds.yml b/.github/workflows/generated-docker-builds.yml index 785c65d45b9..0dc8ac9b98b 100644 --- a/.github/workflows/generated-docker-builds.yml +++ b/.github/workflows/generated-docker-builds.yml @@ -28,8 +28,6 @@ jobs: strategy: matrix: include: - - docker_image_base: 
'308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.7-clang9' - docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.7-clang9' - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7' docker_image_short_name: 'pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7' - docker_image_base: '308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-bionic-cuda11.5-cudnn8-py3-gcc7' diff --git a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml index 91e4ff63e4c..181b6b3be13 100644 --- a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml @@ -12,7 +12,6 @@ on: - 'ciflow/linux/*' - 'ciflow/noarch/*' - 'ciflow/trunk/*' - - 'ciflow/xla/*' branches: - master - release/* @@ -264,7 +263,7 @@ jobs: ENABLE_SLOW_TEST: '' ENABLE_DOCS_TEST: '' ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: 1 + ENABLE_XLA_TEST: '' ENABLE_NOARCH_TEST: 1 NUM_TEST_SHARDS: 2 MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu diff --git a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml new file mode 100644 index 00000000000..dceefb68815 --- /dev/null +++ b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml @@ -0,0 +1,507 @@ +# @generated DO NOT EDIT MANUALLY +# Template is at: .github/templates/linux_ci_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: pytorch-xla-linux-bionic-py3.7-clang8 + +on: + push: + tags: + - 'ciflow/all/*' + - 'ciflow/cpu/*' + - 'ciflow/linux/*' + - 'ciflow/trunk/*' + - 'ciflow/xla/*' + branches: + - master + - release/* + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: pytorch-xla-linux-bionic-py3.7-clang8 + DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + TORCH_CUDA_ARCH_LIST: 5.2 + IN_CI: 1 + IS_GHA: 1 + # This is used for the phase of adding wheel tests only, will be removed once completed + IN_WHEEL_TEST: 1 + # Used for custom_opertor, jit_hooks, custom_backend, see .jenkins/pytorch/build.sh + CUSTOM_TEST_ARTIFACT_BUILD_DIR: build/custom_test_artifacts + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + PYTORCH_RETRY_TEST_CASES: 1 + # This is used for XLA tests only + XLA_CUDA: 0 + XLA_IMAGE_TAG: v0.2 +concurrency: + group: pytorch-xla-linux-bionic-py3.7-clang8-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + + build: + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-build + outputs: + docker_image: ${{ steps.calculate-tag.outputs.docker_image }} + steps: + - name: print labels + run: echo "${PR_LABELS}" + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Calculate docker image tag + id: calculate-tag + run: | + echo "XLA workflow uses pre-built test image at ${XLA_IMAGE_TAG}" + DOCKER_TAG=$(git rev-parse HEAD:.circleci/docker) + echo "DOCKER_TAG=${DOCKER_TAG}" >> "${GITHUB_ENV}" + echo "DOCKER_IMAGE=${DOCKER_IMAGE_BASE}:${XLA_IMAGE_TAG}" >> "${GITHUB_ENV}" + echo "::set-output name=docker_tag::${DOCKER_TAG}" + echo "::set-output name=docker_image::${DOCKER_IMAGE_BASE}:${XLA_IMAGE_TAG}" + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Build + env: + BRANCH: ${{ steps.parse-ref.outputs.branch }} + run: | + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e JOB_BASE_NAME \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e IS_GHA \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e GITHUB_RUN_ID \ + -e SCCACHE_BUCKET \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e PR_LABELS \ + -e http_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e https_proxy="http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" -e no_proxy="localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + 
"${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c 'sudo chown -R jenkins . && .jenkins/pytorch/build.sh' + - name: Display and upload binary build size statistics (Click Me) + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + run: | + COMMIT_TIME=$(git log --max-count=1 --format=%ct || echo 0) + export COMMIT_TIME + pip3 install requests==2.26 boto3==1.16.34 + python3 -m tools.stats.upload_binary_size_to_scuba || exit 0 + - name: Chown workspace + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Archive artifacts into zip + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .pytorch-test-times.json + - uses: seemethere/upload-artifact-s3@v3 + name: Store PyTorch Build Artifacts on S3 + with: + name: ${{ env.BUILD_ENVIRONMENT }} + retention-days: 14 + if-no-files-found: error + path: + artifacts.zip + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Clean up docker images + if: always() + run: | + # Prune all of the docker images + docker system prune -af + + generate-test-matrix: + needs: build + runs-on: ubuntu-18.04 + timeout-minutes: 240 + env: + TEST_RUNNER_TYPE: linux.2xlarge + ENABLE_DISTRIBUTED_TEST: '' + ENABLE_JIT_LEGACY_TEST: '' + ENABLE_MULTIGPU_TEST: '' + ENABLE_NOGPU_NO_AVX_TEST: '' + ENABLE_NOGPU_NO_AVX2_TEST: '' + ENABLE_SLOW_TEST: '' + ENABLE_DOCS_TEST: '' + ENABLE_BACKWARDS_COMPAT_TEST: '' + ENABLE_XLA_TEST: 1 + ENABLE_NOARCH_TEST: '' + NUM_TEST_SHARDS: 2 + MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu + DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu + NOGPU_RUNNER_TYPE: linux.2xlarge + PR_BODY: ${{ github.event.pull_request.body }} + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} + ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} + container: + image: python:3.9 + steps: + - name: Install dependencies + run: pip install typing-extensions==3.10 + - name: Clone pytorch/pytorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + - name: Generating test matrix + id: set-matrix + run: .github/scripts/generate_pytorch_test_matrix.py + + test: + needs: [build, generate-test-matrix] + strategy: + matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + timeout-minutes: 240 + 
env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == 
*onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index 1fc4fecf2f8..01faa947634 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -209,10 +209,13 @@ else if [[ "$BUILD_ENVIRONMENT" != *libtorch* ]]; then - # ppc64le build fails when WERROR=1 + # ppc64le, rocm builds fail when WERROR=1 + # XLA test build fails when WERROR=1 # set only when building other architectures - # only use for "python setup.py install" line - if [[ "$BUILD_ENVIRONMENT" != *ppc64le* && "$BUILD_ENVIRONMENT" != *rocm* ]]; then + # or building non-XLA tests. 
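  # Illustrative note, not part of the original patch: with this change a
  # BUILD_ENVIRONMENT such as pytorch-xla-linux-bionic-py3.7-clang8 matches
  # *xla* below and the wheel is built without WERROR=1.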
+ if [[ "$BUILD_ENVIRONMENT" != *ppc64le* && + "$BUILD_ENVIRONMENT" != *rocm* && + "$BUILD_ENVIRONMENT" != *xla* ]]; then WERROR=1 python setup.py bdist_wheel else python setup.py bdist_wheel From ad623fdecf783243ca3b70ecc5e696a1d25876fe Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 16 Feb 2022 09:59:42 -0800 Subject: [PATCH 098/199] [PyTorch] MHA: add test for transform_bias_rescale_qkv (#72464) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72464 We had some trouble getting this component (and this test!) right, so let's test it. ghstack-source-id: 149201478 Test Plan: new test passes Reviewed By: zrphercule Differential Revision: D33992477 fbshipit-source-id: cc377eed5d4a4412b42bdabf360601c6e52947cf (cherry picked from commit 9832867b12e555b512ded16decbea17b1794bda8) --- aten/src/ATen/native/attention.cpp | 164 ++++++++++++--------- aten/src/ATen/native/cuda/attention.cu | 7 + aten/src/ATen/native/native_functions.yaml | 5 + test/test_nn.py | 38 +++++ 4 files changed, 145 insertions(+), 69 deletions(-) diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp index 26dca7ed4ed..2ff70ebcc2e 100644 --- a/aten/src/ATen/native/attention.cpp +++ b/aten/src/ATen/native/attention.cpp @@ -17,6 +17,89 @@ Tensor gemm_nt(const Tensor& a, const Tensor& b) { return at::native::matmul(a, b.t()); } +template +void transform_bias_rescale_qkv_inner_loop( + int64_t B, + int64_t T, + int64_t _3D, + int64_t D, + int64_t num_head, + int64_t dim_per_head, + scalar_t* qkv_data, + scalar_t* qkv_bias_data, + scalar_t* q_k_v_data, + scalar_t sqrt_dim_per_head, + int64_t begin, + int64_t end) { + for (auto i : c10::irange(begin, end)) { + auto t = i % T; + i /= T; + auto nh = i % num_head; + i /= num_head; + auto b = i; + using Vec = vec::Vectorized; + auto V = vec::Vectorized::size(); + auto dh = 0; + auto d = nh * dim_per_head; + for (; dh + V <= dim_per_head; dh += V, d += V) { + // load + auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]); + auto k_bias_data = Vec::loadu(&qkv_bias_data[d + 1 * D]); + auto v_bias_data = Vec::loadu(&qkv_bias_data[d + 2 * D]); + + auto q_data = + Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 0 * D]) + + q_bias_data; + auto k_data = + Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 1 * D]) + + k_bias_data; + auto v_data = + Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 2 * D]) + + v_bias_data; + + q_data = q_data / Vec(sqrt_dim_per_head); + + q_data.store(&q_k_v_data + [0 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + + t * dim_per_head + dh]); + k_data.store(&q_k_v_data + [1 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + + t * dim_per_head + dh]); + v_data.store(&q_k_v_data + [2 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + + t * dim_per_head + dh]); + } + for (; dh < dim_per_head; dh++) { + auto d = nh * dim_per_head + dh; + auto q_bias = qkv_bias_data[d + 0 * D]; + auto k_bias = qkv_bias_data[d + 1 * D]; + auto v_bias = qkv_bias_data[d + 2 * D]; + auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias; + auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias; + auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias; + q_data = q_data / sqrt_dim_per_head; + q_k_v_data[0 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + + t * dim_per_head + dh] = q_data; + q_k_v_data[1 * 
B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + + t * dim_per_head + dh] = k_data; + q_k_v_data[2 * B * num_head * T * dim_per_head + + b * num_head * T * dim_per_head + + nh * T * dim_per_head + + t * dim_per_head + dh] = v_data; + } + } +} + // compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias std::tuple transform_bias_rescale_qkv( const Tensor& qkv, @@ -27,6 +110,7 @@ std::tuple transform_bias_rescale_qkv( auto _3D = qkv.size(2); auto D = _3D / 3; TORCH_CHECK(D % num_head == 0); + TORCH_CHECK(_3D % 3 == 0); const auto dim_per_head = D / num_head; auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options()); @@ -45,75 +129,7 @@ std::tuple transform_bias_rescale_qkv( std::max(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1); parallel_for( 0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) { - for (auto i : c10::irange(begin, end)) { - auto t = i % T; - i /= T; - auto nh = i % num_head; - i /= num_head; - auto b = i; - using Vec = vec::Vectorized; - auto V = vec::Vectorized::size(); - auto dh = 0; - for (; dh < dim_per_head; dh += V) { - auto d = nh * dim_per_head + dh; - // load - auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]); - auto k_bias_data = Vec::loadu(&qkv_bias_data[d + 1 * D]); - auto v_bias_data = Vec::loadu(&qkv_bias_data[d + 2 * D]); - - auto q_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 0 * D]) + - q_bias_data; - auto k_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 1 * D]) + - k_bias_data; - auto v_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 2 * D]) + - v_bias_data; - - q_data = q_data / Vec(sqrt_dim_per_head); - - q_data.store(&q_k_v_data - [0 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh]); - k_data.store(&q_k_v_data - [1 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh]); - v_data.store(&q_k_v_data - [2 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh]); - } - if (dh != dim_per_head) { - for (dh = std::max(0, dh - V); dh < dim_per_head; dh++) { - auto d = nh * dim_per_head + dh; - auto q_bias = qkv_bias_data[d + 0 * D]; - auto k_bias = qkv_bias_data[d + 1 * D]; - auto v_bias = qkv_bias_data[d + 2 * D]; - auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias; - auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias; - auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias; - q_data = q_data / sqrt_dim_per_head; - q_k_v_data[0 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh] = q_data; - q_k_v_data[1 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh] = k_data; - q_k_v_data[2 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh] = v_data; - } - } - } + transform_bias_rescale_qkv_inner_loop(B, T, _3D, D, num_head, dim_per_head, qkv_data, qkv_bias_data, q_k_v_data, sqrt_dim_per_head, begin, end); }); }); auto q_k_v_s = @@ -234,6 +250,14 @@ void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) { } // namespace +std::tuple transform_bias_rescale_qkv_op_cpu( + const Tensor& qkv, + const Tensor& qkv_bias, + const int64_t num_head) { + auto result = 
transform_bias_rescale_qkv(qkv, qkv_bias, num_head); + return std::make_tuple(std::get<0>(result).clone(), std::get<1>(result).clone(), std::get<2>(result).clone()); +} + Tensor multi_head_self_attention_cpu( const Tensor& query, const Tensor& qkv_weight, @@ -251,6 +275,8 @@ Tensor multi_head_self_attention_cpu( TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor"); TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query"); TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal"); + TORCH_CHECK(qkv_bias.dim() == 1, "expected 2-dimensional qkv_bias, got ", qkv_bias.dim(), "-D tensor"); + TORCH_CHECK(qkv_bias.sizes()[0] == 3 * D, "expected qkv_bias first dim and first dim of query to be equal"); TORCH_CHECK(D % num_head == 0, "D must divide evenly by num_head"); #ifndef NDEBUG diff --git a/aten/src/ATen/native/cuda/attention.cu b/aten/src/ATen/native/cuda/attention.cu index 0b9414f3b67..a41e35575b3 100644 --- a/aten/src/ATen/native/cuda/attention.cu +++ b/aten/src/ATen/native/cuda/attention.cu @@ -215,6 +215,13 @@ void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) { } // namespace +std::tuple transform_bias_rescale_qkv_op_cuda( + const Tensor& qkv, + const Tensor& qkv_bias, + const int64_t num_head) { + auto result = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); + return std::make_tuple(std::get<0>(result).clone(), std::get<1>(result).clone(), std::get<2>(result).clone()); +} Tensor multi_head_self_attention_cuda( const Tensor& query, diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 525edd98ea7..0598f1cedfa 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2554,6 +2554,11 @@ CPU: multi_head_self_attention_cpu CUDA: multi_head_self_attention_cuda +- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_head) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: transform_bias_rescale_qkv_op_cpu + CUDA: transform_bias_rescale_qkv_op_cuda + - func: native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CPU: layer_norm_backward_cpu diff --git a/test/test_nn.py b/test/test_nn.py index d0c7d8e14a4..cec6b8ce0d9 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -17525,6 +17525,44 @@ class TestNNDeviceType(NNTestCase): self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) self._test_EmbeddingBag(device, 'mean', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) + @dtypesIfCUDA(torch.float) + @dtypes(torch.float) + def test_transform_bias_rescale_qkv(self, device, dtype): + # TODO: debug CPU test failure with settings (48, 4, 16, 8) and add that mode + tests = [ + (64, 4, 16, 8), + # dim_per_head = 12 does not divide evenly by CPU vectorization length of 8 + (24, 2, 4, 2) + ] + if "cuda" not in str(device): + # TODO: CUDA implementation doesn't work if size is too small. 
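            # Illustrative note, not part of the original patch: (2, 2, 2, 2)
            # gives embed_dim = 2 and dim_per_head = 1, smaller than the
            # 4-element vectorized loads the CUDA kernel assumes, hence the
            # CPU-only guard here.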
+ tests.append((2, 2, 2, 2)) + print(tests) + print(device) + for (embed_dim, num_heads, sl, bs) in tests: + x = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype) * 10 + qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype) + + with torch.no_grad(): + (q, k, v) = torch._transform_bias_rescale_qkv(x @ qkv.weight.t(), qkv.bias, num_head=num_heads) + + def simple_transform_bias_rescale_qkv(qkv, bias): + (q, k, v) = torch.split(qkv, embed_dim, dim=-1) + (q_bias, k_bias, v_bias) = torch.split(bias, embed_dim, dim=-1) + return tuple( + x.reshape((sl, bs, num_heads, embed_dim // num_heads)).transpose(2, 1) + for x in ( + (q + q_bias) / math.sqrt(embed_dim // num_heads), + (k + k_bias), + (v + v_bias) + ) + ) + correct_q, correct_k, correct_v = simple_transform_bias_rescale_qkv(x @ qkv.weight.t(), qkv.bias) + + self.assertEqual(q.size(), correct_q.size()) + self.assertTrue(torch.allclose(q, correct_q)) + self.assertTrue(torch.allclose(k, correct_k)) + self.assertTrue(torch.allclose(v, correct_v)) @onlyCUDA @dtypes(torch.half, torch.float, torch.double) From ae8198121c4e6ec8ef3e01843880a918f4ff943c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 16 Feb 2022 10:03:53 -0800 Subject: [PATCH 099/199] [PyTorch] Handle non-vectorizable parameters for native MHA CUDA rescale kernel (#72671) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72671 The existing kernel did not handle cases where D % 4 != 0 or dim_per_head % 4 != 0. Now we have a non-vectorized kernel for these cases. ghstack-source-id: 149201477 Test Plan: Updated test_nn to cover these cases. Reviewed By: zrphercule, ngimel Differential Revision: D34119371 fbshipit-source-id: 4e9b4d9b636224ef2c433593f6f236df040de782 (cherry picked from commit f5393878e4c16342ee62465bb656b18053000677) --- aten/src/ATen/native/cuda/attention.cu | 129 ++++++++++++++++--------- test/test_nn.py | 11 +-- 2 files changed, 91 insertions(+), 49 deletions(-) diff --git a/aten/src/ATen/native/cuda/attention.cu b/aten/src/ATen/native/cuda/attention.cu index a41e35575b3..5b09731c610 100644 --- a/aten/src/ATen/native/cuda/attention.cu +++ b/aten/src/ATen/native/cuda/attention.cu @@ -26,7 +26,9 @@ Tensor gemm_nt(const Tensor& a, const Tensor& b) { return at::native::matmul(a, b.t()); } -template +static constexpr int TRANSFORM_BIAS_RESCALE_VEC = 4; + +template __global__ void transform_bias_rescale_qkv_kernel( // [B, T, 3 * D] const PackedTensorAccessor64 qkv, @@ -44,57 +46,86 @@ __global__ void transform_bias_rescale_qkv_kernel( auto b = blockIdx.x / T; auto D = NH * DH; - constexpr int VEC = 4; const scalar_t sqrt_dim_per_head = std::sqrt(static_cast(DH)); - using LoadT = memory::aligned_vector; - // FIXME: assert ((D % VEC) == 0) + if (assume_aligned) { + constexpr int VEC = TRANSFORM_BIAS_RESCALE_VEC; + using LoadT = memory::aligned_vector; + for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) { + auto d = d_v * VEC; + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q[VEC]; + scalar_t qkv_bias_k[VEC]; + scalar_t qkv_bias_v[VEC]; + scalar_t qkv_q[VEC]; + scalar_t qkv_k[VEC]; + scalar_t qkv_v[VEC]; - for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) { - auto d = d_v * VEC; - auto nh = d / DH; - auto dh = d % DH; - scalar_t qkv_bias_q[VEC]; - scalar_t qkv_bias_k[VEC]; - scalar_t qkv_bias_v[VEC]; - scalar_t qkv_q[VEC]; - scalar_t qkv_k[VEC]; - scalar_t qkv_v[VEC]; - - *reinterpret_cast(&qkv_bias_q) = + // Here we require D % VEC == 0 for these vectorized loads. 
+ *reinterpret_cast(&qkv_bias_q) = *reinterpret_cast(&qkv_bias[d + 0 * D]); - *reinterpret_cast(&qkv_bias_k) = + *reinterpret_cast(&qkv_bias_k) = *reinterpret_cast(&qkv_bias[d + 1 * D]); - *reinterpret_cast(&qkv_bias_v) = + *reinterpret_cast(&qkv_bias_v) = *reinterpret_cast(&qkv_bias[d + 2 * D]); - *reinterpret_cast(&qkv_q) = + *reinterpret_cast(&qkv_q) = *reinterpret_cast(&qkv[b][t][d + 0 * D]); - *reinterpret_cast(&qkv_k) = + *reinterpret_cast(&qkv_k) = *reinterpret_cast(&qkv[b][t][d + 1 * D]); - *reinterpret_cast(&qkv_v) = + *reinterpret_cast(&qkv_v) = *reinterpret_cast(&qkv[b][t][d + 2 * D]); #pragma unroll - // TODO: specialize for float2half2/half2float2? - for (auto ii = 0; ii < VEC; ++ii) { - qkv_q[ii] = static_cast( - (static_cast(qkv_q[ii]) + - static_cast(qkv_bias_q[ii])) / - static_cast(sqrt_dim_per_head)); - qkv_k[ii] = static_cast( - (static_cast(qkv_k[ii]) + - static_cast(qkv_bias_k[ii]))); - qkv_v[ii] = static_cast( - (static_cast(qkv_v[ii]) + - static_cast(qkv_bias_v[ii]))); - } - *reinterpret_cast(&q_k_v[0][b][nh][t][dh]) = + // TODO: specialize for float2half2/half2float2? + for (auto ii = 0; ii < VEC; ++ii) { + qkv_q[ii] = static_cast( + (static_cast(qkv_q[ii]) + + static_cast(qkv_bias_q[ii])) / + static_cast(sqrt_dim_per_head)); + qkv_k[ii] = static_cast( + (static_cast(qkv_k[ii]) + + static_cast(qkv_bias_k[ii]))); + qkv_v[ii] = static_cast( + (static_cast(qkv_v[ii]) + + static_cast(qkv_bias_v[ii]))); + } + + // Here we require DH % VEC == 0 for these vectorized stores. + *reinterpret_cast(&q_k_v[0][b][nh][t][dh]) = *reinterpret_cast(&qkv_q); - *reinterpret_cast(&q_k_v[1][b][nh][t][dh]) = + *reinterpret_cast(&q_k_v[1][b][nh][t][dh]) = *reinterpret_cast(&qkv_k); - *reinterpret_cast(&q_k_v[2][b][nh][t][dh]) = + *reinterpret_cast(&q_k_v[2][b][nh][t][dh]) = *reinterpret_cast(&qkv_v); + } + } else { + // Same as above, but we can't vectorize memory access. 
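    // Illustrative note, not part of the original patch: this branch runs when
    // dim_per_head % 4 != 0 (e.g. dim_per_head = 6 in the updated test), so
    // each thread handles one element per iteration instead of a packed
    // 4-wide load/store.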
+ for (int32_t d = threadIdx.x; d < D; d += blockDim.x) { + auto nh = d / DH; + auto dh = d % DH; + scalar_t qkv_bias_q = qkv_bias[d + 0 * D]; + scalar_t qkv_bias_k = qkv_bias[d + 1 * D]; + scalar_t qkv_bias_v = qkv_bias[d + 2 * D]; + scalar_t qkv_q = qkv[b][t][d + 0 * D]; + scalar_t qkv_k = qkv[b][t][d + 1 * D]; + scalar_t qkv_v = qkv[b][t][d + 2 * D]; + qkv_q = static_cast( + (static_cast(qkv_q) + + static_cast(qkv_bias_q)) / + static_cast(sqrt_dim_per_head)); + qkv_k = static_cast( + (static_cast(qkv_k) + + static_cast(qkv_bias_k))); + qkv_v = static_cast( + (static_cast(qkv_v) + + static_cast(qkv_bias_v))); + + q_k_v[0][b][nh][t][dh] = qkv_q; + q_k_v[1][b][nh][t][dh] = qkv_k; + q_k_v[2][b][nh][t][dh] = qkv_v; + } } } @@ -110,6 +141,12 @@ std::tuple transform_bias_rescale_qkv( TORCH_CHECK(D % num_head == 0); const auto dim_per_head = D / num_head; auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options()); +#define CALL_KERNEL(assume_aligned) \ + transform_bias_rescale_qkv_kernel \ + <<>>( \ + qkv.packed_accessor64(), \ + qkv_bias.packed_accessor64(), \ + q_k_v.packed_accessor64()) AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::Half, ScalarType::BFloat16, @@ -117,15 +154,21 @@ std::tuple transform_bias_rescale_qkv( "transform_bias_rescale_qkv", [&] { using accscalar_t = acc_type; - auto threads = std::min(1024, D / 4); + auto threads = std::max(std::min(1024, D / TRANSFORM_BIAS_RESCALE_VEC), 1); auto blocks = B * T; - transform_bias_rescale_qkv_kernel - <<>>( - qkv.packed_accessor64(), - qkv_bias.packed_accessor64(), - q_k_v.packed_accessor64()); + if (dim_per_head % TRANSFORM_BIAS_RESCALE_VEC == 0) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + D % TRANSFORM_BIAS_RESCALE_VEC == 0, + "D = num_heads * dim_per_head, so we should have dim_per_head % " + "TRANSFORM_BIAS_RESCALE_VEC == 0 => " + "D % TRANSFORM_BIAS_RESCALE_VEC == 0"); + CALL_KERNEL(true); + } else { + CALL_KERNEL(false); + } C10_CUDA_KERNEL_LAUNCH_CHECK(); }); +#undef CALL_KERNEL auto q_k_v_s = at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); diff --git a/test/test_nn.py b/test/test_nn.py index cec6b8ce0d9..cdfddcc2d21 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -17532,13 +17532,12 @@ class TestNNDeviceType(NNTestCase): tests = [ (64, 4, 16, 8), # dim_per_head = 12 does not divide evenly by CPU vectorization length of 8 - (24, 2, 4, 2) + (24, 2, 4, 2), + # Make sure CUDA can handle small input sizes + (2, 2, 2, 2), + # dim_per_head = 6 does not divide evenly by CUDA vectorization length of 4, causes alignment issues + (24, 4, 4, 2) ] - if "cuda" not in str(device): - # TODO: CUDA implementation doesn't work if size is too small. - tests.append((2, 2, 2, 2)) - print(tests) - print(device) for (embed_dim, num_heads, sl, bs) in tests: x = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype) * 10 qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype) From 3842140fd5eb6dcb7eaab5628a79a9093aac5a24 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Wed, 16 Feb 2022 10:13:55 -0800 Subject: [PATCH 100/199] Update lazy_ir.py from lazy_tensor_staging (#72730) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72730 This diff contains changes from several PRs landed to lazy_tensor_staging branch. 
- generating 'fallback' overrides for each codegenned op, useful for debugging - supports operators which are missing aten:: symbols for op names, instead using their string counterpart - makes the IR class a base class instead of hardcoding the assumption of TS Test Plan: tested on lazy_tensor_staging branch Reviewed By: desertfire Differential Revision: D34178476 fbshipit-source-id: 7190b2e0d82b4eb1f4510c858c24446c6df3f9d0 (cherry picked from commit 6713d3f0ef1bb0ea5e2acfad2ea252cf73d1cbbf) --- test/cpp/lazy/test_cache.cpp | 2 +- test/cpp/lazy/test_ir.cpp | 2 +- test/cpp/lazy/test_ir_util.cpp | 2 +- tools/codegen/api/lazy.py | 45 ++++-- tools/codegen/dest/lazy_ir.py | 143 +++++++++++++------ tools/codegen/dest/lazy_ts_lowering.py | 8 +- tools/codegen/gen_lazy_tensor.py | 22 +-- torch/csrc/lazy/core/config.cpp | 5 + torch/csrc/lazy/core/config.h | 1 + torch/csrc/lazy/core/ir.cpp | 27 +++- torch/csrc/lazy/core/ir.h | 41 +++++- torch/csrc/lazy/core/lazy_graph_executor.cpp | 10 +- torch/csrc/lazy/core/shape.cpp | 8 +- torch/csrc/lazy/core/shape.h | 2 +- torch/csrc/lazy/ts_backend/ts_node.cpp | 18 +-- torch/csrc/lazy/ts_backend/ts_node.h | 2 +- 16 files changed, 243 insertions(+), 95 deletions(-) diff --git a/test/cpp/lazy/test_cache.cpp b/test/cpp/lazy/test_cache.cpp index 033b6c21b1e..a02e7eb251a 100644 --- a/test/cpp/lazy/test_cache.cpp +++ b/test/cpp/lazy/test_cache.cpp @@ -11,7 +11,7 @@ namespace lazy { class CacheNode : public Node { public: explicit CacheNode(const std::string& str) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(str)), + : Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool bakeInSizes) -> hash_t { return Hash(str); }), str_(str) {} ~CacheNode() override = default; diff --git a/test/cpp/lazy/test_ir.cpp b/test/cpp/lazy/test_ir.cpp index 78b94618c7f..811ab8b50fe 100644 --- a/test/cpp/lazy/test_ir.cpp +++ b/test/cpp/lazy/test_ir.cpp @@ -12,7 +12,7 @@ namespace lazy { class TestLeafNode : public Node { public: explicit TestLeafNode(size_t param) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(param)), + : Node(OpKind(), /* num_outputs */ 1, /* hash_func */[&](bool bakeInSizes) -> hash_t { return Hash(param); }), param_(param) {} ~TestLeafNode() override = default; diff --git a/test/cpp/lazy/test_ir_util.cpp b/test/cpp/lazy/test_ir_util.cpp index 5c216258f9a..7f5202c01d7 100644 --- a/test/cpp/lazy/test_ir_util.cpp +++ b/test/cpp/lazy/test_ir_util.cpp @@ -12,7 +12,7 @@ namespace lazy { class IrUtilNode : public Node { public: explicit IrUtilNode() - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(0)) {} + : Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool bakeInSizes) -> hash_t { return Hash(0); }) {} ~IrUtilNode() override = default; void AddOperand(Value v) { diff --git a/tools/codegen/api/lazy.py b/tools/codegen/api/lazy.py index 3fe83936eef..9bcec4c8f81 100644 --- a/tools/codegen/api/lazy.py +++ b/tools/codegen/api/lazy.py @@ -1,10 +1,10 @@ from typing import List, Union, Tuple from tools.codegen.model import (Type, BaseTy, BaseType, OptionalType, ListType, OperatorName, FunctionSchema, - Return) + Return, TensorOptionsArguments) from tools.codegen.api.types import (BaseCppType, BaseCType, OptionalCType, ConstRefCType, NamedCType, - MutRefCType, + MutRefCType, deviceT, layoutT, VectorCType, boolT, longT, doubleT, ListCType, stringT, scalarT, scalarTypeT, ArrayRefCType, ArrayCType, TupleCType) @@ -33,7 +33,9 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L if typ.name == 
BaseTy.Tensor: return BaseCType(valueT) elif typ.name == BaseTy.Scalar: - return BaseCType(scalarT) + # at::scalar has special handling, + # and is wrapped in an IR value just like at::tensor + return BaseCType(valueT) elif typ.name == BaseTy.ScalarType: return BaseCType(scalarTypeT) elif typ.name == BaseTy.int: @@ -44,6 +46,10 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L return BaseCType(doubleT) elif typ.name == BaseTy.str: return BaseCType(stringT) + elif typ.name == BaseTy.Device: + return BaseCType(deviceT) + elif typ.name == BaseTy.Layout: + return BaseCType(layoutT) else: raise AssertionError(f"TODO add support for type {repr(typ)}") elif isinstance(typ, OptionalType): @@ -65,12 +71,30 @@ def isValueType(typ: Union[Type, BaseCType, OptionalCType, ConstRefCType, MutRef being Tensor-like, but assumes the type has already been transformed. """ if isinstance(typ, BaseCType): - return typ.type == valueT + # I am regretting my naming conventions, but now we are wrapping at::scalar in + # lazy value, while preserving other 'scalar' types as scalars in the IR + return typ.type == valueT or typ.type == scalarT elif isinstance(typ, (OptionalCType, ListCType, VectorCType)): return isValueType(typ.elem) else: return False +def isWrappedScalarType(typ: Type) -> bool: + """ + Given a type, determine if it is a c10::scalar which we will wrap in a lazy Value. + Since we literally change the type from scalarT to valueT, information is lost. + This function helps build a list of wrapped scalars to save that information + """ + if isinstance(typ, BaseType): + # I am regretting my naming conventions, but now we are wrapping at::scalar in + # lazy value, while preserving other 'scalar' types as scalars in the IR + return typ.name == BaseTy.Scalar + elif isinstance(typ, (OptionalType, ListType)): + return isWrappedScalarType(typ.elem) + else: + return False + + # Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node. # Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML), # but carries type information from a native FunctionSchema modified for use with IR nodes, @@ -87,6 +111,8 @@ class LazyIrSchema: # TODO: Need to handle collisions with argument names at some point returns: Tuple['Return', ...] 
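Because promoting an at::Scalar argument to an IR value literally rewrites its type from scalarT to valueT, the schema has to remember separately which arguments started out as scalars; that is what the wrapped-scalar bookkeeping above is for. Below is a small, self-contained Python sketch of the idea; the WrappedSchema class and the type strings are simplified stand-ins, not the real tools.codegen model classes.

    from dataclasses import dataclass, field
    from typing import List, Tuple

    @dataclass
    class WrappedSchema:
        # (name, type) pairs, e.g. [("self", "Tensor"), ("alpha", "Scalar")]
        args: List[Tuple[str, str]]
        wrapped_scalar_names: List[str] = field(default_factory=list)

        def __post_init__(self):
            # Record scalar args before rewriting them, so the information
            # survives once every Scalar is treated as a lazy Value.
            self.wrapped_scalar_names = [n for n, t in self.args if t == "Scalar"]
            self.args = [(n, "Value" if t in ("Tensor", "Scalar") else t)
                         for n, t in self.args]

    s = WrappedSchema([("self", "Tensor"), ("alpha", "Scalar"), ("dim", "int")])
    print(s.args)                  # [('self', 'Value'), ('alpha', 'Value'), ('dim', 'int')]
    print(s.wrapped_scalar_names)  # ['alpha']
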
+ wrapped_scalar_names: List[str] + def __init__(self, func: FunctionSchema): positional_arg_types = [] @@ -108,14 +134,15 @@ class LazyIrSchema: "tensor_options", "post_tensor_options_kwarg_only", "out"]: - if getattr(func.arguments, arg_field) is not None: - keyword_arg_types.extend([ - NamedCType( - arg.name, - process_ir_type(arg.type)) for arg in getattr(func.arguments, arg_field)]) + curr_args = getattr(func.arguments, arg_field) + if curr_args is not None: + if isinstance(curr_args, TensorOptionsArguments): + curr_args = curr_args.all() + keyword_arg_types.extend([NamedCType(arg.name, process_ir_type(arg.type)) for arg in curr_args]) self.keyword_arg_types = tuple(keyword_arg_types) self.name = func.name self.returns = func.returns + self.wrapped_scalar_names = [arg.name for arg in func.schema_order_arguments() if isWrappedScalarType(arg.type)] @property def node_name(self) -> str: diff --git a/tools/codegen/dest/lazy_ir.py b/tools/codegen/dest/lazy_ir.py index d41b4edcd8a..f1145d7fcfe 100644 --- a/tools/codegen/dest/lazy_ir.py +++ b/tools/codegen/dest/lazy_ir.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from typing import List, Union from dataclasses import dataclass from tools.codegen.context import method_with_native_function @@ -9,17 +10,23 @@ import tools.codegen.api.dispatcher as dispatcher from tools.codegen.api.lazy import LazyIrSchema, isValueType from tools.codegen.dest.lazy_ts_lowering import ts_lowering_body - -def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: +def node_ctor_arg_rvalue_string(arg: NamedCType, schema: LazyIrSchema) -> str: """ Given a NamedCType from a lazy IR schema, generate a c++ string for materializing an rvalue of that arg for passing into a lazy Node constructor. """ + if isValueType(arg.type): if isinstance(arg.type, BaseCType): + if arg.name in schema.wrapped_scalar_names: + return f"torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen({arg.name})" return f"lazy_{arg.name}.GetIrValue()" elif isinstance(arg.type, OptionalCType): + if arg.name in schema.wrapped_scalar_names: + return f"{arg.name} ? " \ + f"c10::make_optional(torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen(*{arg.name})) : " \ + "c10::nullopt" return f"lazy_{arg.name} ? " \ f"c10::make_optional(lazy_{arg.name}.GetIrValue()) : " \ "c10::nullopt" @@ -35,24 +42,55 @@ def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: else: return f"{arg.name}" -def node_ctor_inputs(func: LazyIrSchema) -> str: +def node_ctor_inputs(schema: LazyIrSchema) -> str: """ Produce a formatted string with the arguments as passed into the constructor of a node class. 
""" - node_ctor_values = [node_ctor_arg_rvalue_string(arg) for arg in func.filtered_types()] + node_ctor_values = [node_ctor_arg_rvalue_string(arg, schema) for arg in schema.filtered_types()] return ",\n ".join(node_ctor_values) +def gen_fallback_code(schema: LazyIrSchema, overload_name: str) -> str: + """ + Generate code that falls back to eager conditioned on a predicate + """ + fallback_args = ",\n ".join([arg.name for arg in schema.filtered_types()]) + if len(overload_name): + aten_op_str = f"ATEN_OP2({schema.aten_name}, {overload_name})" + else: + aten_op_str = f"ATEN_OP({schema.aten_name})" + return f""" + if (force_eager_fallback({aten_symbol(schema)})) {{ + return at::native::call_fallback_fn<<c_eager_fallback, {aten_op_str}>::call( + {fallback_args} + ); + }} +""" + +def aten_symbol(schema: LazyIrSchema) -> str: + missing_interned_strings = { + 'sigmoid_backward', + } + if schema.aten_name in missing_interned_strings: + return f'c10::Symbol::fromQualString("aten::{schema.aten_name}")' + return f'at::aten::{schema.aten_name}' @dataclass(frozen=True) -class LazyIR: +class LazyIR(ABC): backend_index: BackendIndex node_base: str + lowering_function_type: str + lowering_context_type: str + lowering_return_type: str @method_with_native_function def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func return self.gen(f) + @abstractmethod + def lowering_body(self, f): + pass + def gen(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: # for now, we just want one IR class decl and soon after also the method defs # and we use the functional version not out/inplace. @@ -63,9 +101,9 @@ class LazyIR: scalar_types = schema.filtered_types(values=False, scalars=True) node_ctor_args = ", ".join([f"const {i.cpp_type()}& {i.name}" for i in all_types]) - scalar_initializers = ",\n ".join([f"{t.name}_({t.name})" for t in scalar_types]) + scalar_initializers = ",\n ".join([f"{t.name}({t.name})" for t in scalar_types]) comma_if_scalar_initializers = ",\n" if len(scalar_initializers) else "" - scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name}_;" for t in scalar_types]) + scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name};" for t in scalar_types]) scalar_hashes = ", ".join([f"{f.name}" for f in scalar_types]) base_ctor_value_args_list = [] optional_values = [] @@ -83,21 +121,20 @@ class LazyIR: members_to_string = [] for t in scalar_types: if isinstance(t.type, OptionalCType): - members_to_string.append(f"""if ({t.name}_.has_value()) {{ - ss << ", {t.name}=" << {t.name}_.value(); + members_to_string.append(f"""if ({t.name}.has_value()) {{ + ss << ", {t.name}=" << {t.name}.value(); }} else {{ ss << ", {t.name}=null"; }}""") else: - members_to_string.append(f'ss << ", {t.name}=" << {t.name}_;') + members_to_string.append(f'ss << ", {t.name}=" << {t.name};') members_to_string_str = "\n ".join(members_to_string) return [f"""\ -// TODO(alanwaketan): Public members don't need to have _ suffix. 
class {schema.node_name} : public {self.node_base} {{ public: {schema.node_name}({node_ctor_args}, std::vector&& shapes) - : {self.node_base}(torch::lazy::OpKind(at::aten::{schema.aten_name}), + : {self.node_base}(torch::lazy::OpKind({aten_symbol(schema)}), {{{base_ctor_value_args}}}, std::move(shapes), /* num_outputs */ {len(func.returns)}, torch::lazy::MHash({scalar_hashes})){comma_if_scalar_initializers} @@ -109,14 +146,14 @@ class {schema.node_name} : public {self.node_base} {{ std::string ToString() const override {{ std::stringstream ss; - ss << TsNode::ToString(); + ss << {self.node_base}::ToString(); {members_to_string_str} return ss.str(); }} - torch::lazy::TSOpVector Lower(std::shared_ptr function, - torch::lazy::TSLoweringContext* loctx) const override {{ - {ts_lowering_body(f)} + {self.lowering_return_type} Lower({self.lowering_function_type} function, + {self.lowering_context_type} loctx) const override {{ + {self.lowering_body(f)} }} {scalar_decls} @@ -127,21 +164,34 @@ class {schema.node_name} : public {self.node_base} {{ """, ] -def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str) -> str: +@dataclass(frozen=True) +class TSLazyIR(LazyIR): + lowering_function_type: str = "std::shared_ptr" + lowering_context_type: str = "torch::lazy::TSLoweringContext*" + lowering_return_type: str = "torch::lazy::TSOpVector" + + def lowering_body(self, f): + return ts_lowering_body(f) + + +def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str, schema: LazyIrSchema) -> str: lazy_tensor_decls: List[str] = [] for t in value_types: + if t.name in schema.wrapped_scalar_names: + # no lazy tensor wrapper for scalars that are promoted to IR values + continue if isinstance(t.type, BaseCType): lazy_tensor_decls.append( f"{tensor_class} lazy_{t.name} = " - f"GetLtcTensorOrCreateForWrappedNumber({t.name}, *device);") + f"torch::lazy::GetLtcTensorOrCreateForWrappedNumber({t.name}, *common_device);") elif isinstance(t.type, OptionalCType): # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it # until we encounter a real world example. 
lazy_tensor_decls.append( - f" {tensor_class} lazy_{t.name} = TryGetLtcTensor({t.name}.value_or(at::Tensor()));") + f" {tensor_class} lazy_{t.name} = torch::lazy::TryGetLtcTensor({t.name}.value_or(at::Tensor()));") else: raise AssertionError("TODO not sure if there are other valid types to handle here") - return "\n ".join(lazy_tensor_decls) + return ("\n ").join(lazy_tensor_decls) @dataclass(frozen=True) class GenLazyNativeFuncDefinition: @@ -152,7 +202,7 @@ class GenLazyNativeFuncDefinition: @method_with_native_function def __call__(self, func: NativeFunction) -> List[str]: sig = kernel_signature(func, self.backend_index) - + metadata = self.backend_index.get_kernel(func) # Lazy IR stuff schema = LazyIrSchema(func.func) all_types = schema.filtered_types() @@ -160,9 +210,14 @@ class GenLazyNativeFuncDefinition: scalar_types = schema.filtered_types(values=False, scalars=True) returns_length = len(schema.returns) - value_types_names = ", ".join([f"{t.name}" for t in value_types]) - get_device_str = f"""auto device = bridge::GetBackendDevice({value_types_names});""" - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) + fallback_str = gen_fallback_code(schema, overload_name=func.func.name.overload_name) + value_types_names = [f"{t.name}" for t in value_types if t.name not in schema.wrapped_scalar_names] + assert len(value_types_names) > 0, "Code below assumes there is at least one tensor arg" + get_device_str = f"""auto common_device = torch::lazy::GetBackendDevice({', '.join(value_types_names)}); + TORCH_INTERNAL_ASSERT(common_device); + """ + + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) node_ctor_input_str = node_ctor_inputs(schema) # call the meta kernel if it exists, to compute output shape/dtype for our IR @@ -174,37 +229,40 @@ class GenLazyNativeFuncDefinition: shapes_str = ','.join([this_shape(i) for i in range(returns_length)]) meta_out = "std::vector shapes{" + shapes_str + "};" + # TODO: INTEGRATION POINT HERE: meta_str = f"""auto out_meta = at::meta::{schema.aten_name}({', '.join(str(t.name) for t in all_types)}); {meta_out}""" else: - shape_sig = ComputeShapeSignature(func) + shape_sig = ComputeShapeSignature(metadata.kernel, func) meta_str = f""" auto shapes = {shape_sig.shape_call};""" + meta_str += f""" TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});""" node_str = f"""auto node = torch::lazy::MakeNode({node_ctor_input_str}, std::move(shapes));""" + first_tensor_name = value_types_names[0] + bridge_str = """auto result = torch::lazy::CreateAtenFromLtcTensor( + torch::lazy::LazyTensor::Create(std::move(node), *common_device));""" - assert len(value_types) > 0, f"Only supporting tensor ops so far, none found in {sig}" - first_tensor = value_types[0] - bridge_str = f"""auto result = CreateAtenFromLtcTensor(lazy_{first_tensor.name}.CreateFrom(node));""" if returns_length > 1: bridge_str = f"""std::vector<{self.tensor_class}> lazy_tensors; for (int i = 0; i < {returns_length}; i++) {{ - lazy_tensors.push_back(lazy_{first_tensor.name}.CreateFrom(torch::lazy::Value(node, i))); + lazy_tensors.push_back(torch::lazy::LazyTensor::Create(torch::lazy::Value(node, i), *common_device)); }} - auto result = TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" - if schema.name.name.inplace: + auto result = torch::lazy::TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" + + if schema.name.name.inplace or func.func.is_out_fn(): assert returns_length == 1, "We assumed there was no such case where an op is an 
in-place variant " \ "and has tuple outputs." - bridge_str = f"""lazy_{first_tensor.name}.SetInPlaceIrValue(node); - auto& result = {first_tensor.name};""" + bridge_str = f"""lazy_{first_tensor_name}.SetInPlaceIrValue(node); + auto& result = {first_tensor_name};""" return [f"""\ - // TODO(alanwaketan): Quite a lot inefficient copy-by-value there. Let's optimize it. - {sig.decl(name=f"{self.class_method_name}::{schema.aten_name}")} {{ + {sig.decl(name=f"{self.class_method_name}::{metadata.kernel}")} {{ + {fallback_str} TORCH_LAZY_FN_COUNTER("lazy::"); {get_device_str} {lazy_tensor_decls_str} @@ -219,17 +277,17 @@ class ComputeShapeSignature: """ Here we use the base name as the suffix of the signature to avoid generating for in-place variants. """ - @method_with_native_function - def __init__(self, f: NativeFunction): + def __init__(self, kernel_name: str, f: NativeFunction): self.__schema = LazyIrSchema(f.func) self.__dispatch_args = ', '.join([a.decl() for a in dispatcher.arguments(f.func)]) self.__call_args = ", ".join([f"{t.name}" for t in self.__schema.filtered_types()]) + self.__kernel_name = kernel_name def __decl_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__dispatch_args})" + return f"{self.__kernel_name}({self.__dispatch_args})" def __call_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__call_args})" + return f"{self.__kernel_name}({self.__call_args})" @property def shape_decl(self) -> str: @@ -246,19 +304,20 @@ class GenLazyShapeInferenceDefinition: tensor_class: str @method_with_native_function + # def gen_lazy_shape_inference_decl(f: NativeFunction, backend_index: BackendIndex, tensor_class: str) -> List[str]: def __call__(self, f: NativeFunction) -> List[str]: sig = kernel_signature(f, self.backend_index) - + metadata = self.backend_index.get_kernel(f) # Lazy IR stuff schema = LazyIrSchema(f.func) value_types = schema.filtered_types(values=True, scalars=False) - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) node_ctor_input_str = node_ctor_inputs(schema) # Only generate shape/dtype fn for non-structured kernels, # since we just use the meta function for structured kernels if not f.structured and f.structured_delegate is None: - shape_sig = ComputeShapeSignature(f) + shape_sig = ComputeShapeSignature(metadata.kernel, f) return ["\n".join([f"{shape_sig.shape_decl};"])] else: return [] diff --git a/tools/codegen/dest/lazy_ts_lowering.py b/tools/codegen/dest/lazy_ts_lowering.py index 32d505cda7b..3f7701d5587 100644 --- a/tools/codegen/dest/lazy_ts_lowering.py +++ b/tools/codegen/dest/lazy_ts_lowering.py @@ -18,13 +18,12 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: continue emplace_arguments.append('loctx->GetOutputOp(operand(i++))') continue - emplace_arguments.append(f'"{value.name}", {value.name}_') + emplace_arguments.append(f'"{value.name}", {value.name}') emplace_arguments_str = "\n ".join( [f"arguments.emplace_back({a});" for a in emplace_arguments]) - emplace_kwarg_values = [f'loctx->GetOutputOp(operand({i}))' for i in range(len(schema.keyword_values))] - emplace_kwarg_scalars = [f'"{t.name}", {t.name}_' for t in schema.keyword_scalars] - assert len(schema.keyword_values) == 0, "TODO the logic for operand(i) is broken if there are kw values" + emplace_kwarg_values = [f'"{t.name}", loctx->GetOutputOp(operand(i++))' for t in schema.keyword_values] + emplace_kwarg_scalars = [f'"{t.name}", 
{t.name}' for t in schema.keyword_scalars] emplace_kwarguments = "\n ".join( [f"kwarguments.emplace_back({a});" for a in emplace_kwarg_values + emplace_kwarg_scalars]) return f"""\ @@ -38,6 +37,5 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments); CHECK_EQ({schema.aten_name}_out.size(), {len(func.returns)}); - // TODO: need to call GenerateClone sometimes? Or else return LowerBuiltIn() directly return {schema.aten_name}_out; """ diff --git a/tools/codegen/gen_lazy_tensor.py b/tools/codegen/gen_lazy_tensor.py index b2515d3d083..9705620fa2e 100644 --- a/tools/codegen/gen_lazy_tensor.py +++ b/tools/codegen/gen_lazy_tensor.py @@ -3,7 +3,8 @@ import argparse import os import yaml from collections import namedtuple -from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple +from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple, Type +from tools.codegen.dest.lazy_ir import LazyIR, TSLazyIR from tools.codegen.gen import get_grouped_native_functions, parse_native_yaml from tools.codegen.model import (FunctionSchema, NativeFunction, NativeFunctionsGroup, OperatorName) @@ -60,20 +61,20 @@ def main() -> None: parser.add_argument( '--node_base_hdr', type=str, default=None, help='Path to header file defining custom Lazy IR Node base class') parser.add_argument( - '--tensor_class', type=str, default="LazyTensor", help='Name of backend specific custom Lazy Tensor class') + '--tensor_class', type=str, default="torch::lazy::LazyTensor", help='Name of backend specific custom Lazy Tensor class') parser.add_argument( - '--tensor_class_hdr', type=str, default="lazy_tensor_core/csrc/tensor.h", + '--tensor_class_hdr', type=str, default="torch/csrc/lazy/core/tensor.h", help='Path to header file defining custom Lazy Tensor class') options = parser.parse_args() run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path, options.gen_ts_lowerings, options.node_base, options.node_base_hdr, - options.tensor_class, options.tensor_class_hdr) + options.tensor_class, options.tensor_class_hdr, TSLazyIR) def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[str], gen_ts_lowerings: bool, node_base: str, node_base_hdr: Optional[str], - tensor_class: str, tensor_class_hdr: str) -> None: + tensor_class: str, tensor_class_hdr: str, lazy_ir_cls: Type[LazyIR]) -> None: # Assumes that this file lives at PYTORCH_ROOT/tools/codegen/gen_backend_stubs.py pytorch_root = pathlib.Path(__file__).parent.parent.parent.absolute() @@ -160,11 +161,13 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st fm.write_with_template(f'{backend_key}NativeFunctions.cpp', 'DispatchKeyNativeFunctions.cpp', lambda: { 'includes': [f'#include <{path}>' for path in [ tensor_class_hdr, + "ATen/Functions.h", "ATen/MetaFunctions.h", + "ATen/Operators.h", + "torch/csrc/lazy/core/lazy_graph_executor.h", "torch/csrc/lazy/core/metrics.h", "torch/csrc/lazy/core/shape.h", - "lazy_tensor_core/csrc/aten_ltc_bridge.h", - "lazy_tensor_core/csrc/lazy_graph_executor.h", + "lazy_tensor_core/csrc/ts_backend/aten_eager_fallback.h", f"{output_dir}/{backend_key}NativeFunctions.h", f"{output_dir}/{backend_key}LazyIr.h", f"{output_dir}/{backend_key}ShapeInference.h", @@ -196,7 +199,8 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st 'func_declarations': 
list(concat_map_codegen( dest.GenLazyShapeInferenceDefinition(backend_indices[backend_key], tensor_class), - grouped_native_functions + grouped_native_functions, + codegenInplaceVariant=True, )), }) # Generate IR node classes @@ -217,7 +221,7 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st 'DispatchKey': backend_key, 'dispatch_namespace': backend_key.lower(), 'ir_declarations': list(concat_map_codegen( - dest.LazyIR(backend_indices[backend_key], node_base), + lazy_ir_cls(backend_indices[backend_key], node_base), grouped_native_functions )), }) diff --git a/torch/csrc/lazy/core/config.cpp b/torch/csrc/lazy/core/config.cpp index af86dd926d6..b47054913e1 100644 --- a/torch/csrc/lazy/core/config.cpp +++ b/torch/csrc/lazy/core/config.cpp @@ -7,6 +7,11 @@ C10_DEFINE_bool( false, "Enable parameter aliasing support"); +C10_DEFINE_bool( + torch_lazy_use_thread_pool, + false, + "Use thread pool to schedule backend execution"); + C10_DEFINE_int( torch_lazy_compilation_cache_size, 1024, diff --git a/torch/csrc/lazy/core/config.h b/torch/csrc/lazy/core/config.h index beee5b4b214..fa6630123cd 100644 --- a/torch/csrc/lazy/core/config.h +++ b/torch/csrc/lazy/core/config.h @@ -3,6 +3,7 @@ C10_DECLARE_bool(torch_lazy_ir_debug); C10_DECLARE_bool(torch_lazy_param_aliasing); +C10_DECLARE_bool(torch_lazy_use_thread_pool); C10_DECLARE_int(torch_lazy_compilation_cache_size); C10_DECLARE_int(torch_lazy_device_data_cache_size); diff --git a/torch/csrc/lazy/core/ir.cpp b/torch/csrc/lazy/core/ir.cpp index 63e6ee8744c..a1726aacba6 100644 --- a/torch/csrc/lazy/core/ir.cpp +++ b/torch/csrc/lazy/core/ir.cpp @@ -1,6 +1,8 @@ #include #include +C10_DEFINE_bool(ltc_enable_dynamic_shapes, false, "Whether dynamic shape is enabled"); + namespace torch { namespace lazy { @@ -23,6 +25,14 @@ hash_t Value::hash() const { return HashCombine(node->hash(), Hash(index)); } +hash_t Value::hash_with_sizes() const { + return HashCombine(node->hash_with_sizes(), Hash(index)); +} + +hash_t Value::hash_without_sizes() const { + return HashCombine(node->hash_without_sizes(), Hash(index)); +} + OpKind OpKind::Get(const std::string& name) { return OpKind(c10::Symbol::fromQualString(name)); } @@ -31,18 +41,25 @@ hash_t OpKind::hash() const { return StringHash(op.toQualString()); } -Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash) +bool Node::enableDynamicShape() { + static bool enabled = std::getenv("LTC_ENABLE_DYNAMIC_SHAPES") != nullptr; + return enabled || FLAGS_ltc_enable_dynamic_shapes; +} + +Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn) : op_(op), num_outputs_(num_outputs), node_hash_(node_hash), - dag_hash_(dag_hash), + dag_hash_without_sizes_(dag_hash_fn(false)), + dag_hash_with_sizes_(dag_hash_fn(true)), metadata_(GetMetaDataIfDebugging()) {} -Node::Node(OpKind op, size_t num_outputs, hash_t node_hash) +Node::Node(OpKind op, size_t num_outputs, std::function node_hash_fn) : op_(op), num_outputs_(num_outputs), - node_hash_(node_hash), - dag_hash_(node_hash), + node_hash_(node_hash_fn(!enableDynamicShape())), + dag_hash_without_sizes_(node_hash_fn(false)), + dag_hash_with_sizes_(node_hash_fn(true)), metadata_(GetMetaDataIfDebugging()) {} Node::~Node() = default; diff --git a/torch/csrc/lazy/core/ir.h b/torch/csrc/lazy/core/ir.h index 6ca1df8d2fb..4132400bb65 100644 --- a/torch/csrc/lazy/core/ir.h +++ b/torch/csrc/lazy/core/ir.h @@ -15,6 +15,9 @@ #include #include #include +#include + +C10_DECLARE_bool(ltc_enable_dynamic_shapes); 
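The ltc_enable_dynamic_shapes flag declared just above is what drives the dual-hash scheme in this patch: every node carries one DAG hash that bakes concrete sizes in and one that hashes only the rank, so the shape cache can stay size-specific while the compiled-graph cache can ignore sizes when dynamic shapes are enabled. A toy Python sketch of that split follows; the names are illustrative, not the torch::lazy API.

    def shape_hash(scalar_type, sizes, bake_in_sizes):
        # With sizes baked in, same-rank shapes with different sizes hash
        # differently; without, only the dtype and the rank contribute.
        payload = tuple(sizes) if bake_in_sizes else len(sizes)
        return hash((scalar_type, payload))

    class ToyNode:
        def __init__(self, op, scalar_type, sizes):
            self.hash_with_sizes = hash((op, shape_hash(scalar_type, sizes, True)))
            self.hash_without_sizes = hash((op, shape_hash(scalar_type, sizes, False)))

    def cache_keys(node, dynamic_shapes_enabled):
        shape_cache_key = node.hash_with_sizes            # always size-specific
        compile_cache_key = (node.hash_without_sizes if dynamic_shapes_enabled
                             else node.hash_with_sizes)
        return shape_cache_key, compile_cache_key

    n = ToyNode("aten::relu", "float", (2, 3))
    print(cache_keys(n, dynamic_shapes_enabled=True))
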
namespace torch { namespace lazy { @@ -65,9 +68,12 @@ using OutputMap = std::unordered_map; // Represents an input/operand for a Node object. struct TORCH_API Value { Value() = default; - /* implicit */ Value(NodePtr node, size_t index = 0) : node(std::move(node)), index(index) {} + /* implicit */ Value(NodePtr&& node, size_t index = 0) : node(std::move(node)), index(index) {} + /* implicit */ Value(const NodePtr& node, size_t index = 0) : node(node), index(index) {} hash_t hash() const; + hash_t hash_with_sizes() const; + hash_t hash_without_sizes() const; operator bool() const { return node != nullptr; @@ -121,7 +127,6 @@ inline std::ostream& operator<<(std::ostream& stream, const OpKind& op) { using OpList = c10::ArrayRef; - // A node in the graph. Nodes for operations which requires extra data to be // stored for lowering, should inherit from this class and add operation // specific member there. For example, a constant might create a new @@ -130,13 +135,18 @@ using OpList = c10::ArrayRef; // client data handle in it. class TORCH_API Node { public: + static bool enableDynamicShape(); + // Creates a new node with the given op name. The op is a unique identifier // for the operation. The num_outputs tells how many outputs a given operation // generates. - Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash); + // + // None leaf node's node_hash does not contains shape information always. + // So we pass in the hash value rather than a function. + Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn); // Contructor used to create leaf nodes. - Node(OpKind op, size_t num_outputs, hash_t node_hash); + Node(OpKind op, size_t num_outputs, std::function node_hash_fn); virtual ~Node(); @@ -157,7 +167,15 @@ class TORCH_API Node { } hash_t hash() const { - return dag_hash_; + return enableDynamicShape() ? dag_hash_without_sizes_ : dag_hash_with_sizes_; + } + + hash_t hash_without_sizes() const { + return dag_hash_without_sizes_; + } + + hash_t hash_with_sizes() const { + return dag_hash_with_sizes_; } const MetaData& metadata() const { @@ -183,8 +201,17 @@ class TORCH_API Node { // The hash value of this node. hash_t node_hash_; - // The hash value of the graph rooted at this node. - hash_t dag_hash_; + // dag_hash represents the hash value of the graph rooted at this node. There are 2 variants, one + // with sizes info and one without. We need 2 such hashes to support dynamic + // shape. Here are the logic to pick the hash in the 2 major scenarios that a hash is needed: + // - shape cache: in this case, we always use the dag hash with size info. This way, looking up the + // shape for one node does not get the shape for another node with the same rank but different sizes + // - lookup the compiled graph by a hash: in this case, we will use the dag hash + // WITHOUT size info if dynamic shape is enabled and use the dag hash WITH size info otherwise. + // The different requirement for the hash in these 2 scenarios forces us to maintain 2 + // different hashes. + hash_t dag_hash_without_sizes_; + hash_t dag_hash_with_sizes_; // The IR specific metadata attached to the IR node. 
MetaData metadata_; // The IR framework user can attach a user defined metadata object deriving diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index 9f504c935e9..3599abb7b8d 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -462,7 +462,7 @@ void LazyGraphExecutor::SyncTensorsGraph( config.sync_ltc_data = sync_ltc_data; auto async = SyncTensorsGraphInternal(tensors, devices, config); - if (wait && async != nullptr) { + if (FLAGS_torch_lazy_use_thread_pool && wait && async != nullptr) { async->mwait.Wait(); } } @@ -972,7 +972,11 @@ std::shared_ptr LazyGraphExecutor:: } }; - ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); + if (FLAGS_torch_lazy_use_thread_pool) { + ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); + } else { + syncfn(); + } return async; } @@ -995,7 +999,7 @@ std::vector LazyGraphExecutor::GetTensorsFused( SyncTensorsConfig config; config.force_ltc_data = false; auto async = SyncTensorsGraphInternal(tensors, {}, config); - if (async != nullptr) { + if (FLAGS_torch_lazy_use_thread_pool && async != nullptr) { async->mwait.Wait(); } std::vector tensors_data = GatherTensorsData( diff --git a/torch/csrc/lazy/core/shape.cpp b/torch/csrc/lazy/core/shape.cpp index 2b7fd2c74b8..bd5ea5b75c9 100644 --- a/torch/csrc/lazy/core/shape.cpp +++ b/torch/csrc/lazy/core/shape.cpp @@ -28,8 +28,12 @@ size_t Shape::numel() const { return elts; } -hash_t Shape::hash() const { - return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); +hash_t Shape::hash(bool bakeInSizes) const { + if (bakeInSizes) { + return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); + } else { + return HashCombine(Hash(scalar_type_), Hash(sizes_.size())); + } } } // namespace lazy diff --git a/torch/csrc/lazy/core/shape.h b/torch/csrc/lazy/core/shape.h index c67ff908833..9b34b90fec0 100644 --- a/torch/csrc/lazy/core/shape.h +++ b/torch/csrc/lazy/core/shape.h @@ -25,7 +25,7 @@ class TORCH_API Shape { int64_t size(int64_t dim) const { return sizes_.at(dim); } void set_size(int64_t dim, int64_t size) { sizes_.at(dim) = size; } size_t numel() const; - hash_t hash() const; + hash_t hash(bool bakeInSizes) const; bool operator==(const Shape& other) const; diff --git a/torch/csrc/lazy/ts_backend/ts_node.cpp b/torch/csrc/lazy/ts_backend/ts_node.cpp index d79dd999f81..a7948e5cbec 100644 --- a/torch/csrc/lazy/ts_backend/ts_node.cpp +++ b/torch/csrc/lazy/ts_backend/ts_node.cpp @@ -28,14 +28,15 @@ void TsNodeSetShapeDeferred( throw std::runtime_error("Expected TsNode but could not dynamic cast"); } -hash_t OperandHashes(const OpList& operands, const hash_t& seed) { +hash_t OperandHashes(const OpList& operands, const hash_t& seed, bool bakeInSizes) { hash_t hash = seed; for (auto& operand : operands) { if (!operand) { hash = HashCombine(hash, static_cast(kNullOpt)); continue; } - hash = HashCombine(hash, operand.hash()); + auto operand_hash = bakeInSizes ? operand.hash_with_sizes() : operand.hash_without_sizes(); + hash = HashCombine(hash, operand_hash); } return hash; } @@ -48,7 +49,7 @@ TsNode::TsNode(OpKind op, OpList operands, std::vector&& shapes, // initialization to a separate function? 
/* node_hash */ HashCombine(op.hash(), hash_seed), /* dag_hash */ - OperandHashes(operands, HashCombine(op.hash(), hash_seed))), + [&](bool bakeInSizes) { return OperandHashes(operands, HashCombine(op.hash(), hash_seed), bakeInSizes); }), shapes_(shapes) { for (auto& operand : operands) { // Ideally, optional operands should be filtered by the leaf node classes, @@ -80,7 +81,7 @@ void TsNode::SetShapeDeferred( } TsNode::TsNode(OpKind op, Shape shape, size_t num_outputs, hash_t hash_seed) - : Node(op, num_outputs, GetOpHash(op, shape, hash_seed)) + : Node(op, num_outputs, [&](bool bakeInSizes) -> hash_t { return GetOpHash(op, shape, hash_seed, bakeInSizes); }) { shapes_.push_back(std::move(shape)); } @@ -98,10 +99,11 @@ ShapeCache* GetShapeCache() { Shape TsNode::GetOpShape( const std::function& shape_fn) const { + auto hash = hash_with_sizes(); ShapeCache* shape_cache = GetShapeCache(); - auto shape = shape_cache->Get(hash()); + auto shape = shape_cache->Get(hash); if (shape == nullptr) { - shape = shape_cache->Add(hash(), + shape = shape_cache->Add(hash, std::make_shared(shape_fn())); } return *shape; @@ -120,8 +122,8 @@ std::string TsNode::ToString() const { return ss.str(); } -hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed) { - hash_t h = HashCombine(op.hash(), shape.hash()); +hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes) { + hash_t h = HashCombine(op.hash(), shape.hash(bakeInSizes)); return HashCombine(h, hash_seed); } diff --git a/torch/csrc/lazy/ts_backend/ts_node.h b/torch/csrc/lazy/ts_backend/ts_node.h index a6595a5337d..156444852d9 100644 --- a/torch/csrc/lazy/ts_backend/ts_node.h +++ b/torch/csrc/lazy/ts_backend/ts_node.h @@ -55,7 +55,7 @@ class TORCH_API TsNode : public lazy::Node { std::string ToString() const override; - static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed); + static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes); const std::vector& operands() const override { return operands_as_outputs_; From dadbf43effe448a6dbb3b4af0f671ae30ceae999 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 16 Feb 2022 10:17:44 -0800 Subject: [PATCH 101/199] Fix asserts in tests (#72864) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72864 Fixes #72860 Test Plan: Imported from OSS Reviewed By: rohan-varma Differential Revision: D34246987 Pulled By: H-Huang fbshipit-source-id: 1ba47585533aff4cff9beec49bdc801f8320ffc8 (cherry picked from commit 03e45ceb890d72216950a9c3d5cd648b02e6a557) --- caffe2/python/core_test.py | 2 +- test/jit/test_with.py | 2 +- test/quantization/fx/test_quantize_fx.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index 0543233b7c4..2f143fbae07 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -308,7 +308,7 @@ class TestCreateOperator(test_util.TestCase): self.assertTrue(op.HasField('device_option')) self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) self.assertEqual(op.device_option.device_id, 1) - self.assertTrue(len(op.arg), 3) + self.assertEqual(len(op.arg), 3) # can't guarantee ordering of kwargs, so generate a set of args # to test with diff --git a/test/jit/test_with.py b/test/jit/test_with.py index b56324093ce..bd09a36c686 100644 --- a/test/jit/test_with.py +++ b/test/jit/test_with.py @@ -621,7 +621,7 @@ class TestWith(JitTestCase): function_events = p.function_events # Event 
with name "foo" should be recorded. rf_events = [evt for evt in function_events if evt.name == "foo"] - self.assertTrue(len(rf_events), 1) + self.assertEqual(len(rf_events), 1) rf_event = rf_events[0] child_events = rf_event.cpu_children # Ensure we find nested record_function event diff --git a/test/quantization/fx/test_quantize_fx.py b/test/quantization/fx/test_quantize_fx.py index 8bc2f6501d9..da8a974aded 100644 --- a/test/quantization/fx/test_quantize_fx.py +++ b/test/quantization/fx/test_quantize_fx.py @@ -1770,7 +1770,7 @@ class TestQuantizeFx(QuantizationTestCase): def assertAttrPreserved(m): self.assertTrue(hasattr(m, "preserved_attr")) - self.assertTrue(m.preserved_attr, 3) + self.assertEqual(m.preserved_attr, 3) assertAttrPreserved(m) convert_custom_config_dict = { From 41ad221751e57c2d2ccc82b431f56d6ed62e1741 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 16 Feb 2022 10:18:51 -0800 Subject: [PATCH 102/199] [PyTorch] MHA: fix contiguity assumption in transform_bias_rescale_qkv (#72465) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72465 This code path incorrectly assumed input tensors were contiguous. Now we check that. ghstack-source-id: 149201476 Test Plan: CI Reviewed By: ngimel Differential Revision: D34007665 fbshipit-source-id: c43438f2495e32304ea3f7846e01eceb4a9448f7 (cherry picked from commit 0767b225f23846c1636ac3622f46b0c5ec071d96) --- aten/src/ATen/native/attention.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp index 2ff70ebcc2e..5db27b67d4e 100644 --- a/aten/src/ATen/native/attention.cpp +++ b/aten/src/ATen/native/attention.cpp @@ -113,15 +113,18 @@ std::tuple transform_bias_rescale_qkv( TORCH_CHECK(_3D % 3 == 0); const auto dim_per_head = D / num_head; auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options()); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v.is_contiguous()); - AT_DISPATCH_FLOATING_TYPES_AND2( + const auto qkv_contig = qkv.expect_contiguous(); + const auto qkv_bias_contig = qkv_bias.expect_contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::Half, ScalarType::BFloat16, qkv.scalar_type(), "transform_bias_rescale_qkv", [&] { - scalar_t* qkv_data = qkv.data_ptr(); - scalar_t* qkv_bias_data = qkv_bias.data_ptr(); + scalar_t* qkv_data = qkv_contig->data_ptr(); + scalar_t* qkv_bias_data = qkv_bias_contig->data_ptr(); scalar_t* q_k_v_data = q_k_v.data_ptr(); const scalar_t sqrt_dim_per_head = std::sqrt(static_cast(dim_per_head)); @@ -134,6 +137,7 @@ std::tuple transform_bias_rescale_qkv( }); auto q_k_v_s = at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v_s.size() == 3); return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); } From c32b74cecbf9974a6d3077f72a06d787a8078614 Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Wed, 16 Feb 2022 10:32:06 -0800 Subject: [PATCH 103/199] [nnc][aot_compiler] Memory formats args to aot_compiler (#72873) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72873 Test Plan: Imported from OSS Reviewed By: priyaramani Differential Revision: D34250984 Pulled By: IvanKobzarev fbshipit-source-id: e723ee64b024883eef78853e1b185b7040cafb09 (cherry picked from commit e9908df045acf33aa3cd0aec6784f15421236787) --- binaries/aot_model_compiler.cc | 16 ++++++++-- torch/csrc/jit/mobile/nnc/aot_compiler.cpp | 36 ++++++++++++++++++++-- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git 
a/binaries/aot_model_compiler.cc b/binaries/aot_model_compiler.cc index 2a234810b2c..0f8b8ee1718 100644 --- a/binaries/aot_model_compiler.cc +++ b/binaries/aot_model_compiler.cc @@ -30,6 +30,12 @@ C10_DEFINE_string( "If multiple inputs needed, use semicolon to separate " "the dtype of different tensors." "Supported dtypes: float, int64, uint8"); +C10_DEFINE_string( + input_memory_formats, + "", + "Input memory format." + "If multiple inputs needed, use semicolon to separate." + "Supported values: contiguous, channels_last"); C10_DEFINE_string(method_name, "forward", "The name of the method."); C10_DEFINE_string( output_llvm, @@ -61,6 +67,7 @@ c10::Dict createCompileSpec() { c10::StringType::get(), c10::AnyType::get()); method_spec.insert("sizes", FLAGS_input_dims); method_spec.insert("types", FLAGS_input_types); + method_spec.insert("memory_formats", FLAGS_input_memory_formats); method_spec.insert("asmfile", FLAGS_output_llvm); method_spec.insert("model_name", FLAGS_model_name); method_spec.insert("model_version", FLAGS_model_version); @@ -79,6 +86,7 @@ int main(int argc, char** argv) { " --model_version=" " --input_dims=" " --input_types=" + " --input_memory_formats=" " [--method_name=]" " [--output_llvm=]" " [--output_model=]"); @@ -93,10 +101,14 @@ int main(int argc, char** argv) { CAFFE_ENFORCE(!FLAGS_model_name.empty(), c10::UsageMessage()); CAFFE_ENFORCE(!FLAGS_model_version.empty(), c10::UsageMessage()); CAFFE_ENFORCE(!FLAGS_input_dims.empty(), c10::UsageMessage()); + const auto dims_size = split(';', FLAGS_input_dims).size(); CAFFE_ENFORCE( - split(';', FLAGS_input_dims).size() == - split(';', FLAGS_input_types).size(), + dims_size == split(';', FLAGS_input_types).size(), "Number of input_dims and input_types should be the same"); + const auto mem_formats_size = split(';', FLAGS_input_memory_formats).size(); + CAFFE_ENFORCE( + mem_formats_size == 0 || mem_formats_size == dims_size, + "Number of input_memory_formats should be 0 (default contiguous) or the same as number of input_dims"); if (FLAGS_output_llvm.empty()) { FLAGS_output_llvm = FLAGS_model.substr(0, FLAGS_model.find('.')) + ".compiled.ll"; diff --git a/torch/csrc/jit/mobile/nnc/aot_compiler.cpp b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp index 1d140390a52..60152d861d2 100644 --- a/torch/csrc/jit/mobile/nnc/aot_compiler.cpp +++ b/torch/csrc/jit/mobile/nnc/aot_compiler.cpp @@ -253,6 +253,24 @@ std::vector parseInputTypes( return scalarTypes; } +std::vector parseInputMemoryFormats( + const std::string& input_memory_format_str) { + std::vector memFormatsStr = split(';', input_memory_format_str); + std::vector memFormats; + for (const auto& memFormatStr : memFormatsStr) { + at::MemoryFormat memFormat; + if (memFormatStr == "contiguous") { + memFormat = at::MemoryFormat::Contiguous; + } else if (memFormatStr == "channels_last") { + memFormat = at::MemoryFormat::ChannelsLast; + } else { + CAFFE_THROW("Unsupported memory format: ", memFormatStr); + } + memFormats.push_back(memFormat); + } + return memFormats; +} + std::string getNncKernelId( const std::string& model_name, const std::string& model_version, @@ -309,12 +327,16 @@ std::shared_ptr preprocessGraphPasses( std::vector> generateExampleInputs( const std::vector>& inputShapes, - const std::vector& inputTypes) { + const std::vector& inputTypes, + const std::vector& inputMemoryFormats) { std::vector> example_inputs; example_inputs.reserve(inputShapes.size()); for (int i = 0; i < inputShapes.size(); ++i) { + const auto dtype = at::dtype(inputTypes[i]); + const auto 
memory_format = inputMemoryFormats[i]; example_inputs.emplace_back( - at::rand(inputShapes[i]).to(at::dtype(inputTypes[i]))); + at::rand(inputShapes[i], at::TensorOptions(dtype)) + .contiguous(memory_format)); } return example_inputs; } @@ -343,7 +365,15 @@ c10::IValue preprocess( auto sizes = parseInputShapes(*method_spec.at("sizes").toString()); auto types = parseInputTypes(*method_spec.at("types").toString()); - auto example_inputs = generateExampleInputs(sizes, types); + std::string memory_formats_str = method_spec.contains("memory_formats") + ? (*method_spec.at("memory_formats").toString()).string() + : ""; + auto memory_formats = memory_formats_str.empty() + ? std::vector( + sizes.size(), at::MemoryFormat::Contiguous) + : parseInputMemoryFormats(memory_formats_str); + + auto example_inputs = generateExampleInputs(sizes, types, memory_formats); graph = preprocessGraphPasses(graph, example_inputs); auto kernel_func_name = From c19255840f509d4b22b03319b6c00dc08bebba67 Mon Sep 17 00:00:00 2001 From: Guo Yejun Date: Wed, 16 Feb 2022 11:22:36 -0800 Subject: [PATCH 104/199] codegen: do not generate code for dispatch_namespaced_definitions (#69074) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/69074 Reviewed By: jbschlosser Differential Revision: D32758621 Pulled By: bdhirsh fbshipit-source-id: f8a174fd9d74039003f9713d8dfaae2b4eaa7089 (cherry picked from commit 462e92c82d196ba27228db4e7e1d8195a7f07e08) --- tools/codegen/gen_backend_stubs.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/tools/codegen/gen_backend_stubs.py b/tools/codegen/gen_backend_stubs.py index 7837a41cab6..a98aca30887 100644 --- a/tools/codegen/gen_backend_stubs.py +++ b/tools/codegen/gen_backend_stubs.py @@ -249,16 +249,7 @@ def gen_dispatcher_registrations( 'dispatch_namespace': dispatch_key.lower(), 'dispatch_headers': dest.gen_registration_headers(backend_index, per_operator_headers=False, rocm=False), 'dispatch_helpers': dest.gen_registration_helpers(backend_index), - 'dispatch_namespaced_definitions': list(concatMap( - dest.RegisterDispatchKey( - backend_index, - Target.NAMESPACED_DEFINITION, - selector, - rocm=False, - cpp_namespace=cpp_namespace, - class_method_name=f'{backend_dispatch_key}NativeFunctions'), - grouped_native_functions - )), + 'dispatch_namespaced_definitions': '', 'dispatch_anonymous_definitions': list(concatMap( dest.RegisterDispatchKey( backend_index, From 889f3f48b2ef0fb27cff5a1a474d7c316fd7b5d4 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Wed, 16 Feb 2022 11:23:00 -0800 Subject: [PATCH 105/199] Revert D34178476: Update lazy_ir.py from lazy_tensor_staging Test Plan: revert-hammer Differential Revision: D34178476 (https://github.com/pytorch/pytorch/commit/3842140fd5eb6dcb7eaab5628a79a9093aac5a24) Original commit changeset: 7190b2e0d82b Original Phabricator Diff: D34178476 (https://github.com/pytorch/pytorch/commit/3842140fd5eb6dcb7eaab5628a79a9093aac5a24) fbshipit-source-id: 4c969a355f01244c6f5acc52bc31679f2182aa55 (cherry picked from commit 17082075ddc6bae93981d95eb4512856c3f91bb1) --- test/cpp/lazy/test_cache.cpp | 2 +- test/cpp/lazy/test_ir.cpp | 2 +- test/cpp/lazy/test_ir_util.cpp | 2 +- tools/codegen/api/lazy.py | 45 ++---- tools/codegen/dest/lazy_ir.py | 143 ++++++------------- tools/codegen/dest/lazy_ts_lowering.py | 8 +- tools/codegen/gen_lazy_tensor.py | 22 ++- torch/csrc/lazy/core/config.cpp | 5 - torch/csrc/lazy/core/config.h | 1 - torch/csrc/lazy/core/ir.cpp | 27 +--- torch/csrc/lazy/core/ir.h | 41 +----- 
torch/csrc/lazy/core/lazy_graph_executor.cpp | 10 +- torch/csrc/lazy/core/shape.cpp | 8 +- torch/csrc/lazy/core/shape.h | 2 +- torch/csrc/lazy/ts_backend/ts_node.cpp | 18 ++- torch/csrc/lazy/ts_backend/ts_node.h | 2 +- 16 files changed, 95 insertions(+), 243 deletions(-) diff --git a/test/cpp/lazy/test_cache.cpp b/test/cpp/lazy/test_cache.cpp index a02e7eb251a..033b6c21b1e 100644 --- a/test/cpp/lazy/test_cache.cpp +++ b/test/cpp/lazy/test_cache.cpp @@ -11,7 +11,7 @@ namespace lazy { class CacheNode : public Node { public: explicit CacheNode(const std::string& str) - : Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool bakeInSizes) -> hash_t { return Hash(str); }), + : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(str)), str_(str) {} ~CacheNode() override = default; diff --git a/test/cpp/lazy/test_ir.cpp b/test/cpp/lazy/test_ir.cpp index 811ab8b50fe..78b94618c7f 100644 --- a/test/cpp/lazy/test_ir.cpp +++ b/test/cpp/lazy/test_ir.cpp @@ -12,7 +12,7 @@ namespace lazy { class TestLeafNode : public Node { public: explicit TestLeafNode(size_t param) - : Node(OpKind(), /* num_outputs */ 1, /* hash_func */[&](bool bakeInSizes) -> hash_t { return Hash(param); }), + : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(param)), param_(param) {} ~TestLeafNode() override = default; diff --git a/test/cpp/lazy/test_ir_util.cpp b/test/cpp/lazy/test_ir_util.cpp index 7f5202c01d7..5c216258f9a 100644 --- a/test/cpp/lazy/test_ir_util.cpp +++ b/test/cpp/lazy/test_ir_util.cpp @@ -12,7 +12,7 @@ namespace lazy { class IrUtilNode : public Node { public: explicit IrUtilNode() - : Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool bakeInSizes) -> hash_t { return Hash(0); }) {} + : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(0)) {} ~IrUtilNode() override = default; void AddOperand(Value v) { diff --git a/tools/codegen/api/lazy.py b/tools/codegen/api/lazy.py index 9bcec4c8f81..3fe83936eef 100644 --- a/tools/codegen/api/lazy.py +++ b/tools/codegen/api/lazy.py @@ -1,10 +1,10 @@ from typing import List, Union, Tuple from tools.codegen.model import (Type, BaseTy, BaseType, OptionalType, ListType, OperatorName, FunctionSchema, - Return, TensorOptionsArguments) + Return) from tools.codegen.api.types import (BaseCppType, BaseCType, OptionalCType, ConstRefCType, NamedCType, - MutRefCType, deviceT, layoutT, + MutRefCType, VectorCType, boolT, longT, doubleT, ListCType, stringT, scalarT, scalarTypeT, ArrayRefCType, ArrayCType, TupleCType) @@ -33,9 +33,7 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L if typ.name == BaseTy.Tensor: return BaseCType(valueT) elif typ.name == BaseTy.Scalar: - # at::scalar has special handling, - # and is wrapped in an IR value just like at::tensor - return BaseCType(valueT) + return BaseCType(scalarT) elif typ.name == BaseTy.ScalarType: return BaseCType(scalarTypeT) elif typ.name == BaseTy.int: @@ -46,10 +44,6 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L return BaseCType(doubleT) elif typ.name == BaseTy.str: return BaseCType(stringT) - elif typ.name == BaseTy.Device: - return BaseCType(deviceT) - elif typ.name == BaseTy.Layout: - return BaseCType(layoutT) else: raise AssertionError(f"TODO add support for type {repr(typ)}") elif isinstance(typ, OptionalType): @@ -71,30 +65,12 @@ def isValueType(typ: Union[Type, BaseCType, OptionalCType, ConstRefCType, MutRef being Tensor-like, but assumes the type has already been transformed. 
""" if isinstance(typ, BaseCType): - # I am regretting my naming conventions, but now we are wrapping at::scalar in - # lazy value, while preserving other 'scalar' types as scalars in the IR - return typ.type == valueT or typ.type == scalarT + return typ.type == valueT elif isinstance(typ, (OptionalCType, ListCType, VectorCType)): return isValueType(typ.elem) else: return False -def isWrappedScalarType(typ: Type) -> bool: - """ - Given a type, determine if it is a c10::scalar which we will wrap in a lazy Value. - Since we literally change the type from scalarT to valueT, information is lost. - This function helps build a list of wrapped scalars to save that information - """ - if isinstance(typ, BaseType): - # I am regretting my naming conventions, but now we are wrapping at::scalar in - # lazy value, while preserving other 'scalar' types as scalars in the IR - return typ.name == BaseTy.Scalar - elif isinstance(typ, (OptionalType, ListType)): - return isWrappedScalarType(typ.elem) - else: - return False - - # Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node. # Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML), # but carries type information from a native FunctionSchema modified for use with IR nodes, @@ -111,8 +87,6 @@ class LazyIrSchema: # TODO: Need to handle collisions with argument names at some point returns: Tuple['Return', ...] - wrapped_scalar_names: List[str] - def __init__(self, func: FunctionSchema): positional_arg_types = [] @@ -134,15 +108,14 @@ class LazyIrSchema: "tensor_options", "post_tensor_options_kwarg_only", "out"]: - curr_args = getattr(func.arguments, arg_field) - if curr_args is not None: - if isinstance(curr_args, TensorOptionsArguments): - curr_args = curr_args.all() - keyword_arg_types.extend([NamedCType(arg.name, process_ir_type(arg.type)) for arg in curr_args]) + if getattr(func.arguments, arg_field) is not None: + keyword_arg_types.extend([ + NamedCType( + arg.name, + process_ir_type(arg.type)) for arg in getattr(func.arguments, arg_field)]) self.keyword_arg_types = tuple(keyword_arg_types) self.name = func.name self.returns = func.returns - self.wrapped_scalar_names = [arg.name for arg in func.schema_order_arguments() if isWrappedScalarType(arg.type)] @property def node_name(self) -> str: diff --git a/tools/codegen/dest/lazy_ir.py b/tools/codegen/dest/lazy_ir.py index f1145d7fcfe..d41b4edcd8a 100644 --- a/tools/codegen/dest/lazy_ir.py +++ b/tools/codegen/dest/lazy_ir.py @@ -1,4 +1,3 @@ -from abc import ABC, abstractmethod from typing import List, Union from dataclasses import dataclass from tools.codegen.context import method_with_native_function @@ -10,23 +9,17 @@ import tools.codegen.api.dispatcher as dispatcher from tools.codegen.api.lazy import LazyIrSchema, isValueType from tools.codegen.dest.lazy_ts_lowering import ts_lowering_body -def node_ctor_arg_rvalue_string(arg: NamedCType, schema: LazyIrSchema) -> str: + +def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: """ Given a NamedCType from a lazy IR schema, generate a c++ string for materializing an rvalue of that arg for passing into a lazy Node constructor. """ - if isValueType(arg.type): if isinstance(arg.type, BaseCType): - if arg.name in schema.wrapped_scalar_names: - return f"torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen({arg.name})" return f"lazy_{arg.name}.GetIrValue()" elif isinstance(arg.type, OptionalCType): - if arg.name in schema.wrapped_scalar_names: - return f"{arg.name} ? 
" \ - f"c10::make_optional(torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen(*{arg.name})) : " \ - "c10::nullopt" return f"lazy_{arg.name} ? " \ f"c10::make_optional(lazy_{arg.name}.GetIrValue()) : " \ "c10::nullopt" @@ -42,55 +35,24 @@ def node_ctor_arg_rvalue_string(arg: NamedCType, schema: LazyIrSchema) -> str: else: return f"{arg.name}" -def node_ctor_inputs(schema: LazyIrSchema) -> str: +def node_ctor_inputs(func: LazyIrSchema) -> str: """ Produce a formatted string with the arguments as passed into the constructor of a node class. """ - node_ctor_values = [node_ctor_arg_rvalue_string(arg, schema) for arg in schema.filtered_types()] + node_ctor_values = [node_ctor_arg_rvalue_string(arg) for arg in func.filtered_types()] return ",\n ".join(node_ctor_values) -def gen_fallback_code(schema: LazyIrSchema, overload_name: str) -> str: - """ - Generate code that falls back to eager conditioned on a predicate - """ - fallback_args = ",\n ".join([arg.name for arg in schema.filtered_types()]) - if len(overload_name): - aten_op_str = f"ATEN_OP2({schema.aten_name}, {overload_name})" - else: - aten_op_str = f"ATEN_OP({schema.aten_name})" - return f""" - if (force_eager_fallback({aten_symbol(schema)})) {{ - return at::native::call_fallback_fn<<c_eager_fallback, {aten_op_str}>::call( - {fallback_args} - ); - }} -""" - -def aten_symbol(schema: LazyIrSchema) -> str: - missing_interned_strings = { - 'sigmoid_backward', - } - if schema.aten_name in missing_interned_strings: - return f'c10::Symbol::fromQualString("aten::{schema.aten_name}")' - return f'at::aten::{schema.aten_name}' @dataclass(frozen=True) -class LazyIR(ABC): +class LazyIR: backend_index: BackendIndex node_base: str - lowering_function_type: str - lowering_context_type: str - lowering_return_type: str @method_with_native_function def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func return self.gen(f) - @abstractmethod - def lowering_body(self, f): - pass - def gen(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: # for now, we just want one IR class decl and soon after also the method defs # and we use the functional version not out/inplace. 
@@ -101,9 +63,9 @@ class LazyIR(ABC): scalar_types = schema.filtered_types(values=False, scalars=True) node_ctor_args = ", ".join([f"const {i.cpp_type()}& {i.name}" for i in all_types]) - scalar_initializers = ",\n ".join([f"{t.name}({t.name})" for t in scalar_types]) + scalar_initializers = ",\n ".join([f"{t.name}_({t.name})" for t in scalar_types]) comma_if_scalar_initializers = ",\n" if len(scalar_initializers) else "" - scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name};" for t in scalar_types]) + scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name}_;" for t in scalar_types]) scalar_hashes = ", ".join([f"{f.name}" for f in scalar_types]) base_ctor_value_args_list = [] optional_values = [] @@ -121,20 +83,21 @@ class LazyIR(ABC): members_to_string = [] for t in scalar_types: if isinstance(t.type, OptionalCType): - members_to_string.append(f"""if ({t.name}.has_value()) {{ - ss << ", {t.name}=" << {t.name}.value(); + members_to_string.append(f"""if ({t.name}_.has_value()) {{ + ss << ", {t.name}=" << {t.name}_.value(); }} else {{ ss << ", {t.name}=null"; }}""") else: - members_to_string.append(f'ss << ", {t.name}=" << {t.name};') + members_to_string.append(f'ss << ", {t.name}=" << {t.name}_;') members_to_string_str = "\n ".join(members_to_string) return [f"""\ +// TODO(alanwaketan): Public members don't need to have _ suffix. class {schema.node_name} : public {self.node_base} {{ public: {schema.node_name}({node_ctor_args}, std::vector&& shapes) - : {self.node_base}(torch::lazy::OpKind({aten_symbol(schema)}), + : {self.node_base}(torch::lazy::OpKind(at::aten::{schema.aten_name}), {{{base_ctor_value_args}}}, std::move(shapes), /* num_outputs */ {len(func.returns)}, torch::lazy::MHash({scalar_hashes})){comma_if_scalar_initializers} @@ -146,14 +109,14 @@ class {schema.node_name} : public {self.node_base} {{ std::string ToString() const override {{ std::stringstream ss; - ss << {self.node_base}::ToString(); + ss << TsNode::ToString(); {members_to_string_str} return ss.str(); }} - {self.lowering_return_type} Lower({self.lowering_function_type} function, - {self.lowering_context_type} loctx) const override {{ - {self.lowering_body(f)} + torch::lazy::TSOpVector Lower(std::shared_ptr function, + torch::lazy::TSLoweringContext* loctx) const override {{ + {ts_lowering_body(f)} }} {scalar_decls} @@ -164,34 +127,21 @@ class {schema.node_name} : public {self.node_base} {{ """, ] -@dataclass(frozen=True) -class TSLazyIR(LazyIR): - lowering_function_type: str = "std::shared_ptr" - lowering_context_type: str = "torch::lazy::TSLoweringContext*" - lowering_return_type: str = "torch::lazy::TSOpVector" - - def lowering_body(self, f): - return ts_lowering_body(f) - - -def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str, schema: LazyIrSchema) -> str: +def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str) -> str: lazy_tensor_decls: List[str] = [] for t in value_types: - if t.name in schema.wrapped_scalar_names: - # no lazy tensor wrapper for scalars that are promoted to IR values - continue if isinstance(t.type, BaseCType): lazy_tensor_decls.append( f"{tensor_class} lazy_{t.name} = " - f"torch::lazy::GetLtcTensorOrCreateForWrappedNumber({t.name}, *common_device);") + f"GetLtcTensorOrCreateForWrappedNumber({t.name}, *device);") elif isinstance(t.type, OptionalCType): # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it # until we encounter a real world example. 
lazy_tensor_decls.append( - f" {tensor_class} lazy_{t.name} = torch::lazy::TryGetLtcTensor({t.name}.value_or(at::Tensor()));") + f" {tensor_class} lazy_{t.name} = TryGetLtcTensor({t.name}.value_or(at::Tensor()));") else: raise AssertionError("TODO not sure if there are other valid types to handle here") - return ("\n ").join(lazy_tensor_decls) + return "\n ".join(lazy_tensor_decls) @dataclass(frozen=True) class GenLazyNativeFuncDefinition: @@ -202,7 +152,7 @@ class GenLazyNativeFuncDefinition: @method_with_native_function def __call__(self, func: NativeFunction) -> List[str]: sig = kernel_signature(func, self.backend_index) - metadata = self.backend_index.get_kernel(func) + # Lazy IR stuff schema = LazyIrSchema(func.func) all_types = schema.filtered_types() @@ -210,14 +160,9 @@ class GenLazyNativeFuncDefinition: scalar_types = schema.filtered_types(values=False, scalars=True) returns_length = len(schema.returns) - fallback_str = gen_fallback_code(schema, overload_name=func.func.name.overload_name) - value_types_names = [f"{t.name}" for t in value_types if t.name not in schema.wrapped_scalar_names] - assert len(value_types_names) > 0, "Code below assumes there is at least one tensor arg" - get_device_str = f"""auto common_device = torch::lazy::GetBackendDevice({', '.join(value_types_names)}); - TORCH_INTERNAL_ASSERT(common_device); - """ - - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) + value_types_names = ", ".join([f"{t.name}" for t in value_types]) + get_device_str = f"""auto device = bridge::GetBackendDevice({value_types_names});""" + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) node_ctor_input_str = node_ctor_inputs(schema) # call the meta kernel if it exists, to compute output shape/dtype for our IR @@ -229,40 +174,37 @@ class GenLazyNativeFuncDefinition: shapes_str = ','.join([this_shape(i) for i in range(returns_length)]) meta_out = "std::vector shapes{" + shapes_str + "};" - # TODO: INTEGRATION POINT HERE: meta_str = f"""auto out_meta = at::meta::{schema.aten_name}({', '.join(str(t.name) for t in all_types)}); {meta_out}""" else: - shape_sig = ComputeShapeSignature(metadata.kernel, func) + shape_sig = ComputeShapeSignature(func) meta_str = f""" auto shapes = {shape_sig.shape_call};""" - meta_str += f""" TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});""" node_str = f"""auto node = torch::lazy::MakeNode({node_ctor_input_str}, std::move(shapes));""" - first_tensor_name = value_types_names[0] - bridge_str = """auto result = torch::lazy::CreateAtenFromLtcTensor( - torch::lazy::LazyTensor::Create(std::move(node), *common_device));""" + assert len(value_types) > 0, f"Only supporting tensor ops so far, none found in {sig}" + first_tensor = value_types[0] + bridge_str = f"""auto result = CreateAtenFromLtcTensor(lazy_{first_tensor.name}.CreateFrom(node));""" if returns_length > 1: bridge_str = f"""std::vector<{self.tensor_class}> lazy_tensors; for (int i = 0; i < {returns_length}; i++) {{ - lazy_tensors.push_back(torch::lazy::LazyTensor::Create(torch::lazy::Value(node, i), *common_device)); + lazy_tensors.push_back(lazy_{first_tensor.name}.CreateFrom(torch::lazy::Value(node, i))); }} - auto result = torch::lazy::TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" - - if schema.name.name.inplace or func.func.is_out_fn(): + auto result = TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" + if schema.name.name.inplace: assert returns_length == 1, "We assumed there was no such case where an op is an 
in-place variant " \ "and has tuple outputs." - bridge_str = f"""lazy_{first_tensor_name}.SetInPlaceIrValue(node); - auto& result = {first_tensor_name};""" + bridge_str = f"""lazy_{first_tensor.name}.SetInPlaceIrValue(node); + auto& result = {first_tensor.name};""" return [f"""\ - {sig.decl(name=f"{self.class_method_name}::{metadata.kernel}")} {{ - {fallback_str} + // TODO(alanwaketan): Quite a lot inefficient copy-by-value there. Let's optimize it. + {sig.decl(name=f"{self.class_method_name}::{schema.aten_name}")} {{ TORCH_LAZY_FN_COUNTER("lazy::"); {get_device_str} {lazy_tensor_decls_str} @@ -277,17 +219,17 @@ class ComputeShapeSignature: """ Here we use the base name as the suffix of the signature to avoid generating for in-place variants. """ - def __init__(self, kernel_name: str, f: NativeFunction): + @method_with_native_function + def __init__(self, f: NativeFunction): self.__schema = LazyIrSchema(f.func) self.__dispatch_args = ', '.join([a.decl() for a in dispatcher.arguments(f.func)]) self.__call_args = ", ".join([f"{t.name}" for t in self.__schema.filtered_types()]) - self.__kernel_name = kernel_name def __decl_suffix(self) -> str: - return f"{self.__kernel_name}({self.__dispatch_args})" + return f"{self.__schema.base_name}({self.__dispatch_args})" def __call_suffix(self) -> str: - return f"{self.__kernel_name}({self.__call_args})" + return f"{self.__schema.base_name}({self.__call_args})" @property def shape_decl(self) -> str: @@ -304,20 +246,19 @@ class GenLazyShapeInferenceDefinition: tensor_class: str @method_with_native_function - # def gen_lazy_shape_inference_decl(f: NativeFunction, backend_index: BackendIndex, tensor_class: str) -> List[str]: def __call__(self, f: NativeFunction) -> List[str]: sig = kernel_signature(f, self.backend_index) - metadata = self.backend_index.get_kernel(f) + # Lazy IR stuff schema = LazyIrSchema(f.func) value_types = schema.filtered_types(values=True, scalars=False) - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) node_ctor_input_str = node_ctor_inputs(schema) # Only generate shape/dtype fn for non-structured kernels, # since we just use the meta function for structured kernels if not f.structured and f.structured_delegate is None: - shape_sig = ComputeShapeSignature(metadata.kernel, f) + shape_sig = ComputeShapeSignature(f) return ["\n".join([f"{shape_sig.shape_decl};"])] else: return [] diff --git a/tools/codegen/dest/lazy_ts_lowering.py b/tools/codegen/dest/lazy_ts_lowering.py index 3f7701d5587..32d505cda7b 100644 --- a/tools/codegen/dest/lazy_ts_lowering.py +++ b/tools/codegen/dest/lazy_ts_lowering.py @@ -18,12 +18,13 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: continue emplace_arguments.append('loctx->GetOutputOp(operand(i++))') continue - emplace_arguments.append(f'"{value.name}", {value.name}') + emplace_arguments.append(f'"{value.name}", {value.name}_') emplace_arguments_str = "\n ".join( [f"arguments.emplace_back({a});" for a in emplace_arguments]) - emplace_kwarg_values = [f'"{t.name}", loctx->GetOutputOp(operand(i++))' for t in schema.keyword_values] - emplace_kwarg_scalars = [f'"{t.name}", {t.name}' for t in schema.keyword_scalars] + emplace_kwarg_values = [f'loctx->GetOutputOp(operand({i}))' for i in range(len(schema.keyword_values))] + emplace_kwarg_scalars = [f'"{t.name}", {t.name}_' for t in schema.keyword_scalars] + assert len(schema.keyword_values) == 0, "TODO the logic for operand(i) 
is broken if there are kw values" emplace_kwarguments = "\n ".join( [f"kwarguments.emplace_back({a});" for a in emplace_kwarg_values + emplace_kwarg_scalars]) return f"""\ @@ -37,5 +38,6 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments); CHECK_EQ({schema.aten_name}_out.size(), {len(func.returns)}); + // TODO: need to call GenerateClone sometimes? Or else return LowerBuiltIn() directly return {schema.aten_name}_out; """ diff --git a/tools/codegen/gen_lazy_tensor.py b/tools/codegen/gen_lazy_tensor.py index 9705620fa2e..b2515d3d083 100644 --- a/tools/codegen/gen_lazy_tensor.py +++ b/tools/codegen/gen_lazy_tensor.py @@ -3,8 +3,7 @@ import argparse import os import yaml from collections import namedtuple -from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple, Type -from tools.codegen.dest.lazy_ir import LazyIR, TSLazyIR +from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple from tools.codegen.gen import get_grouped_native_functions, parse_native_yaml from tools.codegen.model import (FunctionSchema, NativeFunction, NativeFunctionsGroup, OperatorName) @@ -61,20 +60,20 @@ def main() -> None: parser.add_argument( '--node_base_hdr', type=str, default=None, help='Path to header file defining custom Lazy IR Node base class') parser.add_argument( - '--tensor_class', type=str, default="torch::lazy::LazyTensor", help='Name of backend specific custom Lazy Tensor class') + '--tensor_class', type=str, default="LazyTensor", help='Name of backend specific custom Lazy Tensor class') parser.add_argument( - '--tensor_class_hdr', type=str, default="torch/csrc/lazy/core/tensor.h", + '--tensor_class_hdr', type=str, default="lazy_tensor_core/csrc/tensor.h", help='Path to header file defining custom Lazy Tensor class') options = parser.parse_args() run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path, options.gen_ts_lowerings, options.node_base, options.node_base_hdr, - options.tensor_class, options.tensor_class_hdr, TSLazyIR) + options.tensor_class, options.tensor_class_hdr) def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[str], gen_ts_lowerings: bool, node_base: str, node_base_hdr: Optional[str], - tensor_class: str, tensor_class_hdr: str, lazy_ir_cls: Type[LazyIR]) -> None: + tensor_class: str, tensor_class_hdr: str) -> None: # Assumes that this file lives at PYTORCH_ROOT/tools/codegen/gen_backend_stubs.py pytorch_root = pathlib.Path(__file__).parent.parent.parent.absolute() @@ -161,13 +160,11 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st fm.write_with_template(f'{backend_key}NativeFunctions.cpp', 'DispatchKeyNativeFunctions.cpp', lambda: { 'includes': [f'#include <{path}>' for path in [ tensor_class_hdr, - "ATen/Functions.h", "ATen/MetaFunctions.h", - "ATen/Operators.h", - "torch/csrc/lazy/core/lazy_graph_executor.h", "torch/csrc/lazy/core/metrics.h", "torch/csrc/lazy/core/shape.h", - "lazy_tensor_core/csrc/ts_backend/aten_eager_fallback.h", + "lazy_tensor_core/csrc/aten_ltc_bridge.h", + "lazy_tensor_core/csrc/lazy_graph_executor.h", f"{output_dir}/{backend_key}NativeFunctions.h", f"{output_dir}/{backend_key}LazyIr.h", f"{output_dir}/{backend_key}ShapeInference.h", @@ -199,8 +196,7 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st 'func_declarations': 
list(concat_map_codegen( dest.GenLazyShapeInferenceDefinition(backend_indices[backend_key], tensor_class), - grouped_native_functions, - codegenInplaceVariant=True, + grouped_native_functions )), }) # Generate IR node classes @@ -221,7 +217,7 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st 'DispatchKey': backend_key, 'dispatch_namespace': backend_key.lower(), 'ir_declarations': list(concat_map_codegen( - lazy_ir_cls(backend_indices[backend_key], node_base), + dest.LazyIR(backend_indices[backend_key], node_base), grouped_native_functions )), }) diff --git a/torch/csrc/lazy/core/config.cpp b/torch/csrc/lazy/core/config.cpp index b47054913e1..af86dd926d6 100644 --- a/torch/csrc/lazy/core/config.cpp +++ b/torch/csrc/lazy/core/config.cpp @@ -7,11 +7,6 @@ C10_DEFINE_bool( false, "Enable parameter aliasing support"); -C10_DEFINE_bool( - torch_lazy_use_thread_pool, - false, - "Use thread pool to schedule backend execution"); - C10_DEFINE_int( torch_lazy_compilation_cache_size, 1024, diff --git a/torch/csrc/lazy/core/config.h b/torch/csrc/lazy/core/config.h index fa6630123cd..beee5b4b214 100644 --- a/torch/csrc/lazy/core/config.h +++ b/torch/csrc/lazy/core/config.h @@ -3,7 +3,6 @@ C10_DECLARE_bool(torch_lazy_ir_debug); C10_DECLARE_bool(torch_lazy_param_aliasing); -C10_DECLARE_bool(torch_lazy_use_thread_pool); C10_DECLARE_int(torch_lazy_compilation_cache_size); C10_DECLARE_int(torch_lazy_device_data_cache_size); diff --git a/torch/csrc/lazy/core/ir.cpp b/torch/csrc/lazy/core/ir.cpp index a1726aacba6..63e6ee8744c 100644 --- a/torch/csrc/lazy/core/ir.cpp +++ b/torch/csrc/lazy/core/ir.cpp @@ -1,8 +1,6 @@ #include #include -C10_DEFINE_bool(ltc_enable_dynamic_shapes, false, "Whether dynamic shape is enabled"); - namespace torch { namespace lazy { @@ -25,14 +23,6 @@ hash_t Value::hash() const { return HashCombine(node->hash(), Hash(index)); } -hash_t Value::hash_with_sizes() const { - return HashCombine(node->hash_with_sizes(), Hash(index)); -} - -hash_t Value::hash_without_sizes() const { - return HashCombine(node->hash_without_sizes(), Hash(index)); -} - OpKind OpKind::Get(const std::string& name) { return OpKind(c10::Symbol::fromQualString(name)); } @@ -41,25 +31,18 @@ hash_t OpKind::hash() const { return StringHash(op.toQualString()); } -bool Node::enableDynamicShape() { - static bool enabled = std::getenv("LTC_ENABLE_DYNAMIC_SHAPES") != nullptr; - return enabled || FLAGS_ltc_enable_dynamic_shapes; -} - -Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn) +Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash) : op_(op), num_outputs_(num_outputs), node_hash_(node_hash), - dag_hash_without_sizes_(dag_hash_fn(false)), - dag_hash_with_sizes_(dag_hash_fn(true)), + dag_hash_(dag_hash), metadata_(GetMetaDataIfDebugging()) {} -Node::Node(OpKind op, size_t num_outputs, std::function node_hash_fn) +Node::Node(OpKind op, size_t num_outputs, hash_t node_hash) : op_(op), num_outputs_(num_outputs), - node_hash_(node_hash_fn(!enableDynamicShape())), - dag_hash_without_sizes_(node_hash_fn(false)), - dag_hash_with_sizes_(node_hash_fn(true)), + node_hash_(node_hash), + dag_hash_(node_hash), metadata_(GetMetaDataIfDebugging()) {} Node::~Node() = default; diff --git a/torch/csrc/lazy/core/ir.h b/torch/csrc/lazy/core/ir.h index 4132400bb65..6ca1df8d2fb 100644 --- a/torch/csrc/lazy/core/ir.h +++ b/torch/csrc/lazy/core/ir.h @@ -15,9 +15,6 @@ #include #include #include -#include - -C10_DECLARE_bool(ltc_enable_dynamic_shapes); 
namespace torch { namespace lazy { @@ -68,12 +65,9 @@ using OutputMap = std::unordered_map; // Represents an input/operand for a Node object. struct TORCH_API Value { Value() = default; - /* implicit */ Value(NodePtr&& node, size_t index = 0) : node(std::move(node)), index(index) {} - /* implicit */ Value(const NodePtr& node, size_t index = 0) : node(node), index(index) {} + /* implicit */ Value(NodePtr node, size_t index = 0) : node(std::move(node)), index(index) {} hash_t hash() const; - hash_t hash_with_sizes() const; - hash_t hash_without_sizes() const; operator bool() const { return node != nullptr; @@ -127,6 +121,7 @@ inline std::ostream& operator<<(std::ostream& stream, const OpKind& op) { using OpList = c10::ArrayRef; + // A node in the graph. Nodes for operations which requires extra data to be // stored for lowering, should inherit from this class and add operation // specific member there. For example, a constant might create a new @@ -135,18 +130,13 @@ using OpList = c10::ArrayRef; // client data handle in it. class TORCH_API Node { public: - static bool enableDynamicShape(); - // Creates a new node with the given op name. The op is a unique identifier // for the operation. The num_outputs tells how many outputs a given operation // generates. - // - // None leaf node's node_hash does not contains shape information always. - // So we pass in the hash value rather than a function. - Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn); + Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash); // Contructor used to create leaf nodes. - Node(OpKind op, size_t num_outputs, std::function node_hash_fn); + Node(OpKind op, size_t num_outputs, hash_t node_hash); virtual ~Node(); @@ -167,15 +157,7 @@ class TORCH_API Node { } hash_t hash() const { - return enableDynamicShape() ? dag_hash_without_sizes_ : dag_hash_with_sizes_; - } - - hash_t hash_without_sizes() const { - return dag_hash_without_sizes_; - } - - hash_t hash_with_sizes() const { - return dag_hash_with_sizes_; + return dag_hash_; } const MetaData& metadata() const { @@ -201,17 +183,8 @@ class TORCH_API Node { // The hash value of this node. hash_t node_hash_; - // dag_hash represents the hash value of the graph rooted at this node. There are 2 variants, one - // with sizes info and one without. We need 2 such hashes to support dynamic - // shape. Here are the logic to pick the hash in the 2 major scenarios that a hash is needed: - // - shape cache: in this case, we always use the dag hash with size info. This way, looking up the - // shape for one node does not get the shape for another node with the same rank but different sizes - // - lookup the compiled graph by a hash: in this case, we will use the dag hash - // WITHOUT size info if dynamic shape is enabled and use the dag hash WITH size info otherwise. - // The different requirement for the hash in these 2 scenarios forces us to maintain 2 - // different hashes. - hash_t dag_hash_without_sizes_; - hash_t dag_hash_with_sizes_; + // The hash value of the graph rooted at this node. + hash_t dag_hash_; // The IR specific metadata attached to the IR node. 
MetaData metadata_; // The IR framework user can attach a user defined metadata object deriving diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index 3599abb7b8d..9f504c935e9 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -462,7 +462,7 @@ void LazyGraphExecutor::SyncTensorsGraph( config.sync_ltc_data = sync_ltc_data; auto async = SyncTensorsGraphInternal(tensors, devices, config); - if (FLAGS_torch_lazy_use_thread_pool && wait && async != nullptr) { + if (wait && async != nullptr) { async->mwait.Wait(); } } @@ -972,11 +972,7 @@ std::shared_ptr LazyGraphExecutor:: } }; - if (FLAGS_torch_lazy_use_thread_pool) { - ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); - } else { - syncfn(); - } + ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); return async; } @@ -999,7 +995,7 @@ std::vector LazyGraphExecutor::GetTensorsFused( SyncTensorsConfig config; config.force_ltc_data = false; auto async = SyncTensorsGraphInternal(tensors, {}, config); - if (FLAGS_torch_lazy_use_thread_pool && async != nullptr) { + if (async != nullptr) { async->mwait.Wait(); } std::vector tensors_data = GatherTensorsData( diff --git a/torch/csrc/lazy/core/shape.cpp b/torch/csrc/lazy/core/shape.cpp index bd5ea5b75c9..2b7fd2c74b8 100644 --- a/torch/csrc/lazy/core/shape.cpp +++ b/torch/csrc/lazy/core/shape.cpp @@ -28,12 +28,8 @@ size_t Shape::numel() const { return elts; } -hash_t Shape::hash(bool bakeInSizes) const { - if (bakeInSizes) { - return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); - } else { - return HashCombine(Hash(scalar_type_), Hash(sizes_.size())); - } +hash_t Shape::hash() const { + return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); } } // namespace lazy diff --git a/torch/csrc/lazy/core/shape.h b/torch/csrc/lazy/core/shape.h index 9b34b90fec0..c67ff908833 100644 --- a/torch/csrc/lazy/core/shape.h +++ b/torch/csrc/lazy/core/shape.h @@ -25,7 +25,7 @@ class TORCH_API Shape { int64_t size(int64_t dim) const { return sizes_.at(dim); } void set_size(int64_t dim, int64_t size) { sizes_.at(dim) = size; } size_t numel() const; - hash_t hash(bool bakeInSizes) const; + hash_t hash() const; bool operator==(const Shape& other) const; diff --git a/torch/csrc/lazy/ts_backend/ts_node.cpp b/torch/csrc/lazy/ts_backend/ts_node.cpp index a7948e5cbec..d79dd999f81 100644 --- a/torch/csrc/lazy/ts_backend/ts_node.cpp +++ b/torch/csrc/lazy/ts_backend/ts_node.cpp @@ -28,15 +28,14 @@ void TsNodeSetShapeDeferred( throw std::runtime_error("Expected TsNode but could not dynamic cast"); } -hash_t OperandHashes(const OpList& operands, const hash_t& seed, bool bakeInSizes) { +hash_t OperandHashes(const OpList& operands, const hash_t& seed) { hash_t hash = seed; for (auto& operand : operands) { if (!operand) { hash = HashCombine(hash, static_cast(kNullOpt)); continue; } - auto operand_hash = bakeInSizes ? operand.hash_with_sizes() : operand.hash_without_sizes(); - hash = HashCombine(hash, operand_hash); + hash = HashCombine(hash, operand.hash()); } return hash; } @@ -49,7 +48,7 @@ TsNode::TsNode(OpKind op, OpList operands, std::vector&& shapes, // initialization to a separate function? 
      /* node_hash */ HashCombine(op.hash(), hash_seed),
      /* dag_hash */
-      [&](bool bakeInSizes) { return OperandHashes(operands, HashCombine(op.hash(), hash_seed), bakeInSizes); }),
+      OperandHashes(operands, HashCombine(op.hash(), hash_seed))),
       shapes_(shapes) {
   for (auto& operand : operands) {
     // Ideally, optional operands should be filtered by the leaf node classes,
@@ -81,7 +80,7 @@ void TsNode::SetShapeDeferred(
 }
 TsNode::TsNode(OpKind op, Shape shape, size_t num_outputs, hash_t hash_seed)
-    : Node(op, num_outputs, [&](bool bakeInSizes) -> hash_t { return GetOpHash(op, shape, hash_seed, bakeInSizes); })
+    : Node(op, num_outputs, GetOpHash(op, shape, hash_seed))
 {
   shapes_.push_back(std::move(shape));
 }
@@ -99,11 +98,10 @@ ShapeCache* GetShapeCache() {
 Shape TsNode::GetOpShape(
     const std::function& shape_fn) const {
-  auto hash = hash_with_sizes();
   ShapeCache* shape_cache = GetShapeCache();
-  auto shape = shape_cache->Get(hash);
+  auto shape = shape_cache->Get(hash());
   if (shape == nullptr) {
-    shape = shape_cache->Add(hash,
+    shape = shape_cache->Add(hash(),
                              std::make_shared(shape_fn()));
   }
   return *shape;
@@ -122,8 +120,8 @@ std::string TsNode::ToString() const {
   return ss.str();
 }
-hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes) {
-  hash_t h = HashCombine(op.hash(), shape.hash(bakeInSizes));
+hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed) {
+  hash_t h = HashCombine(op.hash(), shape.hash());
   return HashCombine(h, hash_seed);
 }
diff --git a/torch/csrc/lazy/ts_backend/ts_node.h b/torch/csrc/lazy/ts_backend/ts_node.h
index 156444852d9..a6595a5337d 100644
--- a/torch/csrc/lazy/ts_backend/ts_node.h
+++ b/torch/csrc/lazy/ts_backend/ts_node.h
@@ -55,7 +55,7 @@ class TORCH_API TsNode : public lazy::Node {
   std::string ToString() const override;
-  static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes);
+  static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed);
   const std::vector& operands() const override {
     return operands_as_outputs_;

From 3fcdff061508b6e3a0fd50d91b89dda053702f73 Mon Sep 17 00:00:00 2001
From: Michael Suo
Date: Wed, 16 Feb 2022 21:03:09 +0000
Subject: [PATCH 106/199] Set pull_request checkout to head sha
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**this is a re-submit of this PR, the previous version broke forked pull requests by checking out the head ref as opposed to the head sha**

There are two commits that we sometimes test in CI:
1. The merge commit (a test merge between the PR head ref and the latest base ref)
2. The head ref (the exact commit that was at the head of the user's branch when they pushed).

This distinction is fairly subtle; in the case of 1, you are effectively running against a "rebased" version of your PR's branch.

The problem is that we use *both* of these commits today, with confusing results: depending on how you put up your PR and what workflows are running, we might be testing two different commits! We should probably consolidate on one. This would eliminate a subtle but complex part of our CI (I am mildly horrified by the complexity of [this explanation](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md#which-commit-is-used-in-ci), although it's heroic that someone went and documented it lol).

This PR consolidates on using the head ref (option 2).
- This is the behavior of phabricator/fbcode, which many PT devs will be more familiar with.
- This is the behavior of ghstack - Our master branch moves quite quickly, so the chance that there is a substantial divergence between your local test runs and CI is high, with confusing results that are nondeterministic based on when you put up the PR. - We use a linear history/squash-rebase-merge workflow, which is better modeled by option 2. Option 1 effectively emulates a merge-commit-style workflow. The primary disadvantage is that now when re-running workflows, you will not be re-running against a "rebased" version of the PR, but the exact head ref that was pushed. Tbh I find it quite unintuitive that what you're testing changes depending on when you press the re-run button, but I know at least @malfet does this so it's worth mentioning. Pull Request resolved: https://github.com/pytorch/pytorch/pull/71974 --- .github/templates/common.yml.j2 | 5 +++- .../linux_binary_build_workflow.yml.j2 | 2 +- ...rated-caffe2-linux-xenial-py3.7-gcc5.4.yml | 1 + .github/workflows/generated-docker-builds.yml | 1 + .../generated-ios-12-5-1-arm64-coreml.yml | 1 + .../generated-ios-12-5-1-arm64-custom-ops.yml | 1 + .../generated-ios-12-5-1-arm64-full-jit.yml | 1 + .../generated-ios-12-5-1-arm64-metal.yml | 1 + .../workflows/generated-ios-12-5-1-arm64.yml | 1 + .../generated-ios-12-5-1-x86-64-coreml.yml | 1 + .../generated-ios-12-5-1-x86-64-full-jit.yml | 1 + .../workflows/generated-ios-12-5-1-x86-64.yml | 1 + ...torch-linux-xenial-cuda10.2-py3.7-gcc7.yml | 1 + ...torch-linux-xenial-cuda11.3-py3.7-gcc7.yml | 1 + .../generated-linux-binary-conda.yml | 20 +++++++++++++ ...erated-linux-binary-libtorch-cxx11-abi.yml | 20 +++++++++++++ ...erated-linux-binary-libtorch-pre-cxx11.yml | 20 +++++++++++++ .../generated-linux-binary-manywheel.yml | 28 +++++++++++++++++++ ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 2 ++ .../generated-linux-bionic-py3.7-clang9.yml | 2 ++ .../generated-linux-bionic-rocm4.5-py3.7.yml | 2 ++ .../workflows/generated-linux-docs-push.yml | 2 ++ .github/workflows/generated-linux-docs.yml | 2 ++ ...rated-linux-vulkan-bionic-py3.7-clang9.yml | 2 ++ ...-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml | 1 + ...inux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml | 1 + ...rated-linux-xenial-cuda11.3-py3.7-gcc7.yml | 2 ++ ...d-linux-xenial-py3-clang5-mobile-build.yml | 1 + ...-py3-clang5-mobile-custom-build-static.yml | 1 + ...nerated-linux-xenial-py3.7-clang7-asan.yml | 2 ++ ...nerated-linux-xenial-py3.7-clang7-onnx.yml | 2 ++ .../generated-linux-xenial-py3.7-gcc5.4.yml | 2 ++ ...nerated-linux-xenial-py3.7-gcc7-no-ops.yml | 1 + .../generated-linux-xenial-py3.7-gcc7.yml | 2 ++ .../generated-macos-10-15-py3-arm64.yml | 1 + ...acos-10-15-py3-lite-interpreter-x86-64.yml | 1 + .../generated-macos-11-py3-x86-64.yml | 2 ++ .../generated-macos-arm64-binary-conda.yml | 6 ++++ .../generated-macos-arm64-binary-wheel.yml | 8 ++++++ .../generated-macos-binary-conda.yml | 8 ++++++ ...erated-macos-binary-libtorch-cxx11-abi.yml | 8 ++++++ ...erated-macos-binary-libtorch-pre-cxx11.yml | 8 ++++++ .../generated-macos-binary-wheel.yml | 8 ++++++ ...rallelnative-linux-xenial-py3.7-gcc5.4.yml | 2 ++ ...torch-linux-bionic-cuda11.5-py3.7-gcc7.yml | 1 + ...torch-linux-xenial-cuda11.1-py3.7-gcc7.yml | 1 + ...iodic-linux-bionic-cuda11.5-py3.7-gcc7.yml | 2 ++ ...enial-cuda10.2-py3-gcc7-slow-gradcheck.yml | 2 ++ ...linux-xenial-cuda11.1-py3.7-gcc7-debug.yml | 2 ++ ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 2 ++ ...rated-periodic-win-vs2019-cuda11.5-py3.yml | 2 ++ ...nial-py3-clang5-android-ndk-r19c-build.yml | 1 + 
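
For readers skimming the largely mechanical diff below, here is a minimal sketch of what a generated workflow's checkout step looks like once this change is applied. Only the `Checkout PyTorch` step is taken from the generated YAML; the workflow name, triggers, job name, and `ubuntu-latest` runner are illustrative placeholders rather than what the real generated workflows use.

```yaml
# Illustrative workflow only: real generated workflows differ in names,
# triggers, and runners. The checkout step mirrors what this patch adds.
name: example-checkout
on: [push, pull_request]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout PyTorch
        uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
        with:
          # On pull_request events, test the exact commit the author pushed;
          # otherwise fall back to the commit that triggered the run.
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
          # deep clone, to allow use of git merge-base
          fetch-depth: 0
          submodules: recursive
```

Because `&&` and `||` in GitHub Actions expressions return the value of an operand rather than a boolean, the `ref` expression acts like a ternary: pull_request runs check out the exact head SHA the author pushed, and every other event falls back to `github.sha`.
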
...9c-gradle-custom-build-single-full-jit.yml | 1 + ...id-ndk-r19c-gradle-custom-build-single.yml | 1 + .../generated-win-vs2019-cpu-py3.yml | 2 ++ .../generated-win-vs2019-cuda11.3-py3.yml | 2 ++ 56 files changed, 203 insertions(+), 2 deletions(-) diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 9d5edc4c2cd..e8c92296c6a 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -185,10 +185,13 @@ concurrency: docker system prune -af {%- endmacro -%} -{%- macro checkout(submodules="recursive", deep_clone=True, directory="", repository="pytorch/pytorch", branch="") -%} +{%- macro checkout(submodules="recursive", deep_clone=True, directory="", repository="pytorch/pytorch", branch="", checkout_pr_head=True) -%} - name: Checkout !{{ 'PyTorch' if repository == "pytorch/pytorch" else repository }} uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + {%- if checkout_pr_head %} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + {%- endif %} {%- if deep_clone %} # deep clone, to allow use of git merge-base fetch-depth: 0 diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index 86144ff3ddd..ec8b56a8c98 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -53,7 +53,7 @@ jobs: steps: !{{ common.setup_ec2_linux() }} !{{ common.checkout(deep_clone=False, directory="pytorch") }} - !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", checkout_pr_head=False) }} {%- if config["gpu_arch_type"] == 'cuda' and config["gpu_arch_version"].startswith('11') %} - name: Set BUILD_SPLIT_CUDA run: | diff --git a/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml index c1932cbf09e..7dc979ad0fb 100644 --- a/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-caffe2-linux-xenial-py3.7-gcc5.4.yml @@ -96,6 +96,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-docker-builds.yml b/.github/workflows/generated-docker-builds.yml index 0dc8ac9b98b..6687a8b7134 100644 --- a/.github/workflows/generated-docker-builds.yml +++ b/.github/workflows/generated-docker-builds.yml @@ -106,6 +106,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml index 6995b22347e..d8bc3694ede 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 
'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml index 0fd77eef860..61716b86f99 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml index 876e1e811f1..601e3cbb168 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml index 065f311e90f..a53ee7d40bf 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-ios-12-5-1-arm64.yml b/.github/workflows/generated-ios-12-5-1-arm64.yml index 2de63df2629..763356596b8 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml b/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml index 4306711a621..2dcd01d16c8 100644 --- a/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml +++ b/.github/workflows/generated-ios-12-5-1-x86-64-coreml.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml index 18553b41449..5562903e39a 100644 --- a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml +++ b/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + 
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-ios-12-5-1-x86-64.yml b/.github/workflows/generated-ios-12-5-1-x86-64.yml index 0a92814866a..ce0401abca4 100644 --- a/.github/workflows/generated-ios-12-5-1-x86-64.yml +++ b/.github/workflows/generated-ios-12-5-1-x86-64.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml index fc55ce8dc28..6726c82ebe4 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda10.2-py3.7-gcc7.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml index 452c2007610..5d4ff086f1d 100644 --- a/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ b/.github/workflows/generated-libtorch-linux-xenial-cuda11.3-py3.7-gcc7.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-binary-conda.yml b/.github/workflows/generated-linux-binary-conda.yml index 6b3a74dec47..30d56f1ad6a 100644 --- a/.github/workflows/generated-linux-binary-conda.yml +++ b/.github/workflows/generated-linux-binary-conda.yml @@ -100,6 +100,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -490,6 +491,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -887,6 +889,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -1287,6 +1290,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -1687,6 
+1691,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2086,6 +2091,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2476,6 +2482,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2873,6 +2880,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -3273,6 +3281,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -3673,6 +3682,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4072,6 +4082,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4462,6 +4473,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4859,6 +4871,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -5259,6 +5272,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -5659,6 +5673,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6058,6 +6073,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6448,6 
+6464,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6845,6 +6862,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -7245,6 +7263,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -7645,6 +7664,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml index 6cfdc08cd04..24caf3c7f45 100644 --- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml @@ -101,6 +101,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -493,6 +494,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -885,6 +887,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -1277,6 +1280,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -1670,6 +1674,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2070,6 +2075,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2470,6 +2476,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch 
checkout @@ -2870,6 +2877,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -3270,6 +3278,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -3673,6 +3682,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4076,6 +4086,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4479,6 +4490,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4882,6 +4894,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -5285,6 +5298,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -5688,6 +5702,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6091,6 +6106,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6494,6 +6510,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6897,6 +6914,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -7300,6 +7318,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch 
checkout @@ -7703,6 +7722,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml index c39fb1c690c..42c8401ed32 100644 --- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml @@ -101,6 +101,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -493,6 +494,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -885,6 +887,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -1277,6 +1280,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -1670,6 +1674,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2070,6 +2075,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2470,6 +2476,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2870,6 +2877,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -3270,6 +3278,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -3673,6 +3682,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - 
name: Clean PyTorch checkout @@ -4076,6 +4086,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4479,6 +4490,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4882,6 +4894,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -5285,6 +5298,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -5688,6 +5702,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6091,6 +6106,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6494,6 +6510,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6897,6 +6914,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -7300,6 +7318,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -7703,6 +7722,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout diff --git a/.github/workflows/generated-linux-binary-manywheel.yml b/.github/workflows/generated-linux-binary-manywheel.yml index a955984d7c7..5e65a54b54e 100644 --- a/.github/workflows/generated-linux-binary-manywheel.yml +++ b/.github/workflows/generated-linux-binary-manywheel.yml @@ -100,6 +100,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean 
PyTorch checkout @@ -490,6 +491,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -887,6 +889,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -1287,6 +1290,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -1687,6 +1691,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2087,6 +2092,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2479,6 +2485,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -2870,6 +2877,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -3260,6 +3268,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -3657,6 +3666,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4057,6 +4067,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4457,6 +4468,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -4857,6 +4869,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean 
PyTorch checkout @@ -5249,6 +5262,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -5640,6 +5654,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6030,6 +6045,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6427,6 +6443,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -6827,6 +6844,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -7227,6 +7245,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -7627,6 +7646,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -8019,6 +8039,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -8410,6 +8431,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -8800,6 +8822,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -9197,6 +9220,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -9597,6 +9621,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean 
PyTorch checkout @@ -9997,6 +10022,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -10397,6 +10423,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -10789,6 +10816,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index ee483708dfc..bd292201ab8 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -345,6 +346,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml index 181b6b3be13..7379f4b357e 100644 --- a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml @@ -98,6 +98,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -346,6 +347,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml index 5f37b48464b..314415b1ef1 100644 --- a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml +++ b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -338,6 +339,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-docs-push.yml b/.github/workflows/generated-linux-docs-push.yml index 0ad84fdef3e..2a089807147 100644 --- a/.github/workflows/generated-linux-docs-push.yml +++ b/.github/workflows/generated-linux-docs-push.yml @@ -98,6 +98,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -306,6 +307,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-docs.yml b/.github/workflows/generated-linux-docs.yml index 5709b1a7eef..e2c0edb80d9 100644 --- a/.github/workflows/generated-linux-docs.yml +++ b/.github/workflows/generated-linux-docs.yml @@ -98,6 +98,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -306,6 +307,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml index 58f8cc3d056..bdfe986a0f7 100644 --- a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml @@ -98,6 +98,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -346,6 +347,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml index e1dc026af70..214a69a8984 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git 
a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml index 7a51acf31e1..5a3d95f20d4 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-no-ops.yml @@ -96,6 +96,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml index 4dd594483b8..4a32d2662a9 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -345,6 +346,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml index df0dd5fb57f..65fb0acc559 100644 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml +++ b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-build.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml index 29a14fd9f41..37dc5901995 100644 --- a/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml +++ b/.github/workflows/generated-linux-xenial-py3-clang5-mobile-custom-build-static.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml index 5b538547df1..5c7c4d17db5 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml @@ -98,6 +98,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 
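The same one-line change recurs in every hunk of these generated workflow files: the checkout step gains a ref: whose GitHub Actions expression github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha selects the pull request's head commit on pull_request events and otherwise falls back to the commit that triggered the workflow. As a rough illustration only (the helper name and arguments below are hypothetical and not part of this patch), the selection logic mirrors this Python sketch:

    def resolve_checkout_ref(event_name: str, pr_head_sha: str, trigger_sha: str) -> str:
        # Mirrors the `A && B || C` idiom in the workflow expression:
        # on pull_request events check out the PR head commit,
        # otherwise check out the commit that triggered the run.
        return pr_head_sha if event_name == "pull_request" else trigger_sha
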
submodules: recursive @@ -346,6 +347,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml index 0005308beec..2512c071ab5 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml @@ -98,6 +98,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -346,6 +347,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml index 5778fe613db..3d0cb725a48 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -345,6 +346,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml index e9f11d265c7..74c9841ccf0 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc7-no-ops.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml index 1bb791a329b..f8bfb6cc763 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -345,6 +346,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 
'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-macos-10-15-py3-arm64.yml b/.github/workflows/generated-macos-10-15-py3-arm64.yml index ea97b3b9fac..9936eb34c56 100644 --- a/.github/workflows/generated-macos-10-15-py3-arm64.yml +++ b/.github/workflows/generated-macos-10-15-py3-arm64.yml @@ -42,6 +42,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml b/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml index c0745496769..6f4ac1c5c81 100644 --- a/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml +++ b/.github/workflows/generated-macos-10-15-py3-lite-interpreter-x86-64.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml index 41ae3259b52..8c0d6f884cd 100644 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ b/.github/workflows/generated-macos-11-py3-x86-64.yml @@ -44,6 +44,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -125,6 +126,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: false diff --git a/.github/workflows/generated-macos-arm64-binary-conda.yml b/.github/workflows/generated-macos-arm64-binary-conda.yml index 6c2dc4e5879..303c3607585 100644 --- a/.github/workflows/generated-macos-arm64-binary-conda.yml +++ b/.github/workflows/generated-macos-arm64-binary-conda.yml @@ -76,6 +76,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -86,6 +87,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -271,6 +273,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -281,6 +284,7 @@ jobs: - name: Checkout pytorch/builder uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -466,6 +470,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -476,6 +481,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-arm64-binary-wheel.yml b/.github/workflows/generated-macos-arm64-binary-wheel.yml index 1333425238e..52156e1e34b 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel.yml @@ -76,6 +76,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -86,6 +87,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -271,6 +273,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -281,6 +284,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -466,6 +470,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -476,6 +481,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -661,6 +667,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -671,6 +678,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-binary-conda.yml 
b/.github/workflows/generated-macos-binary-conda.yml index 3e43727760a..05450b6ec2f 100644 --- a/.github/workflows/generated-macos-binary-conda.yml +++ b/.github/workflows/generated-macos-binary-conda.yml @@ -74,6 +74,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -84,6 +85,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -269,6 +271,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -279,6 +282,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -464,6 +468,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -474,6 +479,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -659,6 +665,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -669,6 +676,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml index 4b03b779454..4403d9309d1 100644 --- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml +++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml @@ -79,6 +79,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -89,6 +90,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -280,6 +282,7 @@ jobs: - name: Checkout PyTorch uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -290,6 +293,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -481,6 +485,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -491,6 +496,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -682,6 +688,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -692,6 +699,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml index 2006f81d394..1c8692e7632 100644 --- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml +++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml @@ -79,6 +79,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -89,6 +90,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -280,6 +282,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -290,6 +293,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -481,6 +485,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -491,6 +496,7 @@ jobs: - name: 
Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -682,6 +688,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -692,6 +699,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-binary-wheel.yml b/.github/workflows/generated-macos-binary-wheel.yml index 0aa84d70d9e..8b36a0e1db0 100644 --- a/.github/workflows/generated-macos-binary-wheel.yml +++ b/.github/workflows/generated-macos-binary-wheel.yml @@ -74,6 +74,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -84,6 +85,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -269,6 +271,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -279,6 +282,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -464,6 +468,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -474,6 +479,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder @@ -659,6 +665,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive path: pytorch - name: Clean PyTorch checkout @@ -669,6 +676,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive repository: pytorch/builder path: builder diff --git 
a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml index d9182993f0c..3bfa5daa7f4 100644 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml @@ -96,6 +96,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -344,6 +345,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml index 0c2df124422..d8889821637 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7.yml @@ -96,6 +96,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml index 366395af1f2..aebecaf127f 100644 --- a/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml +++ b/.github/workflows/generated-periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7.yml @@ -96,6 +96,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml index 85e1ca4101b..1b3a4fad1d6 100644 --- a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml @@ -95,6 +95,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -343,6 +344,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml index 
3c9c3c1199a..7d0db219cc8 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -345,6 +346,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml index 2e325fca8ad..bf3144e1892 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml @@ -96,6 +96,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -344,6 +345,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 11d24eafb62..bb63145beda 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -60,6 +60,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -194,6 +195,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml index f89ea43911e..cfac6389895 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml @@ -60,6 +60,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -194,6 +195,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ 
github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml index 92da9655394..fb9486ba2a0 100644 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml +++ b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build.yml @@ -96,6 +96,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml index 95924b65d8a..b7bf245d48d 100644 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml +++ b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml index 7af766ba75a..074617996c2 100644 --- a/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml +++ b/.github/workflows/generated-pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single.yml @@ -97,6 +97,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 06db1e07c51..6d94c2f839c 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -61,6 +61,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive @@ -187,6 +188,7 @@ jobs: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} # deep clone, to allow use of git merge-base fetch-depth: 0 submodules: recursive diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 
8e84f9d5347..a7aa492a031 100644
--- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
+++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml
@@ -62,6 +62,7 @@ jobs:
     - name: Checkout PyTorch
       uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       with:
+        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
         # deep clone, to allow use of git merge-base
         fetch-depth: 0
         submodules: recursive
@@ -196,6 +197,7 @@ jobs:
     - name: Checkout PyTorch
       uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       with:
+        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
         # deep clone, to allow use of git merge-base
         fetch-depth: 0
         submodules: recursive

From bb736fac3338f1ab8f52b9ff16e6ce91eb20572c Mon Sep 17 00:00:00 2001
From: Michael Suo
Date: Wed, 16 Feb 2022 21:10:36 +0000
Subject: [PATCH 107/199] fix workflow lint

Fixing issue due to land race

Pull Request resolved: https://github.com/pytorch/pytorch/pull/72942
---
 .../generated-pytorch-xla-linux-bionic-py3.7-clang8.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml
index dceefb68815..3a49e0b9c39 100644
--- a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml
+++ b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml
@@ -100,6 +100,7 @@ jobs:
     - name: Checkout PyTorch
       uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       with:
+        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
         # deep clone, to allow use of git merge-base
         fetch-depth: 0
         submodules: recursive
@@ -311,6 +312,7 @@ jobs:
     - name: Checkout PyTorch
       uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9
       with:
+        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
         # deep clone, to allow use of git merge-base
         fetch-depth: 0
         submodules: recursive

From 0b117a3956a0ab1b1b0a2a677ce475095686f2e0 Mon Sep 17 00:00:00 2001
From: Nikita Shulga
Date: Wed, 16 Feb 2022 13:23:29 -0800
Subject: [PATCH 108/199] Revert D34245091: [pytorch][PR] Improve numerical stability of `torch.distributions.wishart.Wishart`

Test Plan: revert-hammer

Differential Revision: D34245091 (https://github.com/pytorch/pytorch/commit/5343cfe9491b3c489e423f2be55086ad2f8e40c5)

Original commit changeset: 1cd653c1d5c6

Original Phabricator Diff: D34245091 (https://github.com/pytorch/pytorch/commit/5343cfe9491b3c489e423f2be55086ad2f8e40c5)

fbshipit-source-id: 90975456c5290b162da493ba0ef0a35920c73857
(cherry picked from commit 452fab82172c79186d1e7b53bb00cc0d1d790c4e)
---
 test/distributions/test_distributions.py | 107 +++++++++--------------
 torch/distributions/exp_family.py        |   2 +-
 torch/distributions/wishart.py           |  53 +++++------
 3 files changed, 64 insertions(+), 98 deletions(-)

diff --git a/test/distributions/test_distributions.py b/test/distributions/test_distributions.py
index 3f3d0168b71..1855c8434be 100644
--- a/test/distributions/test_distributions.py
+++ b/test/distributions/test_distributions.py
@@ -34,7 +34,6 @@ import unittest
 from collections import namedtuple
 from itertools import product
 from random import shuffle
-from packaging import version
 
 import torch
 
@@ -2215,41 +2214,39 @@ class TestDistributions(TestCase):
     # We applied same tests in Multivariate Normal distribution for Wishart distribution
     def
test_wishart_shape(self): - ndim = 3 - - df = torch.rand(5, requires_grad=True) + ndim - df_no_batch = torch.rand([], requires_grad=True) + ndim - df_multi_batch = torch.rand(6, 5, requires_grad=True) + ndim + df = (torch.rand(5, requires_grad=True) + 1) * 10 + df_no_batch = (torch.rand([], requires_grad=True) + 1) * 10 + df_multi_batch = (torch.rand(6, 5, requires_grad=True) + 1) * 10 # construct PSD covariance - tmp = torch.randn(ndim, 10) + tmp = torch.randn(3, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() # construct batch of PSD covariances - tmp = torch.randn(6, 5, ndim, 10) + tmp = torch.randn(6, 5, 3, 10) cov_batched = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() prec_batched = cov_batched.inverse() scale_tril_batched = torch.linalg.cholesky(cov_batched) # ensure that sample, batch, event shapes all handled correctly - self.assertEqual(Wishart(df, cov).sample().size(), (5, ndim, ndim)) - self.assertEqual(Wishart(df_no_batch, cov).sample().size(), (ndim, ndim)) - self.assertEqual(Wishart(df_multi_batch, cov).sample().size(), (6, 5, ndim, ndim)) - self.assertEqual(Wishart(df, cov).sample((2,)).size(), (2, 5, ndim, ndim)) - self.assertEqual(Wishart(df_no_batch, cov).sample((2,)).size(), (2, ndim, ndim)) - self.assertEqual(Wishart(df_multi_batch, cov).sample((2,)).size(), (2, 6, 5, ndim, ndim)) - self.assertEqual(Wishart(df, cov).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) - self.assertEqual(Wishart(df_no_batch, cov).sample((2, 7)).size(), (2, 7, ndim, ndim)) - self.assertEqual(Wishart(df_multi_batch, cov).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) - self.assertEqual(Wishart(df, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) - self.assertEqual(Wishart(df_no_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) - self.assertEqual(Wishart(df_multi_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) - self.assertEqual(Wishart(df, precision_matrix=prec).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) - self.assertEqual(Wishart(df, precision_matrix=prec_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) - self.assertEqual(Wishart(df, scale_tril=scale_tril).sample((2, 7)).size(), (2, 7, 5, ndim, ndim)) - self.assertEqual(Wishart(df, scale_tril=scale_tril_batched).sample((2, 7)).size(), (2, 7, 6, 5, ndim, ndim)) + self.assertEqual(Wishart(df, cov).sample().size(), (5, 3, 3)) + self.assertEqual(Wishart(df_no_batch, cov).sample().size(), (3, 3)) + self.assertEqual(Wishart(df_multi_batch, cov).sample().size(), (6, 5, 3, 3)) + self.assertEqual(Wishart(df, cov).sample((2,)).size(), (2, 5, 3, 3)) + self.assertEqual(Wishart(df_no_batch, cov).sample((2,)).size(), (2, 3, 3)) + self.assertEqual(Wishart(df_multi_batch, cov).sample((2,)).size(), (2, 6, 5, 3, 3)) + self.assertEqual(Wishart(df, cov).sample((2, 7)).size(), (2, 7, 5, 3, 3)) + self.assertEqual(Wishart(df_no_batch, cov).sample((2, 7)).size(), (2, 7, 3, 3)) + self.assertEqual(Wishart(df_multi_batch, cov).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) + self.assertEqual(Wishart(df, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) + self.assertEqual(Wishart(df_no_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) + self.assertEqual(Wishart(df_multi_batch, cov_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) + self.assertEqual(Wishart(df, precision_matrix=prec).sample((2, 7)).size(), (2, 7, 5, 3, 3)) + self.assertEqual(Wishart(df, 
precision_matrix=prec_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) + self.assertEqual(Wishart(df, scale_tril=scale_tril).sample((2, 7)).size(), (2, 7, 5, 3, 3)) + self.assertEqual(Wishart(df, scale_tril=scale_tril_batched).sample((2, 7)).size(), (2, 7, 6, 5, 3, 3)) # check gradients # Modified and applied the same tests for multivariate_normal @@ -2275,18 +2272,14 @@ class TestDistributions(TestCase): wishart_log_prob_gradcheck(df_no_batch, None, None, scale_tril_batched) def test_wishart_stable_with_precision_matrix(self): - ndim = 10 - x = torch.randn(ndim) + x = torch.randn(10) P = torch.exp(-(x - x.unsqueeze(-1)) ** 2) # RBF kernel - Wishart(torch.tensor(ndim), precision_matrix=P) + Wishart(torch.tensor(10), precision_matrix=P) @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_wishart_log_prob(self): - ndim = 3 - df = torch.rand([], requires_grad=True) + ndim - 1 - if version.parse(scipy.__version__) < version.parse("1.7.0"): - df += 1. - tmp = torch.randn(ndim, 10) + df = (torch.rand([], requires_grad=True) + 1) * 10 + tmp = torch.randn(3, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() @@ -2298,7 +2291,7 @@ class TestDistributions(TestCase): dist3 = Wishart(df, scale_tril=scale_tril) ref_dist = scipy.stats.wishart(df.item(), cov.detach().numpy()) - x = dist1.sample((1000,)) + x = dist1.sample((10,)) expected = ref_dist.logpdf(x.transpose(0, 2).numpy()) self.assertEqual(0.0, np.mean((dist1.log_prob(x).detach().numpy() - expected)**2), atol=1e-3, rtol=0) @@ -2306,14 +2299,14 @@ class TestDistributions(TestCase): self.assertEqual(0.0, np.mean((dist3.log_prob(x).detach().numpy() - expected)**2), atol=1e-3, rtol=0) # Double-check that batched versions behave the same as unbatched - df = torch.rand(5, requires_grad=True) + ndim - 1 - tmp = torch.randn(5, ndim, 10) + df = (torch.rand(5, requires_grad=True) + 1) * 3 + tmp = torch.randn(5, 3, 10) cov = (tmp.unsqueeze(-2) * tmp.unsqueeze(-3)).mean(-1).requires_grad_() dist_batched = Wishart(df, cov) dist_unbatched = [Wishart(df[i], cov[i]) for i in range(df.size(0))] - x = dist_batched.sample((1000,)) + x = dist_batched.sample((10,)) batched_prob = dist_batched.log_prob(x) unbatched_prob = torch.stack([dist_unbatched[i].log_prob(x[:, i]) for i in range(5)]).t() @@ -2323,34 +2316,28 @@ class TestDistributions(TestCase): @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_wishart_sample(self): set_rng_seed(0) # see Note [Randomized statistical tests] - ndim = 3 - df = torch.rand([], requires_grad=True) + ndim - 1 - if version.parse(scipy.__version__) < version.parse("1.7.0"): - df += 1. 
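For orientation while reading these sampling tests, here is a minimal, self-contained sketch of the property they exercise (not part of the patch; the sample count and tolerance are arbitrary illustrative choices): for df greater than ndim - 1, samples of an ndim x ndim Wishart with scale Sigma have mean df * Sigma, matching the mean property defined in the wishart.py hunks further below.

    import torch
    from torch.distributions import Wishart

    ndim = 3
    df = torch.tensor(ndim + 2.0)            # df must exceed ndim - 1
    scale_tril = torch.eye(ndim)             # Sigma = scale_tril @ scale_tril.T = I
    d = Wishart(df=df, scale_tril=scale_tril)
    samples = d.sample((20000,))             # shape (20000, ndim, ndim), positive definite
    cov = scale_tril @ scale_tril.T
    # Empirical mean should approach df * Sigma as the sample count grows.
    print(torch.allclose(samples.mean(0), df * cov, atol=0.2))
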
- tmp = torch.randn(ndim, 10) + df = (torch.rand([], requires_grad=True) + 1) * 3 + tmp = torch.randn(3, 10) cov = (torch.matmul(tmp, tmp.t()) / tmp.size(-1)).requires_grad_() prec = cov.inverse().requires_grad_() scale_tril = torch.linalg.cholesky(cov).requires_grad_() - ref_dist = scipy.stats.wishart(df.item(), cov.detach().numpy()) - self._check_sampler_sampler(Wishart(df, cov), - ref_dist, + scipy.stats.wishart(df.item(), cov.detach().numpy()), 'Wishart(df={}, covariance_matrix={})'.format(df, cov), multivariate=True) self._check_sampler_sampler(Wishart(df, precision_matrix=prec), - ref_dist, + scipy.stats.wishart(df.item(), cov.detach().numpy()), 'Wishart(df={}, precision_matrix={})'.format(df, prec), multivariate=True) self._check_sampler_sampler(Wishart(df, scale_tril=scale_tril), - ref_dist, + scipy.stats.wishart(df.item(), cov.detach().numpy()), 'Wishart(df={}, scale_tril={})'.format(df, scale_tril), multivariate=True) def test_wishart_properties(self): - ndim = 5 - df = torch.rand([]) + ndim - 1 - scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(ndim, ndim)) + df = (torch.rand([]) + 1) * 5 + scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(5, 5)) m = Wishart(df=df, scale_tril=scale_tril) self.assertEqual(m.covariance_matrix, m.scale_tril.mm(m.scale_tril.t())) self.assertEqual(m.covariance_matrix.mm(m.precision_matrix), torch.eye(m.event_shape[0])) @@ -2358,15 +2345,14 @@ class TestDistributions(TestCase): def test_wishart_moments(self): set_rng_seed(0) # see Note [Randomized statistical tests] - ndim = 3 - df = torch.rand([]) + ndim - 1 - scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(ndim, ndim)) + df = (torch.rand([]) + 1) * 3 + scale_tril = transform_to(constraints.lower_cholesky)(torch.randn(3, 3)) d = Wishart(df=df, scale_tril=scale_tril) - samples = d.rsample((ndim * ndim * 100000,)) + samples = d.rsample((100000,)) empirical_mean = samples.mean(0) - self.assertEqual(d.mean, empirical_mean, atol=0.5, rtol=0) + self.assertEqual(d.mean, empirical_mean, atol=5, rtol=0) empirical_var = samples.var(0) - self.assertEqual(d.variance, empirical_var, atol=0.5, rtol=0) + self.assertEqual(d.variance, empirical_var, atol=5, rtol=0) def test_exponential(self): rate = torch.randn(5, 5).abs().requires_grad_() @@ -4631,15 +4617,8 @@ class TestAgainstScipy(TestCase): scipy.stats.weibull_min(c=positive_var2[0], scale=positive_var[0]) ), ( - # scipy var for Wishart only supports scalars - Wishart( - (20 if version.parse(scipy.__version__) < version.parse("1.7.0") else 19) + positive_var[0], - cov_tensor, - ), - scipy.stats.wishart( - (20 if version.parse(scipy.__version__) < version.parse("1.7.0") else 19) + positive_var[0].item(), - cov_tensor, - ), + Wishart(20 + positive_var[0], cov_tensor), # scipy var for Wishart only supports scalars + scipy.stats.wishart(20 + positive_var[0].item(), cov_tensor), ), ] diff --git a/torch/distributions/exp_family.py b/torch/distributions/exp_family.py index 7084714ee3d..669619d9db1 100644 --- a/torch/distributions/exp_family.py +++ b/torch/distributions/exp_family.py @@ -56,5 +56,5 @@ class ExponentialFamily(Distribution): gradients = torch.autograd.grad(lg_normal.sum(), nparams, create_graph=True) result += lg_normal for np, g in zip(nparams, gradients): - result -= (np * g).reshape(self._batch_shape + (-1,)).sum(-1) + result -= np * g return result diff --git a/torch/distributions/wishart.py b/torch/distributions/wishart.py index 04156915a4d..0dd431a0f7b 100644 --- a/torch/distributions/wishart.py +++ 
b/torch/distributions/wishart.py @@ -20,10 +20,6 @@ def _mvdigamma(x: torch.Tensor, p: int) -> torch.Tensor: - torch.arange(p, dtype=x.dtype, device=x.device).div(2).expand(x.shape + (-1,)) ).sum(-1) -def _clamp_with_eps(x: torch.Tensor) -> torch.Tensor: - # We assume positive input for this function - return x.clamp(min=torch.finfo(x.dtype).eps) - class Wishart(ExponentialFamily): r""" Creates a Wishart distribution parameterized by a symmetric positive definite matrix :math:`\Sigma`, @@ -31,9 +27,8 @@ class Wishart(ExponentialFamily): Example: >>> m = Wishart(torch.eye(2), torch.Tensor([2])) - >>> m.sample() # Wishart distributed with mean=`df * I` and - # variance(x_ij)=`df` for i != j and variance(x_ij)=`2 * df` for i == j - + >>> m.sample() #Wishart distributed with mean=`df * I` and + #variance(x_ij)=`df` for i != j and variance(x_ij)=`2 * df` for i == j Args: covariance_matrix (Tensor): positive-definite covariance matrix precision_matrix (Tensor): positive-definite precision matrix @@ -61,7 +56,6 @@ class Wishart(ExponentialFamily): } support = constraints.positive_definite has_rsample = True - _mean_carrier_measure = 0 def __init__(self, df: Union[torch.Tensor, Number], @@ -86,7 +80,7 @@ class Wishart(ExponentialFamily): event_shape = param.shape[-2:] if self.df.le(event_shape[-1] - 1).any(): - raise ValueError(f"Value of df={df} expected to be greater than ndim - 1 = {event_shape[-1]-1}.") + raise ValueError(f"Value of df={df} expected to be greater than ndim={event_shape[-1]-1}.") if scale_tril is not None: self.scale_tril = param.expand(batch_shape + (-1, -1)) @@ -125,8 +119,9 @@ class Wishart(ExponentialFamily): new = self._get_checked_instance(Wishart, _instance) batch_shape = torch.Size(batch_shape) cov_shape = batch_shape + self.event_shape + df_shape = batch_shape new._unbroadcasted_scale_tril = self._unbroadcasted_scale_tril.expand(cov_shape) - new.df = self.df.expand(batch_shape) + new.df = self.df.expand(df_shape) new._batch_dims = [-(x + 1) for x in range(len(batch_shape))] @@ -177,25 +172,22 @@ class Wishart(ExponentialFamily): @property def mean(self): - return self.df.view(self._batch_shape + (1, 1)) * self.covariance_matrix + return self.df.view(self._batch_shape + (1, 1,)) * self.covariance_matrix @property def variance(self): V = self.covariance_matrix # has shape (batch_shape x event_shape) diag_V = V.diagonal(dim1=-2, dim2=-1) - return self.df.view(self._batch_shape + (1, 1)) * (V.pow(2) + torch.einsum("...i,...j->...ij", diag_V, diag_V)) + return self.df.view(self._batch_shape + (1, 1,)) * (V.pow(2) + torch.einsum("...i,...j->...ij", diag_V, diag_V)) def _bartlett_sampling(self, sample_shape=torch.Size()): p = self._event_shape[-1] # has singleton shape # Implemented Sampling using Bartlett decomposition - noise = _clamp_with_eps( - self._dist_chi2.rsample(sample_shape).sqrt() - ).diag_embed(dim1=-2, dim2=-1) - + noise = self._dist_chi2.rsample(sample_shape).sqrt().diag_embed(dim1=-2, dim2=-1) i, j = torch.tril_indices(p, p, offset=-1) noise[..., i, j] = torch.randn( - torch.Size(sample_shape) + self._batch_shape + (int(0.5 * p * (p - 1)),), + torch.Size(sample_shape) + self._batch_shape + (int(p * (p - 1) / 2),), dtype=noise.dtype, device=noise.device, ) @@ -258,11 +250,11 @@ class Wishart(ExponentialFamily): nu = self.df # has shape (batch_shape) p = self._event_shape[-1] # has singleton shape return ( - - 0.5 * nu * p * _log_2 + - nu * p * _log_2 / 2 - nu * self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1) - - torch.mvlgamma(0.5 * nu, 
p=p) - + 0.5 * (nu - p - 1) * torch.linalg.slogdet(value).logabsdet - - 0.5 * torch.cholesky_solve(value, self._unbroadcasted_scale_tril).diagonal(dim1=-2, dim2=-1).sum(dim=-1) + - torch.mvlgamma(nu / 2, p=p) + + (nu - p - 1) / 2 * torch.linalg.slogdet(value).logabsdet + - torch.cholesky_solve(value, self._unbroadcasted_scale_tril).diagonal(dim1=-2, dim2=-1).sum(dim=-1) / 2 ) def entropy(self): @@ -271,24 +263,19 @@ class Wishart(ExponentialFamily): V = self.covariance_matrix # has shape (batch_shape x event_shape) return ( (p + 1) * self._unbroadcasted_scale_tril.diagonal(dim1=-2, dim2=-1).log().sum(-1) - + 0.5 * p * (p + 1) * _log_2 - + torch.mvlgamma(0.5 * nu, p=p) - - 0.5 * (nu - p - 1) * _mvdigamma(0.5 * nu, p=p) - + 0.5 * nu * p + + p * (p + 1) * _log_2 / 2 + + torch.mvlgamma(nu / 2, p=p) + - (nu - p - 1) / 2 * _mvdigamma(nu / 2, p=p) + + nu * p / 2 ) @property def _natural_params(self): - nu = self.df # has shape (batch_shape) - p = self._event_shape[-1] # has singleton shape return ( + 0.5 * self.df, - 0.5 * self.precision_matrix, - 0.5 * (nu - p - 1), ) def _log_normalizer(self, x, y): - p = self._event_shape[-1] - return ( - (y + 0.5 * (p + 1)) * (- torch.linalg.slogdet(-2 * x).logabsdet + _log_2 * p) - + torch.mvlgamma(y + 0.5 * (p + 1), p=p) - ) + p = y.shape[-1] + return x * (- torch.linalg.slogdet(-2 * y).logabsdet + _log_2 * p) + _mvdigamma(x, p=p) From 486572223b80ce279c2007898d7789efcd89ddd3 Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Wed, 16 Feb 2022 13:37:13 -0800 Subject: [PATCH 109/199] Fix command example (#72847) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72847 Reviewed By: malfet Differential Revision: D34260868 Pulled By: kit1980 fbshipit-source-id: 1b225f3c2c7a822e44df4bbd91766e6533eab6d7 (cherry picked from commit c9e874c4d81a8f9ceee820e243c47b47a4361320) --- benchmarks/operator_benchmark/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/operator_benchmark/README.md b/benchmarks/operator_benchmark/README.md index 9efa4a8c22b..59918f6fab3 100644 --- a/benchmarks/operator_benchmark/README.md +++ b/benchmarks/operator_benchmark/README.md @@ -136,7 +136,7 @@ $ python -m benchmark_all_test --list_tests Filter and run an operator (use add as an example): ``` -$ python -m benchmark_all_test --operator add --omp_num_threads 1 --mkl_num_threads 1 +$ python -m benchmark_all_test --operators add --omp_num_threads 1 --mkl_num_threads 1 ``` Note: this filter is based on the operator name rather than the file name. 
From e6fd28fb0560bccc267fc206a9b5052c255a4b34 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Wed, 16 Feb 2022 14:16:22 -0800 Subject: [PATCH 110/199] Revert D34126542: [Qunat] Add ConvTranspose reference module Test Plan: revert-hammer Differential Revision: D34126542 (https://github.com/pytorch/pytorch/commit/7a031ec17f5cfc052e618694d4bee28381a6051d) Original commit changeset: 7da167695a1f Original Phabricator Diff: D34126542 (https://github.com/pytorch/pytorch/commit/7a031ec17f5cfc052e618694d4bee28381a6051d) fbshipit-source-id: 14e40884807b9908017ae30af83a8dea23ff1f0f (cherry picked from commit f99a7f5a697b5ed411b061e3849dc29105126742) --- .../eager/test_quantize_eager_ptq.py | 126 ------------- torch/ao/quantization/quantize.py | 12 +- .../quantized/_reference/modules/__init__.py | 5 +- torch/nn/quantized/_reference/modules/conv.py | 169 +----------------- 4 files changed, 4 insertions(+), 308 deletions(-) diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py index f022072406f..6587740bdf9 100644 --- a/test/quantization/eager/test_quantize_eager_ptq.py +++ b/test/quantization/eager/test_quantize_eager_ptq.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn import torch.nn.quantized as nnq -import torch.nn.quantized._reference as nnqr from torch.nn.utils.rnn import PackedSequence from torch.ao.quantization import ( quantize, @@ -75,131 +74,6 @@ import unittest import numpy as np class TestQuantizeEagerOps(QuantizationTestCase): - def _test_reference_module_impl(self, - float_module_class, - quantized_module_class, - extra_module_kwargs, - input_size): - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = float_module_class(**extra_module_kwargs) - self.quant = QuantStub() - self.dequant = DeQuantStub() - - def forward(self, x): - x = self.quant(x) - x = self.conv(x) - x = self.dequant(x) - return x - - class RefM(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = float_module_class(**extra_module_kwargs) - self.quant1 = QuantStub() - self.dequant1 = DeQuantStub() - self.quant2 = QuantStub() - self.dequant2 = DeQuantStub() - - def forward(self, x): - x = self.quant1(x) - x = self.dequant1(x) - x = self.conv(x) - x = self.quant2(x) - x = self.dequant2(x) - return x - - - data = torch.randn(*input_size, dtype=torch.float) - original_m = M() - original_ref_m = RefM() - torch.quantization.engine = 'qnnpack' - - original_ref_m.conv.weight = torch.nn.Parameter(original_m.conv.weight.detach()) - original_ref_m.conv.bias = torch.nn.Parameter(original_m.conv.bias.detach()) - - original_m.qconfig = torch.quantization.get_default_qconfig('qnnpack') - - m = prepare(original_m) - # calibration - m(data) - - m = convert(m) - # check if the module is properly quantized - self.assertEqual(type(m.quant), nnq.Quantize) - self.assertEqual(type(m.conv), quantized_module_class) - self.assertEqual(type(m.dequant), nnq.DeQuantize) - res = m(data) - - # quantize the reference model - original_ref_m.eval() - original_ref_m.qconfig = torch.quantization.get_default_qconfig('qnnpack') - - ref_m = prepare(original_ref_m) - ref_m(data) - reference_module_mapping = { - QuantStub: nnq.Quantize, - DeQuantStub: nnq.DeQuantize, - nn.Conv1d: nnqr.Conv1d, - nn.Conv2d: nnqr.Conv2d, - nn.Conv3d: nnqr.Conv3d, - nn.ConvTranspose1d: nnqr.ConvTranspose1d, - nn.ConvTranspose2d: nnqr.ConvTranspose2d, - nn.ConvTranspose3d: nnqr.ConvTranspose3d, - } - ref_m = convert(ref_m, mapping=reference_module_mapping) - 
ref_res = ref_m(data) - self.assertEqual(res, ref_res) - - def test_conv_1d(self): - self._test_reference_module_impl( - nn.Conv1d, - nnq.Conv1d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 1) - ) - - def test_conv_2d(self): - self._test_reference_module_impl( - nn.Conv2d, - nnq.Conv2d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 10, 10) - ) - - def test_conv_3d(self): - self._test_reference_module_impl( - nn.Conv3d, - nnq.Conv3d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 10, 10, 10) - ) - - def test_conv_transpose_1d(self): - self._test_reference_module_impl( - nn.ConvTranspose1d, - nnq.ConvTranspose1d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 1) - ) - - def test_conv_transpose_2d(self): - self._test_reference_module_impl( - nn.ConvTranspose2d, - nnq.ConvTranspose2d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 10, 10) - ) - - def test_conv_transpose_3d(self): - self._test_reference_module_impl( - nn.ConvTranspose3d, - nnq.ConvTranspose3d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 10, 10, 10) - ) - def _test_activation_op_impl( self, float_module_class, quantized_module_class, extra_module_kwargs): """ Implementation for testing common activation ops like leaky relu diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py index fad2b8abe6e..5afff09b64b 100644 --- a/torch/ao/quantization/quantize.py +++ b/torch/ao/quantization/quantize.py @@ -16,7 +16,7 @@ from torch.ao.quantization.quantization_mappings import ( _has_special_act_post_process, _get_special_act_post_process, ) -from .utils import get_qparam_dict + from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper from torch.ao.quantization.qconfig import ( add_module_to_qconfig_obs_ctr, @@ -565,15 +565,7 @@ def swap_module(mod, mapping, custom_module_class_mapping): new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) swapped = True elif type(mod) in mapping: - qmod = mapping[type(mod)] - if hasattr(qmod, '_IS_REFERENCE') and qmod._IS_REFERENCE: - assert mod.qconfig is not None - weight_post_process = mod.qconfig.weight() - weight_post_process(mod.weight) - weight_qparams = get_qparam_dict(weight_post_process) - new_mod = qmod.from_float(mod, weight_qparams) - else: - new_mod = qmod.from_float(mod) + new_mod = mapping[type(mod)].from_float(mod) swapped = True if swapped: diff --git a/torch/nn/quantized/_reference/modules/__init__.py b/torch/nn/quantized/_reference/modules/__init__.py index efbefdbde60..441852c38f9 100644 --- a/torch/nn/quantized/_reference/modules/__init__.py +++ b/torch/nn/quantized/_reference/modules/__init__.py @@ -1,12 +1,9 @@ from .linear import Linear -from .conv import Conv1d, Conv2d, Conv3d, ConvTranspose1d, ConvTranspose2d, ConvTranspose3d +from .conv import Conv1d, Conv2d, Conv3d __all__ = [ 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', - 'ConvTranspose1d', - 'ConvTranspose2d', - 'ConvTranspose3d', ] diff --git a/torch/nn/quantized/_reference/modules/conv.py b/torch/nn/quantized/_reference/modules/conv.py index 60aed0a91ac..ed151cb7f5e 100644 --- a/torch/nn/quantized/_reference/modules/conv.py +++ b/torch/nn/quantized/_reference/modules/conv.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from typing import Optional, Dict, Any, List +from typing import Optional, Dict, Any from torch.nn.common_types import _size_1_t from .utils import _quantize_weight, 
_quantize_and_dequantize_weight from .utils import _save_weight_qparams @@ -14,7 +14,6 @@ class _ConvNd(torch.nn.modules.conv._ConvNd): this is useful when user want to use this module in other backends like Glow. """ __annotations__ = {"bias": Optional[torch.Tensor]} - _IS_REFERENCE = True def _save_to_state_dict(self, destination, prefix, keep_vars): super()._save_to_state_dict(destination, prefix, keep_vars) @@ -218,169 +217,3 @@ class Conv3d(_ConvNd, nn.Conv3d): @classmethod def from_float(cls, float_conv, weight_qparams): return _ConvNd.from_float(cls, float_conv, weight_qparams) - -class _ConvTransposeNd(_ConvNd, torch.nn.modules.conv._ConvTransposeNd): - """ A reference version of nn.quantized.ConvTranspose2d - we will not pack the parameters in this module, since weight packing is an - optimization for quantized backends supported in PyTorch (fbgemm/qnnpack), - this is useful when user want to use this module in other backends like Glow. - """ - @staticmethod - def from_float(cls, float_conv, weight_qparams): - qref_conv = cls( - float_conv.in_channels, - float_conv.out_channels, - float_conv.kernel_size, # type: ignore[arg-type] - float_conv.stride, # type: ignore[arg-type] - float_conv.padding, # type: ignore[arg-type] - float_conv.output_padding, # type: ignore[arg-type] - float_conv.groups, - float_conv.bias is not None, # type: ignore[arg-type] - float_conv.dilation, # type: ignore[arg-type] - float_conv.padding_mode, - device=float_conv.weight.device, - dtype=float_conv.weight.dtype, - weight_qparams=weight_qparams) - qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach()) - if float_conv.bias is not None: - qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach()) - return qref_conv - - -class ConvTranspose1d(_ConvTransposeNd, nn.ConvTranspose1d): - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: _size_1_t, - stride: _size_1_t = 1, - padding: _size_1_t = 0, - output_padding: _size_1_t = 0, - groups: int = 1, - bias: bool = True, - dilation: _size_1_t = 1, - padding_mode: str = "zeros", - device=None, - dtype=None, - weight_qparams: Optional[Dict[str, Any]] = None): - nn.ConvTranspose1d.__init__( - self, in_channels, out_channels, kernel_size, stride, padding, output_padding, - groups, bias, dilation, padding_mode, device, dtype) - self._init_weight_qparams(weight_qparams, device) - - def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: - """ - we have: - w(float) -- quant - dequant \ - x(float) ------------- F.convTranspose1d --- - In the full model, we will see - w(float) -- quant - *dequant \ - x -- quant --- *dequant -- *F.convTranspose1d --- *quant - dequant - and the backend should be able to fuse the ops with `*` into a quantized conv1d - """ - - assert isinstance(self.padding, tuple) - # One cannot replace List by Tuple or Sequence in "_output_padding" because - # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
- output_padding = self._output_padding( - input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] - - weight_dequant = self.get_weight() - result = F.conv_transpose1d( - x, weight_dequant, self.bias, self.stride, - self.padding, output_padding, self.groups, self.dilation) - return result - - def _get_name(self): - return "QuantizedConvTranspose1d(Reference)" - - @classmethod - def from_float(cls, float_conv, weight_qparams): - return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) - -class ConvTranspose2d(_ConvTransposeNd, nn.ConvTranspose2d): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, output_padding=0, - groups=1, bias=True, dilation=1, - padding_mode='zeros', - device=None, - dtype=None, - weight_qparams: Optional[Dict[str, Any]] = None): - - nn.ConvTranspose2d.__init__( - self, in_channels, out_channels, kernel_size, stride, padding, output_padding, - groups, bias, dilation, padding_mode, device, dtype) - self._init_weight_qparams(weight_qparams, device) - - def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: - """ - we have: - w(float) -- quant - dequant \ - x(float) ------------- F.convTranspose2d --- - In the full model, we will see - w(float) -- quant - *dequant \ - x -- quant --- *dequant -- *F.convTranspose2d --- *quant - dequant - and the backend should be able to fuse the ops with `*` into a quantized conv2d - """ - assert isinstance(self.padding, tuple) - # One cannot replace List by Tuple or Sequence in "_output_padding" because - # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. - - output_padding = self._output_padding( - input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] - - weight_dequant = self.get_weight() - result = F.conv_transpose2d( - x, weight_dequant, self.bias, self.stride, - self.padding, output_padding, self.groups, self.dilation) - - return result - - def _get_name(self): - return "QuantizedConvTranspose2d(Reference)" - - @classmethod - def from_float(cls, float_conv, weight_qparams): - return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) - -class ConvTranspose3d(_ConvTransposeNd, nn.ConvTranspose3d): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, output_padding=0, - groups=1, bias=True, dilation=1, - padding_mode="zeros", - device=None, - dtype=None, - weight_qparams: Optional[Dict[str, Any]] = None): - nn.ConvTranspose3d.__init__( - self, in_channels, out_channels, kernel_size, stride, padding, output_padding, - groups, bias, dilation, padding_mode, device, dtype) - self._init_weight_qparams(weight_qparams, device) - - def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: - """ - we have: - w(float) -- quant - dequant \ - x(float) ------------- F.convTranspose3d --- - In the full model, we will see - w(float) -- quant - *dequant \ - x -- quant --- *dequant -- *F.convTranspose3d --- *quant - dequant - and the backend should be able to fuse the ops with `*` into a quantized conv3d - """ - - assert isinstance(self.padding, tuple) - # One cannot replace List by Tuple or Sequence in "_output_padding" because - # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
- output_padding = self._output_padding( - input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] - - weight_dequant = self.get_weight() - result = F.conv_transpose3d( - x, weight_dequant, self.bias, self.stride, - self.padding, output_padding, self.groups, self.dilation) - return result - - def _get_name(self): - return "QuantizedConvTranspose3d(Reference)" - - @classmethod - def from_float(cls, float_conv, weight_qparams): - return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) From 3d8b6d336119a6ffc401a90f50f21ba4079272a6 Mon Sep 17 00:00:00 2001 From: ganler Date: Wed, 16 Feb 2022 22:28:08 +0000 Subject: [PATCH 111/199] fix: onnx PReLU unidirectional broadcasting Fixes https://github.com/pytorch/pytorch/issues/70570 Pull Request resolved: https://github.com/pytorch/pytorch/pull/70571 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 4 ++++ torch/onnx/symbolic_opset9.py | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index c71a9756408..283968612a0 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -6102,6 +6102,10 @@ class TestONNXRuntime(unittest.TestCase): dynamic_axes={"x": [1, 2]}, test_with_inputs=[y]) + def test_prelu_scalar(self): + x = torch.scalar_tensor(1.) + self.run_test(torch.nn.PReLU(), x, input_names=["x"]) + def test_relu6(self): class Relu6Model(torch.nn.Module): def __init__(self): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index acdde766120..b73ee60b87e 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -694,10 +694,22 @@ def squeeze(g, self, dim=None): def prelu(g, self, weight): self_rank = sym_help._get_tensor_rank(self) - if self_rank is not None and self_rank > 2: - weight = sym_help._unsqueeze_helper(g, weight, list(range(1, self_rank - 1))) - return g.op("PRelu", self, weight) + if self_rank is not None: + if self_rank > 2: + # make weight unidirectional broadcastable + weight = sym_help._unsqueeze_helper(g, weight, list(range(1, self_rank - 1))) + elif self_rank == 0: + # weight is always rank 1. torch allows scalar self, and ONNX is ambiguous + # about whether this is allowed, but some implementations enforce + # rank(self) >= rank(weight), which makes sense. 
+ self = sym_help._unsqueeze_helper(g, self, [0]) + self_rank = 1 + weight_rank = sym_help._get_tensor_rank(weight) + if self_rank is not None and weight_rank is not None: + assert self_rank >= weight_rank, \ + "rank(x) should be >= rank(slope) but got {} < {}".format(self_rank, weight_rank) + return g.op("PRelu", self, weight) def silu(g, input): return g.op("Mul", input, g.op("Sigmoid", input)) From 5cf2228405f20cd73ec40c188c8851c9897777db Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Wed, 16 Feb 2022 14:17:02 -0800 Subject: [PATCH 112/199] ci: Add documentation for github actions (#72943) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72943 Adds some documentation about our github actions setup and examples on how to add workflows / regenerate workflows Signed-off-by: Eli Uriegas Test Plan: Imported from OSS Reviewed By: kit1980 Differential Revision: D34283475 Pulled By: seemethere fbshipit-source-id: a4ac9711c19aaf9312361f46db681d4457ab790c (cherry picked from commit f352bba8e95539d4a49aa207d2816d8bf9a86f50) --- .github/README.md | 78 ++++++++++++++++++++++++++++++++++++++++ .github/requirements.txt | 1 + 2 files changed, 79 insertions(+) create mode 100644 .github/README.md create mode 100644 .github/requirements.txt diff --git a/.github/README.md b/.github/README.md new file mode 100644 index 00000000000..7ae4dcbaad0 --- /dev/null +++ b/.github/README.md @@ -0,0 +1,78 @@ +# pytorch/.github + +This directory contains workflows and scripts to support our CI infrastructure that runs on Github Actions. + + +## Workflows / Templates + +Our current Github Actions setup uses templates written in [Jinja](https://jinja.palletsprojects.com/en/3.0.x/) that are located in the +`.github/templates` directory to generate workflow files found in the `.github/workflows/` directory. + +These templates contain a couple of utility templates used to discern common utilities that can be +used amongst different templates. + +### (Re)Generating workflow files + +You will need `jinja2` in order to regenerate the workflow files which can be installed using: +```bash +pip install -r .github/requirements.txt +``` + +Workflows can be generated / regenerated using the following command: +```bash +.github/regenerate.sh +``` + +### Adding a new generated workflow + +New generated workflows can be added in the `.github/scripts/generate_ci_workflows.py` script. You can reference +examples from that script in order to add the workflow to the stream that is relevant to what you particularly +care about. + +Different parameters can be used to acheive different goals, i.e. running jobs on a cron, running only on trunk, etc. + +#### ciflow (specific) + +ciflow is the way we can get `non-default` workflows to run on specific PRs. Within the `generate_ci_workflows.py` script +you will notice a multitude of `LABEL_CIFLOW_` variables which correspond to labels on Github. 
Workflows that +do not run on ``LABEL_CIFLOW_DEFAULT` can be triggered on PRs by applying the label found in `generate_ci_workflows.py` + +Example: +```python + CIWorkflow( + arch="linux", + build_environment="periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", + docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", + test_runner_type=LINUX_CUDA_TEST_RUNNER, + num_test_shards=2, + distributed_test=False, + timeout_after=360, + # Only run this on master 4 times per day since it does take a while + is_scheduled="0 */4 * * *", + ciflow_config=CIFlowConfig( + labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_SLOW_GRADCHECK, LABEL_CIFLOW_SLOW, LABEL_CIFLOW_SCHEDULED}, + ), + ), +``` + +This workflow does not get triggered by default since it does not contain the `LABEL_CIFLOW_DEFAULT` label in its CIFlowConfig but applying +the `LABEL_CIFLOW_SLOW_GRADCHECK` on your PR will trigger this specific workflow to run. + +#### ciflow (trunk) + +The label `ciflow/trunk` can be used to run `trunk` only workflows. This is especially useful if trying to re-land a PR that was +reverted for failing a `non-default` workflow. + +## Infra + +Currently most of our self hosted runners are hosted on AWS, for a comprehensive list of available runner types you +can reference `.github/scale-config.yml`. + +Exceptions to AWS for self hosted: +* ROCM runners + +### Adding new runner types + +New runner types can be added by committing changes to `.github/scale-config.yml`. Example: https://github.com/pytorch/pytorch/pull/70474 + +> NOTE: New runner types can only be used once the changes to `.github/scale-config.yml` have made their way into the default branch diff --git a/.github/requirements.txt b/.github/requirements.txt new file mode 100644 index 00000000000..7f7afbf3bf5 --- /dev/null +++ b/.github/requirements.txt @@ -0,0 +1 @@ +jinja2 From dbac0f5cdc561755bf20ff82a3facc70327170f9 Mon Sep 17 00:00:00 2001 From: Gary Miguel Date: Wed, 16 Feb 2022 14:53:24 -0800 Subject: [PATCH 113/199] Update persons of interest for ONNX (#72072) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72072 Reviewed By: H-Huang Differential Revision: D34230534 Pulled By: malfet fbshipit-source-id: ed5abdfacf0d9628c6cc99957fa578d71a79d025 (cherry picked from commit 4669c346c44ea04e46a7457eed6d79501c5ba19d) --- docs/source/community/persons_of_interest.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/community/persons_of_interest.rst b/docs/source/community/persons_of_interest.rst index b1d4954a657..906d5685984 100644 --- a/docs/source/community/persons_of_interest.rst +++ b/docs/source/community/persons_of_interest.rst @@ -149,13 +149,13 @@ C10 utils and operator dispatch - Dmytro Dzhulgakov (`dzhulgakov `__) - (emeritus) Sebastian Messmer (`smessmer `__) -ONNX <-> PyTorch -~~~~~~~~~~~~~~~~ -- Negin Raoof (`neginraoof `__) -- Gary Miguel (`garymm `__) +PyTorch -> ONNX +~~~~~~~~~~~~~~~ - Bowen Bao (`BowenBao `__) -- (emeritus) Lu Fang (`houseroad `__) +- Gary Miguel (`garymm `__) - (emeritus) Lara Haidar (`lara-hdr `__) +- (emeritus) Lu Fang (`houseroad `__) +- (emeritus) Negin Raoof (`neginraoof `__) - (emeritus) Spandan Tiwari (`spandantiwari `__) Mobile / Edge From bbac8c9c4816c87eae17a04b60d4103940c08f83 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 16 Feb 2022 14:55:10 -0800 Subject: [PATCH 114/199] [ONNX] List of files to consider for mergebot onnx rule (#72297) Summary: Based on past PRs, here is an 
non-exhaustive list of files to consider for extension. The PR is not meant to be final. Based on feedback and discussion, files could be dropped from the list, or PR could be updated to move code around such that extension is no longer needed. List of files below and description: * These files are for converting from IR to ONNX proto. These should be used only for ONNX. ``` "torch/csrc/jit/serialization/export.*", "torch/csrc/jit/serialization/onnx.*", ``` * This file is touched whenever pass signature is updated. ``` "torch/_C/__init__.pyi.in", ``` * These files are touched whenever pass signature is updated. Somehow it's been convention that onnx passes are also added here, but it could be possible to move them. Let me know what you think. ~~"torch/csrc/jit/python/init.cpp",~~ ~~"torch/csrc/jit/python/script_init.cpp",~~ Update: Bowen will move onnx passes to files under onnx folder. * ~~Touched when need new attr::xxx, or onnx::xxx.~~ ~~"aten/src/ATen/core/interned_strings.h"~~ Update: Nikita will help separate this file. malfet Pull Request resolved: https://github.com/pytorch/pytorch/pull/72297 Reviewed By: H-Huang Differential Revision: D34254666 Pulled By: malfet fbshipit-source-id: 032cfa590cbedf4648b7335fe8f09a2380ab14cb (cherry picked from commit 88653eadbf5b6dfe1f84acec8f1c3256a49f2f68) --- .github/merge_rules.json | 11 +- torch/csrc/jit/python/init.cpp | 135 ----------------------- torch/csrc/jit/python/script_init.cpp | 5 - torch/csrc/onnx/init.cpp | 147 ++++++++++++++++++++++++++ 4 files changed, 157 insertions(+), 141 deletions(-) diff --git a/.github/merge_rules.json b/.github/merge_rules.json index fdac34d185a..6df1db142f5 100644 --- a/.github/merge_rules.json +++ b/.github/merge_rules.json @@ -1,7 +1,16 @@ [ { "name": "ONNX exporter", - "patterns": ["torch/onnx/**", "torch/csrc/jit/passes/onnx/**", "torch/csrc/jit/passes/onnx.*", "test/onnx/**", "docs/source/onnx.rst"], + "patterns": [ + "torch/onnx/**", + "torch/csrc/jit/passes/onnx/**", + "torch/csrc/jit/passes/onnx.*", + "test/onnx/**", + "docs/source/onnx.rst", + "torch/csrc/jit/serialization/export.*", + "torch/csrc/jit/serialization/onnx.*", + "torch/_C/__init__.pyi.in" + ], "approved_by": ["BowenBao", "garymm"], "mandatory_app_id": 12274 }, diff --git a/torch/csrc/jit/python/init.cpp b/torch/csrc/jit/python/init.cpp index 1f3a26446e3..8ff0e34a21b 100644 --- a/torch/csrc/jit/python/init.cpp +++ b/torch/csrc/jit/python/init.cpp @@ -45,25 +45,6 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include @@ -166,17 +147,6 @@ void initJITBindings(PyObject* module) { .def( "_jit_debug_fuser_num_cached_kernel_specs", torch::jit::fuser::debugNumCachedKernelSpecs) - .def("_jit_pass_onnx_remove_print", RemovePrintOps) - .def("_jit_pass_onnx_preprocess_caffe2", PreprocessCaffe2Ops) - .def("_jit_pass_onnx", ToONNX) - .def( - "_jit_pass_onnx_assign_output_shape", - [](std::shared_ptr& graph, - const std::vector& tensors, - const python::IODescriptor& desc, - bool onnx_shape_inference = false) { - ONNXAssignOutputShape(graph, tensors, desc, onnx_shape_inference); - }) .def("_jit_pass_lower_all_tuples", LowerAllTuples) .def( "_new_symbolic_shape_symbol", @@ -205,7 +175,6 @@ void initJITBindings(PyObject* module) { .def( "_jit_pass_propagate_shapes_on_graph_and_build_compute", PropagateShapesAndBuildLargeShapeComputeGraph) - 
.def("_jit_pass_onnx_function_substitution", ONNXFunctionCallSubstitution) .def("_jit_pass_integer_value_refinement", RefineIntegerValues) .def( "_jit_set_symbolic_shapes_test_mode", @@ -213,88 +182,8 @@ void initJITBindings(PyObject* module) { .def( "_jit_symbolic_shapes_test_mode_enabled", &symbolicShapeAnalysisTestModeEnabled) - .def( - "_jit_pass_onnx_peephole", - [](std::shared_ptr& graph, - int opset_version, - bool fixed_batch_size) { - return PeepholeOptimizeONNX(graph, opset_version, fixed_batch_size); - }) - .def("_jit_pass_onnx_preprocess", PreprocessForONNX) - .def( - "_jit_pass_onnx_deduplicate_initializers", - [](std::shared_ptr& graph, - std::map params_dict, - bool is_train) { - DeduplicateInitializers(graph, params_dict, is_train); - return params_dict; - }, - pybind11::return_value_policy::move) - .def( - "_jit_pass_onnx_eval_peephole", - [](std::shared_ptr& graph, - std::map& paramsDict) { - EvalPeepholeONNX(graph, paramsDict); - return paramsDict; - }, - pybind11::return_value_policy::move) - .def( - "_jit_pass_onnx_cast_all_constant_to_floating", - CastAllConstantToFloating) - .def( - "_jit_pass_onnx_constant_fold", - [](std::shared_ptr& graph, - std::map& paramsDict, - int opset_version) { - ConstantFoldONNX( - graph, - paramsDict, - opset_version); // overload resolution - return paramsDict; - }, - pybind11::return_value_policy::move) - .def( - "_jit_pass_onnx_eliminate_unused_items", - [](std::shared_ptr& graph, - std::map& paramsDict) { - EliminateUnusedItemsONNX( - graph->block(), - paramsDict); // overload resolution - return paramsDict; - }, - pybind11::return_value_policy::move) - .def( - "_jit_pass_onnx_scalar_type_analysis", - [](std::shared_ptr& graph, - bool lowprecision_cast, - int opset_version) { - return ScalarTypeAnalysisForONNX( - graph, lowprecision_cast, opset_version); - }, - py::arg("graph"), - py::arg("lowprecision_cast") = true, - py::arg("opset_version")) - .def( - "_jit_pass_onnx_remove_inplace_ops_for_onnx", RemoveInplaceOpsForONNX) - .def( - "_jit_pass_onnx_node_shape_type_inference", - [](Node* n, - std::map& params_dict, - int opset_version) { - ONNXShapeTypeInference(n, params_dict, opset_version); - }) - .def( - "_jit_pass_onnx_graph_shape_type_inference", - [](std::shared_ptr& graph, - std::map& params_dict, - int opset_version) { - ONNXShapeTypeInference(graph, params_dict, opset_version); - }) - .def("_jit_pass_onnx_set_dynamic_input_shape", ONNXSetDynamicInputShape) .def("_jit_pass_autocast", Autocast) .def("_jit_set_autocast_mode", &setAutocastMode) - .def("_jit_pass_onnx_lint", ONNXLintGraph) - .def("_jit_pass_onnx_function_extraction", onnx::ONNXFunctionExtraction) .def("_jit_pass_fuse", FuseGraph) .def( "_jit_pass_replace_old_ops_with_upgraders", @@ -378,9 +267,6 @@ void initJITBindings(PyObject* module) { .def( "_jit_pass_fold_convbn", [](Module& module) { return FoldConvBatchNorm(module); }) - .def( - "_jit_onnx_list_model_parameters", - [](Module& module) { return list_module_parameters(module); }) .def( "_freeze_module", [](Module& module, @@ -598,7 +484,6 @@ void initJITBindings(PyObject* module) { .def("_jit_pass_erase_number_types", EraseNumberTypes) .def("_jit_pass_inline_fork_wait", InlineForkWait) .def("_jit_pass_inline", Inline) - .def("_jit_pass_prepare_division_for_onnx", PrepareDivisionForONNX) .def( "_jit_pass_lower_graph", [](std::shared_ptr& graph, const Module& self) { @@ -681,10 +566,6 @@ void initJITBindings(PyObject* module) { return py::reinterpret_steal( python::unflatten(vars, desc)); }) - 
.def("_jit_pass_onnx_block", BlockToONNX) - .def( - "_jit_onnx_convert_pattern_from_subblock", ConvertPatternFromSubblock) - .def("_jit_pass_fixup_onnx_controlflow_node", FixupONNXControlflowNode) .def("_jit_pass_canonicalize_graph_fuser_ops", CanonicalizeOps) .def("_jit_pass_decompose_ops", DecomposeOps) .def("_jit_pass_specialize_autogradzero", specializeAutogradZero) @@ -1030,22 +911,6 @@ void initJITBindings(PyObject* module) { std::vector& preserved_methods) { return metalOptimizeForMobile(module, preserved_methods); }) - .def( - "_jit_pass_onnx_unpack_quantized_weights", - [](std::shared_ptr& graph, - std::map& paramsDict) { - UnpackQuantizedWeights(graph, paramsDict); - return paramsDict; - }, - pybind11::return_value_policy::move) - .def( - "_jit_pass_onnx_quantization_insert_permutes", - [](std::shared_ptr& graph, - std::map& paramsDict) { - insertPermutes(graph, paramsDict); - return paramsDict; - }, - pybind11::return_value_policy::move) .def( "_jit_pass_filter_non_tensor_arguments", [](std::map params) { diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index 0811cd33985..f73496722af 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -2211,10 +2210,6 @@ void initJitScriptBindings(PyObject* module) { logging::LoggerBase, std::shared_ptr>(m, "NoopLogger") .def(py::init<>()); - m.def( - "_check_onnx_proto", - [](const std::string& proto_string) { check_onnx_proto(proto_string); }, - py::arg("proto_string")); m.def("_jit_is_script_object", [](const py::object& obj) { return py::isinstance(obj); }); diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index 47e9625cefc..c7f33f20d26 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -2,11 +2,158 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace torch { namespace onnx { + +using namespace torch::jit; + void initONNXBindings(PyObject* module) { auto m = py::handle(module).cast(); + + // ONNX specific passes + m.def("_jit_pass_onnx_remove_print", RemovePrintOps) + .def("_jit_pass_onnx_preprocess_caffe2", PreprocessCaffe2Ops) + .def("_jit_pass_onnx", ToONNX) + .def( + "_jit_pass_onnx_assign_output_shape", + [](std::shared_ptr& graph, + const std::vector& tensors, + const python::IODescriptor& desc, + bool onnx_shape_inference = false) { + ONNXAssignOutputShape(graph, tensors, desc, onnx_shape_inference); + }) + .def("_jit_pass_onnx_function_substitution", ONNXFunctionCallSubstitution) + .def( + "_jit_pass_onnx_peephole", + [](std::shared_ptr& graph, + int opset_version, + bool fixed_batch_size) { + return PeepholeOptimizeONNX(graph, opset_version, fixed_batch_size); + }) + .def("_jit_pass_onnx_preprocess", PreprocessForONNX) + .def( + "_jit_pass_onnx_eval_peephole", + [](std::shared_ptr& graph, + std::map& paramsDict) { + EvalPeepholeONNX(graph, paramsDict); + return paramsDict; + }, + pybind11::return_value_policy::move) + .def( + "_jit_pass_onnx_cast_all_constant_to_floating", + CastAllConstantToFloating) + .def( + "_jit_pass_onnx_constant_fold", + [](std::shared_ptr& graph, + std::map& paramsDict, + int opset_version) { + ConstantFoldONNX( + graph, + paramsDict, + opset_version); // overload resolution + return paramsDict; + }, + 
pybind11::return_value_policy::move) + .def( + "_jit_pass_onnx_eliminate_unused_items", + [](std::shared_ptr& graph, + std::map& paramsDict) { + EliminateUnusedItemsONNX( + graph->block(), + paramsDict); // overload resolution + return paramsDict; + }, + pybind11::return_value_policy::move) + .def( + "_jit_pass_onnx_scalar_type_analysis", + [](std::shared_ptr& graph, + bool lowprecision_cast, + int opset_version) { + return ScalarTypeAnalysisForONNX( + graph, lowprecision_cast, opset_version); + }, + py::arg("graph"), + py::arg("lowprecision_cast") = true, + py::arg("opset_version")) + .def( + "_jit_pass_onnx_remove_inplace_ops_for_onnx", RemoveInplaceOpsForONNX) + .def( + "_jit_pass_onnx_node_shape_type_inference", + [](Node* n, + std::map& params_dict, + int opset_version) { + ONNXShapeTypeInference(n, params_dict, opset_version); + }) + .def( + "_jit_pass_onnx_graph_shape_type_inference", + [](std::shared_ptr& graph, + std::map& params_dict, + int opset_version) { + ONNXShapeTypeInference(graph, params_dict, opset_version); + }) + .def("_jit_pass_onnx_set_dynamic_input_shape", ONNXSetDynamicInputShape) + .def("_jit_pass_onnx_lint", ONNXLintGraph) + .def("_jit_pass_onnx_function_extraction", torch::jit::onnx::ONNXFunctionExtraction) + .def("_jit_pass_onnx_block", BlockToONNX) + .def( + "_jit_pass_onnx_unpack_quantized_weights", + [](std::shared_ptr& graph, + std::map& paramsDict) { + UnpackQuantizedWeights(graph, paramsDict); + return paramsDict; + }, + pybind11::return_value_policy::move) + .def( + "_jit_pass_onnx_quantization_insert_permutes", + [](std::shared_ptr& graph, + std::map& paramsDict) { + insertPermutes(graph, paramsDict); + return paramsDict; + }, + pybind11::return_value_policy::move) + .def( + "_jit_onnx_list_model_parameters", + [](Module& module) { return list_module_parameters(module); }) + .def("_jit_pass_prepare_division_for_onnx", PrepareDivisionForONNX) + .def( + "_jit_onnx_convert_pattern_from_subblock", ConvertPatternFromSubblock) + .def("_jit_pass_fixup_onnx_controlflow_node", FixupONNXControlflowNode) + .def( + "_jit_pass_onnx_deduplicate_initializers", + [](std::shared_ptr& graph, + std::map params_dict, + bool is_train) { + DeduplicateInitializers(graph, params_dict, is_train); + return params_dict; + }, + pybind11::return_value_policy::move); + + m.def( + "_check_onnx_proto", + [](const std::string& proto_string) { check_onnx_proto(proto_string); }, + py::arg("proto_string")); + auto onnx = m.def_submodule("_onnx"); py::enum_<::ONNX_NAMESPACE::TensorProto_DataType>(onnx, "TensorProtoDataType") .value("UNDEFINED", ::ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) From 352eeb2ef92f2d98d15792d15e6f8067945d6b85 Mon Sep 17 00:00:00 2001 From: lkct Date: Wed, 16 Feb 2022 15:02:21 -0800 Subject: [PATCH 115/199] doc fix `nn.Module`: docstring should come after class variable (#72912) Summary: Fixes https://github.com/pytorch/pytorch/issues/72862 Pull Request resolved: https://github.com/pytorch/pytorch/pull/72912 Reviewed By: cpuhrsch Differential Revision: D34286017 Pulled By: jbschlosser fbshipit-source-id: d172f7600e7f66c30187996ee42c72bf273643cc (cherry picked from commit d9f9b5b4180fb1554eaca675a20661b979da2234) --- torch/nn/modules/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 05c1ce462ad..d712f251f32 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -237,6 +237,7 @@ class Module: dump_patches: bool = False + _version: int = 1 r"""This allows 
better BC support for :meth:`load_state_dict`. In :meth:`state_dict`, the version number will be saved as in the attribute `_metadata` of the returned state dict, and thus pickled. `_metadata` is a @@ -247,7 +248,6 @@ class Module: be bumped, and the module's `_load_from_state_dict` method can compare the version number and do appropriate changes if the state dict is from before the change.""" - _version: int = 1 training: bool _is_full_backward_hook: Optional[bool] From 6a2624f7c487ce540ba58599c81b00649539f4f2 Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Wed, 16 Feb 2022 15:15:29 -0800 Subject: [PATCH 116/199] ci: Add credentials to upload_test_statistics Adds credentials to macOS workflows to upload_test_statistics when needed Signed-off-by: Eli Uriegas Pull Request resolved: https://github.com/pytorch/pytorch/pull/72955 --- .github/templates/common.yml.j2 | 6 +++++- .github/templates/macos_ci_workflow.yml.j2 | 2 +- .github/workflows/generated-macos-11-py3-x86-64.yml | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index e8c92296c6a..123d498363f 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -43,7 +43,7 @@ concurrency: run: .github/scripts/parse_ref.py {%- endmacro -%} -{%- macro upload_test_statistics(build_environment, when="always()", pytorch_directory="") -%} +{%- macro upload_test_statistics(build_environment, when="always()", pytorch_directory="", needs_credentials=False) -%} - name: Display and upload test statistics (Click Me) {%- if pytorch_directory %} working-directory: !{{ pytorch_directory }} @@ -59,6 +59,10 @@ concurrency: SHA1: ${{ github.event.pull_request.head.sha || github.sha }} TAG: ${{ steps.parse-ref.outputs.tag }} WORKFLOW_ID: '${{ github.run_id }}' +{%- if needs_credentials %} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_SECRET_ACCESS_KEY }} +{%- endif %} shell: bash run: | python3 -m pip install -r requirements.txt diff --git a/.github/templates/macos_ci_workflow.yml.j2 b/.github/templates/macos_ci_workflow.yml.j2 index f8b0d4cc30e..bce1f88b6fe 100644 --- a/.github/templates/macos_ci_workflow.yml.j2 +++ b/.github/templates/macos_ci_workflow.yml.j2 @@ -147,7 +147,7 @@ jobs: !{{ common.render_test_results() }} !{{ common.upload_downloaded_files(name='macos', artifact_name="test-jsons", use_s3=False) }} !{{ common.upload_test_reports("macos", artifact_name="test-reports", use_s3=False) }} - !{{ common.upload_test_statistics(build_environment) }} + !{{ common.upload_test_statistics(build_environment, needs_credentials=True) }} {% endblock +%} {%- endif %} diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml index 8c0d6f884cd..75501c91948 100644 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ b/.github/workflows/generated-macos-11-py3-x86-64.yml @@ -218,6 +218,8 @@ jobs: SHA1: ${{ github.event.pull_request.head.sha || github.sha }} TAG: ${{ steps.parse-ref.outputs.tag }} WORKFLOW_ID: '${{ github.run_id }}' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_SECRET_ACCESS_KEY }} shell: bash run: | python3 -m pip install -r requirements.txt From 2f222fc88cbfb682cd5e12d377f564d7fef6652d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Wed, 16 Feb 2022 17:55:58 -0800 Subject: [PATCH 117/199] Mild refactor of 
native_functions.yaml dispatch parsing (#66109) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/66109 This refactor is no longer necessary for ufunc codegen, as I changed the format of ufuncs to not directly be inserted into the 'dispatch' key, but I think the refactored code here is better. The basic concept is to directly construct BackendMetadata as we are parsing entries of the dispatch dictionary, rather than post facto creating them later. This centralizes the compute and means that the creation of the backend index is just a simple reindexing by operator name (nothing nontrivial). Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Reviewed By: bdhirsh Differential Revision: D31385760 Pulled By: ezyang fbshipit-source-id: 4fcb491ba025d2aa6fd356586b57affb97a507fc (cherry picked from commit 21c93d41996120697f81168650b4f4b999d6902a) --- tools/codegen/model.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 6bc0d7df100..a5ae3a363d0 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -355,20 +355,27 @@ class NativeFunction: raw_dispatch = e.pop('dispatch', None) assert raw_dispatch is None or isinstance(raw_dispatch, dict), e - dispatch: Dict[DispatchKey, str] = {} + dispatch: Dict[DispatchKey, BackendMetadata] = {} if raw_dispatch is not None: assert not manual_kernel_registration, \ "cannot specify both manual_kernel_registration and dispatch; with " \ "manual registration, dispatch has no effect!" + redundant_composite_implicit_autograd = False for ks, v in raw_dispatch.items(): if ks == '__line__': continue # not worth tracking line numbers for dispatch entries assert isinstance(ks, str), e - assert isinstance(v, str), e for k in ks.split(","): dispatch_key = DispatchKey.parse(k.strip()) - dispatch[dispatch_key] = v - assert dispatch != {DispatchKey.CompositeImplicitAutograd: cpp.name(func)}, \ + # Why is 'structured' included? External backends (e.g. + # XLA) opt into which ops are structured independently + # of which in-tree ops are structured + dispatch[dispatch_key] = BackendMetadata( + v, structured=structured and is_structured_dispatch_key(dispatch_key)) + if dispatch_key is DispatchKey.CompositeImplicitAutograd and v == cpp.name(func): + redundant_composite_implicit_autograd = True + + assert not (len(dispatch) == 1 and redundant_composite_implicit_autograd), \ "unnecessary dispatch table for this function; just delete the dispatch " \ "key entirely" # if a function is a structured delegate, deleting the dispatch @@ -378,7 +385,7 @@ class NativeFunction: f"but got {dispatch[DispatchKey.CompositeImplicitAutograd]}. 
Rename your implementation to the expected " \ "name, then delete the dispatch table" elif not structured and structured_delegate is None: - dispatch[DispatchKey.CompositeImplicitAutograd] = cpp.name(func) + dispatch[DispatchKey.CompositeImplicitAutograd] = BackendMetadata(cpp.name(func), structured=False) assert not (DispatchKey.CompositeExplicitAutograd in dispatch and DispatchKey.CompositeImplicitAutograd in dispatch), \ "cannot specify both CompositeExplicitAutograd and CompositeImplicitAutograd on a single kernel; each " \ @@ -394,12 +401,11 @@ class NativeFunction: has_composite_implicit_autograd_kernel = DispatchKey.CompositeImplicitAutograd in dispatch.keys() has_composite_explicit_autograd_kernel = DispatchKey.CompositeExplicitAutograd in dispatch.keys() - # BackendMetadata is used to store any information about a NativeFunction that is backend dependent. - # The most obvious information is the kernel name, which usually contains the name of the backend in it for cpu/cuda. - # Why is 'structured' included? External backends (e.g. XLA) opt into which ops are structured - # independently of which in-tree ops are structured - backend_metadata = {k: {func.name: BackendMetadata( - kernel=v, structured=structured and is_structured_dispatch_key(k))} for k, v in dispatch.items()} + # We aren't going to store dispatch metadata inline in NativeFunctions; + # instead it is separately indexed by backend (so other backends can + # add more dispatch entries after the fact). Reindex the individual + # metadata by OperatorName! + backend_metadata = {k: {func.name: v} for k, v in dispatch.items()} # don't care if it exists or not; make it easier to use this function # with other yaml parsers that aren't setting __line__ in the dict From e2c1533c7b92e224dde8d620c031cd7885aa11b2 Mon Sep 17 00:00:00 2001 From: David Dang Date: Wed, 16 Feb 2022 18:21:59 -0800 Subject: [PATCH 118/199] [quant][core][gpu][eager] Improved quantized conv operator in cudnn (#72770) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72770 This PR improves upon PR70622 by removing the call to_make_per_tensor_quantized_tensor and directly creating a quantized int8 tensor that is passed into raw_cudnn_convolution_forward as opposed to a non-quantized int8 tensor. 
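To make the allocation change concrete, here is a rough Python-level sketch of the before/after behavior. It is illustrative only — the actual change lives in the C++ code in aten/src/ATen/native/quantized/cudnn/Conv.cpp, and the output shape and quantization parameters below are made up:

```python
import torch

# Hypothetical output shape and quantization parameters, purely for illustration.
shape, scale, zero_point = (1, 4, 8, 8), 0.1, 0

# Before: allocate a plain int8 buffer, let the cudnn kernel write into it,
# then wrap the result into a quantized tensor after the fact.
int8_out = torch.empty(shape, dtype=torch.int8, memory_format=torch.channels_last)
# ... cudnn convolution writes requantized int8 values into int8_out ...
q_before = torch._make_per_tensor_quantized_tensor(int8_out, scale, zero_point)

# After: allocate the quantized output up front and let the kernel write directly
# into its int8 storage, so no extra wrapping step is needed.
q_after = torch._empty_affine_quantized(
    shape, scale=scale, zero_point=zero_point,
    dtype=torch.qint8, memory_format=torch.channels_last)
```

Writing straight into the quantized tensor's storage avoids materializing a second tensor and keeps the requantized values where the caller expects them.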
Test Plan: Imported from OSS Reviewed By: H-Huang Differential Revision: D34243926 Pulled By: dzdang fbshipit-source-id: 7725db27d0a276e8108086fecb7ecb18aa227102 (cherry picked from commit e20e99c7b979c0db60562a744a546592a20befa0) --- aten/src/ATen/native/quantized/cudnn/Conv.cpp | 65 +++++++++++-------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/aten/src/ATen/native/quantized/cudnn/Conv.cpp b/aten/src/ATen/native/quantized/cudnn/Conv.cpp index bae4b9e2cb9..01d2c4fe8b4 100644 --- a/aten/src/ATen/native/quantized/cudnn/Conv.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Conv.cpp @@ -45,6 +45,16 @@ cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, int64_t id, uint8_t .build(); } +cudnn_frontend::Tensor getTensorDescriptor(const IntArrayRef& shape, const IntArrayRef& strides, cudnnDataType_t cudnn_dtype, int64_t id, uint8_t alignment) { + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(cudnn_dtype) + .build(); +} + // TODO: there is a table from input dtype and weight dtype to operator dtype, // we can derive the operator dtype based on input dtype cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation) { @@ -166,8 +176,9 @@ at::SmallVector MakeConvOutputShape<2>( return {N, M, Y_H, Y_W}; } +// the parameter quantized_output is a quantized tensor void raw_cudnn_convolution_forward_out( - const Tensor& output, + const Tensor& quantized_output, const Tensor& input, const Tensor& weight, IntArrayRef padding, @@ -180,15 +191,15 @@ void raw_cudnn_convolution_forward_out( float requantize_multiplier ) { TORCH_CHECK(!benchmark, "not supported yet"); - if (output.numel() == 0) { + if (quantized_output.numel() == 0) { return; } - Tensor conv_output = at::empty_like(output, output.options().dtype(at::kFloat)); - Tensor requantize_multiplier_tensor = at::empty_like(output, output.options().dtype(at::kFloat)); + Tensor conv_output = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), at::MemoryFormat::ChannelsLast); + // TODO: compile empty & fill_ using full_like or full + Tensor requantize_multiplier_tensor = at::empty(quantized_output.sizes(), at::device(at::kCUDA).dtype(at::kFloat), at::MemoryFormat::ChannelsLast); requantize_multiplier_tensor.fill_(requantize_multiplier); cudnnHandle_t handle = getCudnnHandle(); - CacheKey key; setConvolutionParams(&key.params, input, weight, padding, stride, dilation, groups, deterministic, allow_tf32); // operator datatype needs to be int32 for int8 convolution, but we can @@ -201,7 +212,10 @@ void raw_cudnn_convolution_forward_out( auto run = [&](cudnn_frontend::ManagedOpaqueDescriptor plan_desc) { auto workspace_size = 0; auto workspace = at::empty({workspace_size}, input.options().dtype(kByte)); - void *data_ptrs[] = {reinterpret_cast(input.data_ptr()), conv_output.data_ptr(), reinterpret_cast(weight.data_ptr()), requantize_multiplier_tensor.data_ptr(), output.data_ptr()}; + void *data_ptrs[] = {reinterpret_cast(input.data_ptr()), conv_output.data_ptr(), + reinterpret_cast(weight.data_ptr()), + requantize_multiplier_tensor.data_ptr(), + reinterpret_cast(quantized_output.data_ptr())}; // std::cout << plan.describe() << " requires workspace " << workspace_size << std::endl; int64_t uids[] = {'x', 'y', 'w', 's', 'r'}; auto variantPack = cudnn_frontend::VariantPackBuilder() @@ -232,7 +246,7 @@ void 
raw_cudnn_convolution_forward_out( auto requant_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) .setxDesc(conv_op.getOutputTensor()) .setbDesc(getTensorDescriptor(requantize_multiplier_tensor, 's', getAlignment(requantize_multiplier_tensor))) - .setyDesc(getTensorDescriptor(output, 'r', getAlignment(output))) + .setyDesc(getTensorDescriptor(quantized_output.sizes(), quantized_output.strides(), CUDNN_DATA_INT8, 'r', getAlignment(quantized_output))) .setpwDesc(getPointWiseMulDescriptor(getCudnnDataType(requantize_multiplier_tensor))) .build(); // std::cout << "operator:" << requant_op.describe() << std::endl; @@ -273,6 +287,7 @@ void raw_cudnn_convolution_forward_out( return; } catch (cudnn_frontend::cudnnException &e) {std::cout << "cudnn error:" << e.what() << std::endl;} catch(CuDNNError &e) { std::cout << "other error" << e.what() << std::endl;} } + TORCH_CHECK(false, "Unable to find an engine to execute this computation"); } @@ -291,31 +306,33 @@ Tensor raw_cudnn_convolution_forward( bool benchmark, bool deterministic, bool allow_tf32, - float requantize_multiplier) { + float requantize_multiplier, + double output_scale, + int64_t output_zero_point) { // TODO: add dimension validations for input/weight/bias const int N = act.size(0); - const int C = act.size(1); const int D = kSpatialDim == 3 ? act.size(2) : 1; const int H = act.size(kSpatialDim); const int W = act.size(kSpatialDim + 1); const int M = weight.size(0); // output channels std::vector kernel_size = {weight.size(2), weight.size(3)}; - at::SmallVector output_shape; - output_shape = MakeConvOutputShape(N, M, {H, W}, kernel_size, stride, padding, dilation); - Tensor output_int8 = at::empty( + at::SmallVector output_shape{MakeConvOutputShape(N, M, {H, W}, + kernel_size, stride, padding, dilation)}; + Tensor quantized_output = at::_empty_affine_quantized( output_shape, - at::device(at::kCUDA).dtype(at::kChar), - at::MemoryFormat::ChannelsLast - ); - + at::device(at::kCUDA).dtype(ScalarType::QInt8), + output_scale, + output_zero_point, + at::MemoryFormat::ChannelsLast); raw_cudnn_convolution_forward_out( - output_int8, act, weight, + quantized_output, act, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, requantize_multiplier); - return output_int8; + + return quantized_output; } @@ -343,20 +360,16 @@ class QConvInt8 final { auto requantize_multiplier = act_scale * weight_scale / output_scale; // TODO: check all zero_points are zero/all tensors are symmetrically quantized - Tensor output_int8_requant = raw_cudnn_convolution_forward( + return raw_cudnn_convolution_forward( act.int_repr(), weight.int_repr(), IntArrayRef(padding.vec()), IntArrayRef(stride.vec()), IntArrayRef(dilation.vec()), groups, false /* benchmark */, true /* deterministic */, false /* allow_tf32 */, - requantize_multiplier + requantize_multiplier, + output_scale, + output_zero_point ); - - // clamping is done in cudnn kernels, which probably defaults to -128, 127 - // for int8 dtype, we may need to add new operators to the graph if - // we want to change the clamping - Tensor quantized_output = at::_make_per_tensor_quantized_tensor(output_int8_requant, output_scale, output_zero_point); - return quantized_output; } }; From 81fbeea760e4308a940b9ad934b78d57a500484d Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Wed, 16 Feb 2022 18:23:08 -0800 Subject: [PATCH 119/199] Add docstrings to native_channel_shuffle (#72919) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72919 
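As a quick illustration of the op being documented here (not part of the patch, and it assumes a build where both `torch.nn.functional.channel_shuffle` and the newly documented `native_channel_shuffle` are exposed), the two entry points should agree:

```python
import torch
import torch.nn.functional as F

# Shuffle 4 channels in 2 groups; both entry points should produce the same result.
x = torch.arange(16.0).reshape(1, 4, 2, 2)
assert torch.equal(F.native_channel_shuffle(x, 2), F.channel_shuffle(x, 2))
```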
Test Plan: Imported from OSS Reviewed By: bdhirsh Differential Revision: D34274717 Pulled By: VitalyFedyunin fbshipit-source-id: fa42f91ef2335e2594b19ef65d914c711f7a94fd (cherry picked from commit a6f6fe9112aa36d711f240f720615be09e2e13de) --- test/test_fx.py | 1 + tools/pyi/gen_pyi.py | 1 + torch/nn/functional.py | 44 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/test/test_fx.py b/test/test_fx.py index 9e86755eb8a..f70ba52a2bd 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -3538,6 +3538,7 @@ class TestFunctionalTracing(JitTestCase): "bilinear": BUILT_IN_FUNC, "celu_": BUILT_IN_FUNC, "channel_shuffle": BUILT_IN_FUNC, + "native_channel_shuffle": BUILT_IN_FUNC, "conv1d": BUILT_IN_FUNC, "conv2d": BUILT_IN_FUNC, "conv3d": BUILT_IN_FUNC, diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index 1edd8d32a7a..73cc5fb2cbd 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -214,6 +214,7 @@ def gen_nn_functional(fm: FileManager) -> None: 'pixel_shuffle', 'pixel_unshuffle', 'channel_shuffle', + 'native_channel_shuffle', 'pdist', 'cosine_similarity', ] diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 1e4472fe16c..1c6dd93685b 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -3591,6 +3591,50 @@ Examples:: """, ) +native_channel_shuffle = _add_docstr( + torch.native_channel_shuffle, + r""" +native_channel_shuffle(input, groups) -> Tensor + +Native kernel level implementation of the `channel_shuffle`. +This function might become private in future releases, use with caution. + +Divide the channels in a tensor of shape :math:`(*, C , H, W)` +into g groups and rearrange them as :math:`(*, C \frac g, g, H, W)`, +while keeping the original tensor shape. + +See :class:`~torch.nn.ChannelShuffle` for details. + +Args: + input (Tensor): the input tensor + groups (int): number of groups to divide channels in and rearrange. 
+ +Examples:: + + >>> input = torch.randn(1, 4, 2, 2) + >>> print(input) + [[[[1, 2], + [3, 4]], + [[5, 6], + [7, 8]], + [[9, 10], + [11, 12]], + [[13, 14], + [15, 16]], + ]] + >>> output = torch.nn.functional.native_channel_shuffle(input, 2) + >>> print(output) + [[[[1, 2], + [3, 4]], + [[9, 10], + [11, 12]], + [[5, 6], + [7, 8]], + [[13, 14], + [15, 16]], + ]] +""", +) @_overload # noqa: F811 def upsample(input: Tensor, size: Optional[int] = None, scale_factor: Optional[float] = None, mode: str = "nearest", align_corners: Optional[bool] = None) -> Tensor: # noqa: F811 From b5f2574f36dec86a71ff39d0e07e16d39178abde Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 16 Feb 2022 18:25:35 -0800 Subject: [PATCH 120/199] no longer coalesce sparse COO tensors before comparison (#69751) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/69751 cc nikitaved pearu cpuhrsch IvanYashchuk Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D34262453 Pulled By: ezyang fbshipit-source-id: e2e62d2aa03fc569d2951c880960b256f5dc4aaa (cherry picked from commit cb6b0ef7198c5252c51a8fec1c19e3c17b33cc87) --- test/distributed/test_c10d_gloo.py | 2 +- test/distributed/test_data_parallel.py | 2 +- test/test_autograd.py | 9 ++++++++- test/test_cuda.py | 2 +- test/test_sparse.py | 19 +++++++++---------- torch/testing/_internal/common_utils.py | 9 --------- 6 files changed, 20 insertions(+), 23 deletions(-) diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py index 0594aae287f..9cd515fb05c 100644 --- a/test/distributed/test_c10d_gloo.py +++ b/test/distributed/test_c10d_gloo.py @@ -1757,7 +1757,7 @@ class DistributedDataParallelTest( # Check that the gradients are sparse and identical vanilla_parameter = next(vanilla_model.parameters()) ddp_parameter = next(ddp_model.parameters()) - self.assertEqual(vanilla_parameter.grad, ddp_parameter.grad) + self.assertEqual(vanilla_parameter.grad.coalesce(), ddp_parameter.grad.coalesce()) @requires_gloo() @skip_if_lt_x_gpu(2) diff --git a/test/distributed/test_data_parallel.py b/test/distributed/test_data_parallel.py index 92ce8ccc56e..c1720344e49 100644 --- a/test/distributed/test_data_parallel.py +++ b/test/distributed/test_data_parallel.py @@ -383,7 +383,7 @@ class TestDataParallel(TestCase): self.assertEqual(out.get_device(), dev_id[0]) self.assertEqual(out, expected_out) for expected, param in zip(expected_grads, l.parameters()): - self.assertEqual(param.grad, expected) + self.assertEqual(param.grad.coalesce(), expected.coalesce()) # Check for None device_ids l = l.cuda() diff --git a/test/test_autograd.py b/test/test_autograd.py index a7b8097997c..1d4ef2ce384 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -6289,7 +6289,14 @@ for shape in [(1,), ()]: if y.is_sparse: y = y.to_dense() y.sum().backward() - self.assertEqual(2 * a, a.grad) + + actual = 2 * a + expected = a.grad + if a.is_sparse: + actual = actual.coalesce() + expected = expected.coalesce() + + self.assertEqual(actual, expected) for cuda in [False] + ([True] if torch.cuda.is_available() else []): for pin_memory in [True, False]: diff --git a/test/test_cuda.py b/test/test_cuda.py index c0ea06a0e19..c6fd1a6628d 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -64,7 +64,7 @@ def make_sparse_tensor(t, n, *sizes): torch.cat([torch.LongTensor(1, n).random_(s) for s in sizes], 0)) v = tensor._values() v = v.new(n).copy_(torch.randn(n)) - return t(i, v, torch.Size(sizes)) + return t(i, v, torch.Size(sizes)).coalesce() 
_cycles_per_ms = None diff --git a/test/test_sparse.py b/test/test_sparse.py index cbc98f572bd..17f86f3f0a6 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -416,7 +416,7 @@ class TestSparse(TestCase): a_coalesced = a.coalesce() self.assertTrue(a_coalesced.is_coalesced()) self.assertEqual(torch.tensor(12.3 * 2, dtype=dtype, device=device), a.to_dense()) - self.assertEqual(a, a.to_dense().to_sparse()) + self.assertEqual(a.coalesce(), a.coalesce().to_dense().to_sparse()) # tensor without value a = self.sparse_empty((), dtype=dtype, device=device) @@ -1598,7 +1598,6 @@ class TestSparse(TestCase): z = x1.coalesce() self.assertEqual(x1.is_coalesced(), coalesced) self.assertTrue(y.is_coalesced()) - self.assertEqual(x1, y) y._values().add_(1) if not x1.is_coalesced(): # check that coalesce is out of place if the original tensor is not @@ -1698,7 +1697,7 @@ class TestSparse(TestCase): exp_v = torch.tensor([7, 14, 3, 20], dtype=dtype, device=device) res = dense.sparse_mask(x) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4]), dtype=dtype, device=device) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) i = self.index_tensor([ [1, 3, 0, 4], @@ -1710,7 +1709,7 @@ class TestSparse(TestCase): exp_v = torch.empty([4, 0], dtype=dtype, device=device) res = dense.sparse_mask(x) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 0]), dtype=dtype, device=device) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) _test_sparse_mask_fixed() @@ -1746,7 +1745,7 @@ class TestSparse(TestCase): res = dense.sparse_mask(x) exp_v = torch.tensor([[7, 9], [14, 1], [3, 3], [20, 1]]) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 2])) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) i = self.index_tensor([ [1, 3, 0, 4], @@ -1758,7 +1757,7 @@ class TestSparse(TestCase): res = dense.sparse_mask(x) exp_v = torch.empty(4, 2, 0) expected = self.sparse_tensor(i, exp_v, torch.Size([5, 4, 2, 0])) - self.assertEqual(res, expected) + self.assertEqual(res.coalesce(), expected.coalesce()) _test_sparse_mask_hybrid_fixed() @@ -2887,11 +2886,11 @@ class TestSparse(TestCase): self.assertEqual(torch.any(t), t_any) def test_isnan(self, device): - t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, 4]), device=device) - t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([False, False]), device=device) + t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([1, 4]), device=device) + t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([False, False]), device=device) self.assertEqual(torch.isnan(t).int(), t_nan.int()) - t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([1, float("nan")]), device=device) - t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [2, 0])), torch.tensor([False, True]), device=device) + t = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([1, float("nan")]), device=device) + t_nan = torch.sparse_coo_tensor(torch.tensor(([0, 0], [0, 2])), torch.tensor([False, True]), device=device) self.assertEqual(torch.isnan(t).int(), t_nan.int()) @coalescedonoff diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index 6e67b77613c..56128952acc 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -1570,15 +1570,6 @@ class 
TensorOrArrayPair(TensorLikePair): self._check_supported(tensor, id=id) return actual, expected - # TODO: As discussed in https://github.com/pytorch/pytorch/issues/68590#issuecomment-975333883, - # this relaxation should only be temporary and this overwrite should be removed completely in the future. - def _equalize_attributes(self, actual, expected): - actual, expected = super()._equalize_attributes(actual, expected) - if not actual.is_sparse: - return actual, expected - - return actual.coalesce(), expected.coalesce() - class UnittestPair(Pair): """Fallback ABC pair that handles non-numeric inputs. From 1f74e082e252a03aa54ecc111112aedb41d80f53 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Wed, 16 Feb 2022 18:25:35 -0800 Subject: [PATCH 121/199] only compare attributes for meta tensors (#72508) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72508 Todo: - [x] document this behavior - [x] add tests Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D34262452 Pulled By: ezyang fbshipit-source-id: bc5c9653d5c3ad5c6efccc9c8e0efc0d28e15104 (cherry picked from commit 233142c88e4cff02825c7e233aba9411a6df3e9f) --- test/test_binary_ufuncs.py | 4 +- test/test_modules.py | 3 +- test/test_tensor_creation_ops.py | 56 ++-- test/test_testing.py | 5 +- test/test_torch.py | 270 +++++++++--------- test/test_type_promotion.py | 9 +- test/test_view_ops.py | 11 +- torch/testing/_comparison.py | 40 +-- .../_internal/common_methods_invocations.py | 65 +++-- 9 files changed, 252 insertions(+), 211 deletions(-) diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py index 0bd2a9e4d52..b15fb21944f 100644 --- a/test/test_binary_ufuncs.py +++ b/test/test_binary_ufuncs.py @@ -20,7 +20,7 @@ from torch.testing._internal.common_utils import ( from torch.testing._internal.common_device_type import ( expectedFailureMeta, instantiate_device_type_tests, onlyCUDA, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, precisionOverride, onlyNativeDeviceTypes, - skipCUDAIfRocm, skipIf, ops, OpDTypes) + skipCUDAIfRocm, skipIf, ops, OpDTypes, skipMeta) from torch.testing import make_tensor from torch.testing._internal.common_dtype import ( all_types_and_complex_and, integral_types_and, get_all_dtypes, get_all_int_dtypes, get_all_math_dtypes, @@ -1497,6 +1497,7 @@ class TestBinaryUfuncs(TestCase): self._test_pow(base, second_exp) @onlyNativeDeviceTypes + @skipMeta def test_pow_scalar_type_promotion(self, device): # Test against a scalar and non-scalar input inputs = [17, [17]] @@ -3393,6 +3394,7 @@ class TestBinaryUfuncs(TestCase): TypeError, 'received an invalid combination of arguments'): actual = torch.cumulative_trapezoid(torch.randn((3, 3)), x=torch.randn((3, 3)), dx=3) + @skipMeta @dtypes(torch.double) def test_pow_scalar_overloads_mem_overlap(self, device, dtype): sz = 3 diff --git a/test/test_modules.py b/test/test_modules.py index 448f8f5fa75..b3d658a5bc5 100644 --- a/test/test_modules.py +++ b/test/test_modules.py @@ -8,7 +8,7 @@ from operator import methodcaller import torch from torch.testing._internal.common_device_type import ( - instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol) + instantiate_device_type_tests, onlyCUDA, toleranceOverride, tol, skipMeta) from torch.testing._internal.common_modules import module_db, modules from torch.testing._internal.common_utils import ( TestCase, run_tests, freeze_rng_state, mock_wrapper, get_tensors_from, gradcheck, gradgradcheck) @@ -233,6 +233,7 @@ class TestModule(TestCase): 
@modules([module_info for module_info in module_db if 'inplace' in signature(module_info.module_cls).parameters]) + @skipMeta def test_check_inplace(self, device, dtype, module_info): # Check if the inplace variant of the module gives the same result as the out of place # variant. diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py index 62d595373b3..68ddec14711 100644 --- a/test/test_tensor_creation_ops.py +++ b/test/test_tensor_creation_ops.py @@ -14,7 +14,7 @@ from torch.testing import make_tensor from torch.testing._internal.common_utils import ( TestCase, run_tests, do_test_empty_full, TEST_WITH_ROCM, suppress_warnings, torch_to_numpy_dtype_dict, slowTest, - TEST_SCIPY, IS_MACOS, IS_PPC, IS_WINDOWS) + TEST_SCIPY, IS_MACOS, IS_PPC, IS_WINDOWS, parametrize) from torch.testing._internal.common_device_type import ( expectedFailureMeta, instantiate_device_type_tests, deviceCountAtLeast, onlyNativeDeviceTypes, onlyCPU, largeTensorTest, precisionOverride, dtypes, @@ -2786,36 +2786,44 @@ class TestTensorCreation(TestCase): sparse_size, dtype=torch.float64) self.assertEqual(sparse_with_dtype.device, torch.device('cpu')) + def _test_signal_window_functions(self, name, dtype, device, **kwargs): + import scipy.signal as signal + + torch_method = getattr(torch, name + '_window') + if not dtype.is_floating_point: + with self.assertRaisesRegex(RuntimeError, r'floating point'): + torch_method(3, dtype=dtype) + return + for size in [0, 1, 2, 5, 10, 50, 100, 1024, 2048]: + for periodic in [True, False]: + res = torch_method(size, periodic=periodic, **kwargs, device=device, dtype=dtype) + # NB: scipy always returns a float64 result + ref = torch.from_numpy(signal.get_window((name, *(kwargs.values())), size, fftbins=periodic)) + self.assertEqual(res, ref, exact_dtype=False) + with self.assertRaisesRegex(RuntimeError, r'not implemented for sparse types'): + torch_method(3, layout=torch.sparse_coo) + self.assertTrue(torch_method(3, requires_grad=True).requires_grad) + self.assertFalse(torch_method(3).requires_grad) + @onlyNativeDeviceTypes @precisionOverride({torch.bfloat16: 5e-2, torch.half: 1e-3}) @unittest.skipIf(not TEST_SCIPY, "Scipy not found") @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16, torch.half, torch.long) @dtypes(torch.float, torch.double, torch.long) - def test_signal_window_functions(self, device, dtype): - import scipy.signal as signal - - def test(name, kwargs): - torch_method = getattr(torch, name + '_window') - if not dtype.is_floating_point: - with self.assertRaisesRegex(RuntimeError, r'floating point'): - torch_method(3, dtype=dtype) - return - for size in [0, 1, 2, 5, 10, 50, 100, 1024, 2048]: - for periodic in [True, False]: - res = torch_method(size, periodic=periodic, **kwargs, device=device, dtype=dtype) - # NB: scipy always returns a float64 result - ref = torch.from_numpy(signal.get_window((name, *(kwargs.values())), size, fftbins=periodic)) - self.assertEqual(res, ref, exact_dtype=False) - with self.assertRaisesRegex(RuntimeError, r'not implemented for sparse types'): - torch_method(3, layout=torch.sparse_coo) - self.assertTrue(torch_method(3, requires_grad=True).requires_grad) - self.assertFalse(torch_method(3).requires_grad) - - for window in ['hann', 'hamming', 'bartlett', 'blackman']: - test(window, kwargs={}) + @parametrize("window", ['hann', 'hamming', 'bartlett', 'blackman']) + def test_signal_window_functions(self, device, dtype, window): + self._test_signal_window_functions(window, dtype, device) + @onlyNativeDeviceTypes + # See 
https://github.com/pytorch/pytorch/issues/72630 + @skipMeta + @precisionOverride({torch.bfloat16: 5e-2, torch.half: 1e-3}) + @unittest.skipIf(not TEST_SCIPY, "Scipy not found") + @dtypesIfCUDA(torch.float, torch.double, torch.bfloat16, torch.half, torch.long) + @dtypes(torch.float, torch.double, torch.long) + def test_kaiser_window(self, device, dtype): for num_test in range(50): - test('kaiser', kwargs={'beta': random.random() * 30}) + self._test_signal_window_functions('kaiser', dtype, device, beta=random.random() * 30) def test_tensor_factories_empty(self, device): # ensure we can create empty tensors from each factory function diff --git a/test/test_testing.py b/test/test_testing.py index 3cfef8cee39..1fe06a22934 100644 --- a/test/test_testing.py +++ b/test/test_testing.py @@ -574,11 +574,10 @@ class TestAssertClose(TestCase): def test_meta(self): actual = torch.empty((2, 2), device="meta") - expected = actual.clone() + expected = torch.empty((2, 2), device="meta") for fn in assert_close_with_inputs(actual, expected): - with self.assertRaisesRegex(NotImplementedError, "meta"): - fn() + fn() def test_mismatching_layout(self): strided = torch.empty((2, 2)) diff --git a/test/test_torch.py b/test/test_torch.py index e2422d1477d..16cf9e2e61f 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -37,7 +37,7 @@ from torch.testing._internal.common_utils import ( skipCUDAMemoryLeakCheckIf, BytesIOContext, noarchTest, skipIfRocm, skipIfNoSciPy, TemporaryFileName, TemporaryDirectoryName, wrapDeterministicFlagAPITest, DeterministicGuard, CudaSyncGuard, - skipIfNotRegistered, bytes_to_scalar) + skipIfNotRegistered, bytes_to_scalar, parametrize) from multiprocessing.reduction import ForkingPickler from torch.testing._internal.common_device_type import ( expectedFailureMeta, @@ -793,158 +793,158 @@ class TestTorchDeviceType(TestCase): self.assertFalse(t1.is_set_to(t2)) self.assertFalse(t2.is_set_to(t1)) - def test_broadcast(self, device): - - # all functions - fns = { - "dist", "atan2", "pow", "lerp", "add", - "sub", "mul", "div", "fmod", "remainder", - "eq", "ge", "gt", "le", "lt", "max", "min", "ne", - "addcdiv", "addcmul", "masked_scatter", "masked_select", "masked_fill", - "map", "map2", "copy" - } + # See https://github.com/pytorch/pytorch/issues/72650 + @skipMeta + @parametrize( + "fn", + [ + "dist", "atan2", "pow", "lerp", "add", "sub", "mul", "div", "fmod", "remainder", "eq", "ge", "gt", "le", + "lt", "max", "min", "ne", "addcdiv", "addcmul", "masked_scatter", "masked_select", "masked_fill", "map", + "map2", "copy", + ], + ) + def test_broadcast(self, fn, device): # functions with three tensor arguments fns_3_args = {"map2"} fns_value_kwarg = {"addcdiv", "addcmul"} - for fn in fns: - (dims_small, dims_large, dims_full) = self._select_broadcastable_dims() - full1d = torch.randn(*dims_full, device=device).flatten().float() - small = torch.randn(*dims_small, device=device).float() - large = torch.randn(*dims_large, device=device).float() - small_expanded = small.expand(*dims_full) - large_expanded = large.expand(*dims_full) - small2 = None - small2_expanded = None - if fn in fns_3_args or fn in fns_value_kwarg: - # create another smaller tensor - (dims_small2, _, _) = self._select_broadcastable_dims(dims_full) - small2 = torch.randn(*dims_small2, device=device).float() - small2_expanded = small2.expand(*dims_full) + (dims_small, dims_large, dims_full) = self._select_broadcastable_dims() + full1d = torch.randn(*dims_full, device=device).flatten().float() + small = torch.randn(*dims_small, 
device=device).float() + large = torch.randn(*dims_large, device=device).float() + small_expanded = small.expand(*dims_full) + large_expanded = large.expand(*dims_full) + small2 = None + small2_expanded = None + if fn in fns_3_args or fn in fns_value_kwarg: + # create another smaller tensor + (dims_small2, _, _) = self._select_broadcastable_dims(dims_full) + small2 = torch.randn(*dims_small2, device=device).float() + small2_expanded = small2.expand(*dims_full) - if small.is_cuda and fn in ['map', 'map2']: - # map and map2 are not implementd on CUDA tensors - continue + if small.is_cuda and fn in ['map', 'map2']: + # map and map2 are not implementd on CUDA tensors + return - if hasattr(large_expanded, fn): - # run through tensor versions of functions - # and verify fully expanded inputs give same results - expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} + if hasattr(large_expanded, fn): + # run through tensor versions of functions + # and verify fully expanded inputs give same results + expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} - def tensorfn(myfn, t1, t2): - if fn == "lerp": - return myfn(t1, 0.5) - elif fn == "masked_select": - return myfn(t1 < 0) - elif fn == "masked_scatter": - return myfn(t1 < 0.5, full1d) - elif fn == "masked_fill": - return myfn(t1 < 0.5, 1.0) - elif fn in fns_3_args: - return myfn(1, t1, t2) - elif fn in fns_value_kwarg: - return myfn(t1, t2, value=1) - else: - return myfn(t1) - - # test various orders - for first, second, third in [(large, small, small2), (small, large, small2), - (small2, small, large), (small2, large, small)]: - if first is None: - break # ignore last iter when small2 is None - method_expanded = getattr(expanded[first], fn) - method = getattr(first, fn) - r1 = tensorfn(method_expanded, expanded[second], expanded[third]) - r2 = tensorfn(method, second, third) - self.assertEqual(r1, r2) - - # now for torch. 
versions of functions - if hasattr(torch, fn): - fntorch = getattr(torch, fn) - expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} - - def torchfn(t1, t2, t3): - if fn == "lerp": - return fntorch(t1, t2, 0.5) - elif fn == "masked_select": - return fntorch(t1, t2 < 0) - elif fn == "masked_scatter": - return fntorch(t1, t2 < 0.5, full1d) - elif fn == "masked_fill": - return fntorch(t1, t2 < 0.5, 1.0) - elif fn in fns_3_args: - return fntorch(t1, 1.0, t2, t3) - elif fn in fns_value_kwarg: - return fntorch(t1, t2, t3, value=1.0) - else: - return fntorch(t1, t2) - - # test various orders - for first, second, third in [(large, small, small2), (small, large, small2), - (small2, small, large), (small2, large, small)]: - if first is None: - break # ignore last iter when small2 is None - r1 = torchfn(expanded[first], expanded[second], expanded[third]) - r2 = torchfn(first, second, third) - self.assertEqual(r1, r2) - - # now for in place functions - # in-place tensor is not broadcastable; test only guaranteed - # to work by broadcasting other argument(s) - if not hasattr(large_expanded, fn + "_"): - continue - - # need to clone largeExpanded so we can reuse, since functions are in-place - large_expanded_clone = large_expanded.clone() - - def tensorfn_inplace(t0, t1, t2=None): - t0_fn = getattr(t0, fn + "_") + def tensorfn(myfn, t1, t2): if fn == "lerp": - return t0_fn(t1, 0.5) + return myfn(t1, 0.5) + elif fn == "masked_select": + return myfn(t1 < 0) elif fn == "masked_scatter": - return t0_fn(t1 < 0.5, full1d) + return myfn(t1 < 0.5, full1d) elif fn == "masked_fill": - return t0_fn(t1 < 0.5, 1.0) - elif fn == "map": - return t0_fn(t1, lambda x, y: x + y) - elif fn == "map2": - return t0_fn(t1, t2, lambda x, y, z: x + y + z) + return myfn(t1 < 0.5, 1.0) elif fn in fns_3_args: - return t0_fn(1.0, t1, t2) + return myfn(1, t1, t2) elif fn in fns_value_kwarg: - return t0_fn(t1, t2, value=1.0) + return myfn(t1, t2, value=1) else: - return t0_fn(t1) - # in-place pointwise operations don't actually work if the in-place - # tensor is 0-strided (numpy has the same issue) - if (0 not in large_expanded.stride() and 0 not in large_expanded_clone.stride()): - r1 = tensorfn_inplace(large_expanded, small_expanded, small2_expanded) - r2 = tensorfn_inplace(large_expanded_clone, small, small2) + return myfn(t1) + + # test various orders + for first, second, third in [(large, small, small2), (small, large, small2), + (small2, small, large), (small2, large, small)]: + if first is None: + break # ignore last iter when small2 is None + method_expanded = getattr(expanded[first], fn) + method = getattr(first, fn) + r1 = tensorfn(method_expanded, expanded[second], expanded[third]) + r2 = tensorfn(method, second, third) self.assertEqual(r1, r2) - def broadcastable(t0, t1, t2=None): - try: - t1.expand_as(t0) - if t2 is not None: - t2.expand_as(t0) - except RuntimeError: - return False - return True + # now for torch. 
versions of functions + if hasattr(torch, fn): + fntorch = getattr(torch, fn) + expanded = {large: large_expanded, small: small_expanded, small2: small2_expanded} - def _test_in_place_broadcastable(t0, t1, t2=None): - if not broadcastable(t0, t1, t2): - same_size = t0.numel() == t1.numel() and (t0.numel() == t2.numel() if t2 is not None else True) - if not same_size: - self.assertRaises(RuntimeError, lambda: tensorfn_inplace(t0, t1, t2)) + def torchfn(t1, t2, t3): + if fn == "lerp": + return fntorch(t1, t2, 0.5) + elif fn == "masked_select": + return fntorch(t1, t2 < 0) + elif fn == "masked_scatter": + return fntorch(t1, t2 < 0.5, full1d) + elif fn == "masked_fill": + return fntorch(t1, t2 < 0.5, 1.0) + elif fn in fns_3_args: + return fntorch(t1, 1.0, t2, t3) + elif fn in fns_value_kwarg: + return fntorch(t1, t2, t3, value=1.0) else: - tensorfn_inplace(t0, t1, t2) + return fntorch(t1, t2) - if fn not in fns_3_args and fn not in fns_value_kwarg: - _test_in_place_broadcastable(small, large_expanded) - _test_in_place_broadcastable(small, large) + # test various orders + for first, second, third in [(large, small, small2), (small, large, small2), + (small2, small, large), (small2, large, small)]: + if first is None: + break # ignore last iter when small2 is None + r1 = torchfn(expanded[first], expanded[second], expanded[third]) + r2 = torchfn(first, second, third) + self.assertEqual(r1, r2) + + # now for in place functions + # in-place tensor is not broadcastable; test only guaranteed + # to work by broadcasting other argument(s) + if not hasattr(large_expanded, fn + "_"): + return + + # need to clone largeExpanded so we can reuse, since functions are in-place + large_expanded_clone = large_expanded.clone() + + def tensorfn_inplace(t0, t1, t2=None): + t0_fn = getattr(t0, fn + "_") + if fn == "lerp": + return t0_fn(t1, 0.5) + elif fn == "masked_scatter": + return t0_fn(t1 < 0.5, full1d) + elif fn == "masked_fill": + return t0_fn(t1 < 0.5, 1.0) + elif fn == "map": + return t0_fn(t1, lambda x, y: x + y) + elif fn == "map2": + return t0_fn(t1, t2, lambda x, y, z: x + y + z) + elif fn in fns_3_args: + return t0_fn(1.0, t1, t2) + elif fn in fns_value_kwarg: + return t0_fn(t1, t2, value=1.0) else: - _test_in_place_broadcastable(small2, small_expanded, large_expanded) - _test_in_place_broadcastable(small2, small, large) + return t0_fn(t1) + # in-place pointwise operations don't actually work if the in-place + # tensor is 0-strided (numpy has the same issue) + if (0 not in large_expanded.stride() and 0 not in large_expanded_clone.stride()): + r1 = tensorfn_inplace(large_expanded, small_expanded, small2_expanded) + r2 = tensorfn_inplace(large_expanded_clone, small, small2) + self.assertEqual(r1, r2) + + def broadcastable(t0, t1, t2=None): + try: + t1.expand_as(t0) + if t2 is not None: + t2.expand_as(t0) + except RuntimeError: + return False + return True + + def _test_in_place_broadcastable(t0, t1, t2=None): + if not broadcastable(t0, t1, t2): + same_size = t0.numel() == t1.numel() and (t0.numel() == t2.numel() if t2 is not None else True) + if not same_size: + self.assertRaises(RuntimeError, lambda: tensorfn_inplace(t0, t1, t2)) + else: + tensorfn_inplace(t0, t1, t2) + + if fn not in fns_3_args and fn not in fns_value_kwarg: + _test_in_place_broadcastable(small, large_expanded) + _test_in_place_broadcastable(small, large) + else: + _test_in_place_broadcastable(small2, small_expanded, large_expanded) + _test_in_place_broadcastable(small2, small, large) @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, 
"cublas runtime error") @onlyCUDA @@ -2963,7 +2963,7 @@ else: index = torch.tensor([0], device=device) x.index_fill_(1, index, 0) self.assertEqual(x, torch.tensor([[0, 2], [0, 5]], dtype=dtype, device=device)) - if not x.is_complex(): + if not x.is_complex() and not device == "meta": with self.assertRaisesRegex(RuntimeError, r"Scalar"): x.index_fill_(1, index, 1 + 1j) # Make sure that the result stays 0-dim while applied to diff --git a/test/test_type_promotion.py b/test/test_type_promotion.py index 01e96a3fe11..f32a89933f0 100644 --- a/test/test_type_promotion.py +++ b/test/test_type_promotion.py @@ -9,7 +9,7 @@ import torch from torch.testing._internal.common_utils import (TestCase, run_tests, load_tests, TEST_NUMPY, torch_to_numpy_dtype_dict) from torch.testing._internal.common_device_type import (instantiate_device_type_tests, onlyNativeDeviceTypes, - dtypes, dtypesIfCUDA, onlyCPU, expectedFailureMeta) + dtypes, dtypesIfCUDA, onlyCPU, expectedFailureMeta, skipMeta) from torch.testing._internal.common_dtype import ( get_all_dtypes, get_all_math_dtypes, get_all_int_dtypes, get_all_fp_dtypes ) @@ -937,7 +937,11 @@ class TestTypePromotion(TestCase): elif op in real_only_ops and dtypes[0].is_complex: with self.assertRaises(RuntimeError): op(t, out=out) - elif op in float_only_ops and (not dtypes[0].is_floating_point and not dtypes[0].is_complex): + elif ( + op in float_only_ops + and (not dtypes[0].is_floating_point and not dtypes[0].is_complex) + and device != "meta" + ): with self.assertRaises(RuntimeError): op(t, out=out) else: @@ -947,6 +951,7 @@ class TestTypePromotion(TestCase): # Verifies that the out= argument doesn't affect the computation, that # is, out = op(...) and op(..., out=out) produce the same result. @onlyNativeDeviceTypes + @skipMeta def test_computation_ignores_out(self, device): t = torch.tensor(33000, dtype=torch.float16, device=device) out = torch.empty(0, dtype=torch.float64, device=device) diff --git a/test/test_view_ops.py b/test/test_view_ops.py index 2678db1d74d..37d08e39e63 100644 --- a/test/test_view_ops.py +++ b/test/test_view_ops.py @@ -14,7 +14,7 @@ from torch.testing._internal.common_utils import ( torch_to_numpy_dtype_dict, ) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, onlyCPU, dtypes, onlyNativeDeviceTypes) + (instantiate_device_type_tests, onlyCPU, dtypes, onlyNativeDeviceTypes, skipMeta) from torch.testing._internal.common_dtype import ( get_all_dtypes, get_all_int_dtypes, get_all_fp_dtypes, get_all_complex_dtypes ) @@ -729,6 +729,7 @@ class TestViewOps(TestCase): s = t.contiguous() self.assertTrue(s is t) + @skipMeta def test_contiguous_nonview(self, device): t = torch.ones(5, 5, device=device) nv = t.t().contiguous() @@ -754,6 +755,7 @@ class TestViewOps(TestCase): v[6] = 0 self.assertEqual(t[1, 1], v[6]) + @skipMeta def test_reshape_nonview(self, device): t = torch.ones(5, 5, device=device) nv = torch.reshape(t.t(), (25,)) @@ -806,7 +808,8 @@ class TestViewOps(TestCase): idx_nv = (0,) * nv.ndim self.assertTrue(not nv._is_view()) nv[idx_nv] = 0 - self.assertNotEqual(t[idx_t], nv[idx_nv]) + if device != "meta": + self.assertNotEqual(t[idx_t], nv[idx_nv]) t = torch.ones(2, 3, 2, 3, device=device).transpose(2, 3) nv = t.flatten(1, 3) assert_is_nonview(t, nv) @@ -1027,7 +1030,9 @@ class TestOldViewOps(TestCase): self.assertRaises(RuntimeError, lambda: x.reshape(-1, -1)) y = torch.randn(4, 4, 4, device=device)[:, 0, :] - self.assertNotEqual(y.data_ptr(), y.reshape(-1).data_ptr()) + # .data_ptr() on meta 
tensors is always 0 so they are equal regardless of the reshape + if device != "meta": + self.assertNotEqual(y.data_ptr(), y.reshape(-1).data_ptr()) self.assertEqual(y.contiguous().view(-1), y.reshape(-1)) self.assertEqual(y.reshape(2, 2, 4).data_ptr(), y.data_ptr()) diff --git a/torch/testing/_comparison.py b/torch/testing/_comparison.py index 993b3d1d5cb..50101355b8c 100644 --- a/torch/testing/_comparison.py +++ b/torch/testing/_comparison.py @@ -598,29 +598,12 @@ class TensorLikePair(Pair): def compare(self) -> None: actual, expected = self.actual, self.expected - with self._handle_meta_tensor_data_access(): - self._compare_attributes(actual, expected) - actual, expected = self._equalize_attributes(actual, expected) + self._compare_attributes(actual, expected) + if any(input.device.type == "meta" for input in (actual, expected)): + return - self._compare_values(actual, expected) - - @contextlib.contextmanager - def _handle_meta_tensor_data_access(self): - """Turns a vanilla :class:`NotImplementedError` stemming from data access on a meta tensor into an expressive - :class:`ErrorMeta`. - - Although it looks like meta tensors could be handled upfront, we need to do it lazily: there are use cases - where a meta tensor wraps a data tensors and dispatches all operator calls to it. Thus, although the tensor is - a meta tensor, it behaves like a regular one. - """ - try: - yield - except NotImplementedError as error: - if "meta" not in str(error).lower(): - raise error - - # TODO: See https://github.com/pytorch/pytorch/issues/68592 - raise self._make_error_meta(NotImplementedError, "Comparing meta tensors is currently not supported.") + actual, expected = self._equalize_attributes(actual, expected) + self._compare_values(actual, expected) def _compare_attributes( self, @@ -1103,10 +1086,15 @@ def assert_close( \lvert \text{actual} - \text{expected} \rvert \le \texttt{atol} + \texttt{rtol} \cdot \lvert \text{expected} \rvert - and they have the same :attr:`~torch.Tensor.device` (if ``check_device`` is ``True``), same ``dtype`` (if - ``check_dtype`` is ``True``), and the same stride (if ``check_stride`` is ``True``). Non-finite values - (``-inf`` and ``inf``) are only considered close if and only if they are equal. ``NaN``'s are only considered equal - to each other if ``equal_nan`` is ``True``. + Non-finite values (``-inf`` and ``inf``) are only considered close if and only if they are equal. ``NaN``'s are + only considered equal to each other if ``equal_nan`` is ``True``. + + In addition, they are only considered close if they have the same + - :attr:`~torch.Tensor.device` (if ``check_device`` is ``True``), + - ``dtype`` (if ``check_dtype`` is ``True``), + - ``layout`` (if ``check_layout`` is ``True``), and + - stride (if ``check_stride`` is ``True``). + If either ``actual`` or ``expected`` is a meta tensor, only the attribute checks will be performed. If ``actual`` and ``expected`` are sparse (either having COO or CSR layout), their strided members are checked individually. Indices, namely ``indices`` for COO or ``crow_indices`` and ``col_indices`` for CSR layout, diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py index e984d4b7f8c..e1f3a1f5032 100644 --- a/torch/testing/_internal/common_methods_invocations.py +++ b/torch/testing/_internal/common_methods_invocations.py @@ -14480,9 +14480,21 @@ op_db: List[OpInfo] = [ # These paths have different dtype support. Also JIT supports, # most variants but not all of them. 
So we split the OpInfo entries, # for `norm` based on the code-paths and JIT support. - OpInfo('norm', - sample_inputs_func=sample_inputs_norm, - dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16)), + OpInfo( + "norm", + sample_inputs_func=sample_inputs_norm, + dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), + skips=( + # AssertionError: RuntimeError not raised : Expected RuntimeError when doing an unsafe cast from a result + # of dtype torch.float32 into an out= with dtype torch.long + DecorateInfo( + unittest.expectedFailure, + "TestCommon", + "test_out", + device_type="meta", + ), + ), + ), OpInfo('norm', variant_test_name='nuc', sample_inputs_func=sample_inputs_norm_nuc, @@ -14517,19 +14529,40 @@ op_db: List[OpInfo] = [ # Arguments for call are not valid. DecorateInfo(unittest.expectedFailure, 'TestJit', 'test_variant_consistency_jit', dtypes=(torch.complex64, torch.float32,)), # noqa: B950 )), - OpInfo('norm', - variant_test_name='inf', - sample_inputs_func=sample_inputs_norm_inf, - dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), - backward_dtypesIfCPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), - skips=( - # https://github.com/pytorch/pytorch/issues/67517 - DecorateInfo(unittest.skip("Skipped!"), 'TestCommon', 'test_noncontiguous_samples'), - # following 2 tests failed intermittenly - DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_grad', device_type='cpu', dtypes=(torch.complex128,)), # noqa: B950 - DecorateInfo(unittest.skip("Skipped!"), 'TestGradients', 'test_fn_gradgrad', device_type='cpu', dtypes=(torch.complex128,)), # noqa: B950 - ) - ), + OpInfo( + "norm", + variant_test_name="inf", + sample_inputs_func=sample_inputs_norm_inf, + dtypes=floating_and_complex_types_and(torch.float16, torch.bfloat16), + backward_dtypesIfCPU=floating_and_complex_types_and(torch.float16, torch.bfloat16), + skips=( + # https://github.com/pytorch/pytorch/issues/67517 + DecorateInfo(unittest.skip("Skipped!"), "TestCommon", "test_noncontiguous_samples"), + # following 2 tests failed intermittenly + DecorateInfo( + unittest.skip("Skipped!"), + "TestGradients", + "test_fn_grad", + device_type="cpu", + dtypes=(torch.complex128,), + ), + DecorateInfo( + unittest.skip("Skipped!"), + "TestGradients", + "test_fn_gradgrad", + device_type="cpu", + dtypes=(torch.complex128,), + ), + # AssertionError: RuntimeError not raised : Expected RuntimeError when doing an unsafe cast from a result + # of dtype torch.float32 into an out= with dtype torch.long + DecorateInfo( + unittest.expectedFailure, + "TestCommon", + "test_out", + device_type="meta", + ), + ), + ), OpInfo('t', sample_inputs_func=sample_inputs_t, supports_out=False, From 456d96d544e999670832a35cfd69ec7b552b3083 Mon Sep 17 00:00:00 2001 From: Pearu Peterson Date: Wed, 16 Feb 2022 18:40:19 -0800 Subject: [PATCH 122/199] Generate static docstrings for torch._masked functions. 
(#72865) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72865 Fixes #72636 Test Plan: Imported from OSS Reviewed By: zou3519 Differential Revision: D34286183 Pulled By: cpuhrsch fbshipit-source-id: 9cf81bfed6ba8c82593f6a1d9e0b20d0a083310d (cherry picked from commit 0a3f57896b5fd6f9646c5c7eeed034d02de832e5) --- tools/update_masked_docs.py | 58 +++ torch/_masked/__init__.py | 29 +- torch/_masked/_docs.py | 734 ++++++++++++++++++++++++++++++++++++ 3 files changed, 815 insertions(+), 6 deletions(-) create mode 100644 tools/update_masked_docs.py create mode 100644 torch/_masked/_docs.py diff --git a/tools/update_masked_docs.py b/tools/update_masked_docs.py new file mode 100644 index 00000000000..6d705d59100 --- /dev/null +++ b/tools/update_masked_docs.py @@ -0,0 +1,58 @@ +"""This script updates the file torch/_masked/_docs.py that contains +the generated doc-strings for various masked operations. The update +should be triggered whenever a new masked operation is introduced to +torch._masked package. Running the script requires that torch package +is functional. +""" + +import os + +def main() -> None: + + target = os.path.join('torch', '_masked', '_docs.py') + + try: + import torch + except ImportError as msg: + print(f'Failed to import torch required to build {target}: {msg}') + return + + if os.path.isfile(target): + with open(target) as _f: + current_content = _f.read() + else: + current_content = '' + + _new_content = [] + _new_content.append('''\ +# -*- coding: utf-8 -*- +# This file is generated, do not modify it! +# +# To update this file, run the update masked docs script as follows: +# +# python tools/update_masked_docs.py +# +# The script must be called from an environment where the development +# version of torch package can be imported and is functional. +# +''') + + for func_name in sorted(torch._masked.__all__): + func = getattr(torch._masked, func_name) + func_doc = torch._masked._generate_docstring(func) + _new_content.append(f'{func_name}_docstring = """{func_doc}"""\n') + + new_content = '\n'.join(_new_content) + + if new_content == current_content: + print(f'Nothing to update in {target}') + return + + with open(target, 'w') as _f: + _f.write(new_content) + + print(f'Successfully updated {target}') + + +if __name__ == '__main__': + main() diff --git a/torch/_masked/__init__.py b/torch/_masked/__init__.py index a1b398cb2f4..e3ed37af443 100644 --- a/torch/_masked/__init__.py +++ b/torch/_masked/__init__.py @@ -2,8 +2,10 @@ from typing import Optional, Tuple, List, Union, Any +import warnings import torch from torch import Tensor +from . import _docs # A workaround to support both TorchScript and MyPy: from typing import TYPE_CHECKING @@ -27,6 +29,26 @@ def _apply_docstring_templates(func): """Decorator that applies docstring templates to function docstring and returns the function instance. """ + + doc_string = getattr(_docs, f'{func.__name__}_docstring', None) + if doc_string is None: + warnings.warn( + f'No documentation string available for {func.__name__}.' 
+ ' PyTorch team should run `python tools/update_masked_docs.py`' + ' to generate the missing docstrings.') + else: + func.__doc__ = doc_string + + # Expose function as public symbol + __all__.append(func.__name__) + + return func + + +def _generate_docstring(func): + """An utility function called from tools/update_masked_docs.py + script to update the module torch._masked._docs.py + """ docstring_templates = dict( reduction_signature='''\ {function_name}(input, {operation_args}, *, {operation_kwargs}) -> Tensor''', @@ -297,12 +319,7 @@ defined as ``x[i]/max(norm(x, p), eps)``.''') doc_template = '\n\n'.join([f'{{{op_kind}_{sec}}}' for sec in doc_sections]) else: doc_template = func.__doc__ - func.__doc__ = doc_template.format_map(templates) - - # Expose function as public symbol - __all__.append(func.__name__) - - return func + return doc_template.format_map(templates) def _reduction_identity(op_name: str, input: Tensor, *args): diff --git a/torch/_masked/_docs.py b/torch/_masked/_docs.py new file mode 100644 index 00000000000..b8519b5f8f7 --- /dev/null +++ b/torch/_masked/_docs.py @@ -0,0 +1,734 @@ +# -*- coding: utf-8 -*- +# This file is generated, do not modify it! +# +# To update this file, run the update masked docs script as follows: +# +# python tools/update_masked_docs.py +# +# The script must be called from an environment where the development +# version of torch package can be imported and is functional. +# + +amax_docstring = """amax(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns maximum of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of maximum operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``-inf``, ``0``, and ``-2147483648``, respectively. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in maximum computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of maximum operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. 
+ +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.amax(input, 1, mask=mask) + tensor([ -1, -9223372036854775808]) +""" + +amin_docstring = """amin(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns minimum of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of minimum operation, which is used to start the +reduction, depends on input dtype. For instance, for float32, uint8, +and int32 dtypes, the identity values are ``inf``, ``255``, and ``2147483647``, respectively. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in minimum computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of minimum operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. 
+ +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.amin(input, 1, mask=mask) + tensor([ -3, 9223372036854775807]) +""" + +log_softmax_docstring = """log_softmax(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns log_softmax of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. LogSoftmax of i-th element in ``x`` is +defined as ``log(exp(x[i])/sum(exp(x)))``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +log_softmax computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the log_softmax output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which log_softmax is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.log_softmax(input, 1, mask=mask) + tensor([[-2.1269, -inf, -0.1269], + [ nan, nan, nan]]) +""" + +mean_docstring = """mean(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns mean of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +By definition, the identity value of a mean operation is the mean +value of the tensor. If all elements of the input tensor along given +dimension(s) :attr:`dim` are masked-out, the identity value of the +mean is undefined. Due to this ambiguity, the elements of output +tensor with strided layout, that correspond to fully masked-out +elements, have ``nan`` values. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). 
+ +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in mean computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of mean operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.mean(input, 1, mask=mask) + tensor([-2., nan]) +""" + +norm_docstring = """norm(input, ord, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns norm of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of norm operation, which is used to start the +reduction, is ``0.0``, except for ``ord=-inf`` it is +``inf``. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in norm computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of norm operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. 
+ +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + ord (int, float, optional): the order of vector norm. Default: 2. + See :func:`torch.linalg.vector_norm` for a list of supported norms. + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.norm(input, 2.0, 1, mask=mask) + tensor([3.1623, 0.0000]) +""" + +normalize_docstring = """normalize(input, ord, dim, *, eps=1e-12, dtype=None, mask=None) -> Tensor + +Returns normalize of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Normalize of i-th element in ``x`` is +defined as ``x[i]/max(norm(x, p), eps)``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +normalize computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the normalize output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + ord (int, float): the order of vector norm. Default: 2. + See :func:`torch.linalg.vector_norm` for a list of supported norms. + dim (int): the dimension along which normalize is computed. + +Keyword args: + eps (float, optional): small value to avoid division by zero. Default: 1e-12. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. 
+ +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.normalize(input, 2.0, 1, mask=mask) + tensor([[-0.9487, 0.0000, -0.3162], + [ 0.0000, 0.0000, 0.0000]]) +""" + +prod_docstring = """prod(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns product of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of product operation, which is used to start the reduction, is ``1``. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in product computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of product operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.prod(input, 1, mask=mask) + tensor([3, 1]) +""" + +softmax_docstring = """softmax(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns softmax of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. 
Softmax of i-th element in ``x`` is +defined as ``exp(x[i])/sum(exp(x))``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +softmax computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the softmax output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which softmax is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.softmax(input, 1, mask=mask) + tensor([[0.1192, 0.0000, 0.8808], + [ nan, nan, nan]]) +""" + +softmin_docstring = """softmin(input, dim, *, dtype=None, mask=None) -> Tensor + +Returns softmin of all the slices in the :attr:`input` tensor +along :attr:`dim` while the :attr:`input` elements are masked out +according to the boolean tensor :attr:`mask`. + +Let ``x`` be a sequence of unmasked elements of one-dimensional slice +of the :attr:`input` tensor. Softmin of i-th element in ``x`` is +defined as ``exp(-x[i])/sum(exp(-x))``. + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True then +the corresponding element in :attr:`input` tensor will be included in +softmin computation, otherwise the element is ignored. + +The values of masked-out elements of the output tensor have undefined +value: it may or may not be set to zero or nan; the choice may correspond to +the value that leads to the most efficient storage of :attr:`output` +tensor. + +The mask of the softmin output tensor can be computed as +``torch.broadcast_to(mask, input.shape)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int): the dimension along which softmin is computed. + +Keyword args: + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. 
+ Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3., -2., -1.], [ 0., 1., 2.]]) + >>> input + tensor([[-3., -2., -1.], + [ 0., 1., 2.]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.softmin(input, 1, mask=mask) + tensor([[0.8808, 0.0000, 0.1192], + [ nan, nan, nan]]) +""" + +sum_docstring = """sum(input, dim, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns sum of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of sum operation, which is used to start the reduction, is ``0``. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in sum computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of sum operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.sum(input, 1, mask=mask) + tensor([-4, 0]) +""" + +var_docstring = """var(input, dim, unbiased, *, keepdim=False, dtype=None, mask=None) -> Tensor + +Returns variance of all the elements in the :attr:`input` +tensor along the given dimension(s) :attr:`dim` while the :attr:`input` +elements are masked out according to the boolean tensor +:attr:`mask`. + +The identity value of sample variance operation is undefined. 
The +elements of output tensor with strided layout, that correspond to +fully masked-out elements, have ``nan`` values. + +If :attr:`keepdim` is ``True``, the output tensor is of the same size +as :attr:`input` except in the dimension(s) :attr:`dim` where it is of +size 1. Otherwise, :attr:`dim` is squeezed (see +:func:`torch.squeeze`), resulting in the output tensor having 1 (or +``len(dim)``) fewer dimension(s). + +The boolean tensor :attr:`mask` defines the "validity" of +:attr:`input` tensor elements: if :attr:`mask` element is True +then the corresponding element in :attr:`input` tensor will be +included in variance computation, otherwise the element is +ignored. + +When all elements of :attr:`input` along the given dimension +:attr:`dim` are ignored (fully masked-out), the corresponding element +of the output tensor will have undefined value: it may or may not +correspond to the identity value of variance operation; the +choice may correspond to the value that leads to the most efficient +storage of :attr:`output` tensor. + +The mask of the output tensor can be computed as +``torch.any(torch.broadcast_to(mask, input.shape), dim, keepdim=keepdim, +dtype=torch.bool)``. + +The shapes of the :attr:`mask` tensor and the :attr:`input` tensor +don't need to match, but they must be :ref:`broadcastable +` and the dimensionality of the :attr:`mask` +tensor must not be greater than of the :attr:`input` tensor. + +Args: + input (Tensor): the input tensor + dim (int or tuple of ints, optional): the dimension or dimensions to reduce. + Default: None that is equivalent to ``tuple(range(input.ndim))``. + unbiased (bool): when True, use Bessel’s correction, otherwise, compute + the uncorrected sample variance. + +Keyword args: + keepdim (bool, optional): whether the output tensor has + :attr:`dim` retained or not. Default: False. + dtype (:class:`torch.dtype`, optional): the desired data type + of returned tensor. If specified, the input tensor is + casted to :attr:`dtype` before the operation is + performed. Default: None. + mask (:class:`torch.Tensor`, optional): the boolean tensor + containing the binary mask of validity of input tensor + elements. + Default: None that is equivalent to ``torch.ones(input.shape, dtype=torch.bool)``. + +Example:: + + >>> input = tensor([[-3, -2, -1], [ 0, 1, 2]]) + >>> input + tensor([[-3, -2, -1], + [ 0, 1, 2]]) + >>> mask = tensor([[ True, False, True], [False, False, False]]) + >>> mask + tensor([[ True, False, True], + [False, False, False]]) + >>> torch._masked.var(input, 1, False, mask=mask) + tensor([1., nan]) +""" From e785c0a1ab689737ae3c202342c4a729f9e89bcf Mon Sep 17 00:00:00 2001 From: Pearu Peterson Date: Wed, 16 Feb 2022 18:49:18 -0800 Subject: [PATCH 123/199] Enable Half/BFloat16 support for to_dense and coalesce methods. 
(#72397) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72397 Test Plan: Imported from OSS Reviewed By: jbschlosser, zou3519 Differential Revision: D34286114 Pulled By: cpuhrsch fbshipit-source-id: a4f7e2abc3b2d37437cbd09d693c1b409bb011b9 (cherry picked from commit 74f94447fcf12ff7c740e1008c84d0df9ec9e1f5) --- aten/src/ATen/native/cpu/BlasKernel.cpp | 4 +-- aten/src/ATen/native/sparse/SparseTensor.cpp | 3 +- .../ATen/native/sparse/SparseTensorMath.cpp | 4 +-- .../sparse/cuda/SparseCUDATensorMath.cu | 4 +-- test/test_sparse.py | 35 ++++++------------- test/test_sparse_csr.py | 3 +- 6 files changed, 19 insertions(+), 34 deletions(-) diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index c5c938818d0..6b00c78195b 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -190,7 +190,7 @@ void cpublas_gemm_impl( } void cpublas_axpy_impl(at::ScalarType type, int64_t n, const Scalar& _a, const void *_x, int64_t incx, void *_y, int64_t incy){ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(type, "cpublas_axpy_impl", + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::kHalf, at::kBFloat16, type, "cpublas_axpy_impl", [&] { auto a = _a.to(); auto x = static_cast(_x); @@ -202,7 +202,7 @@ void cpublas_axpy_impl(at::ScalarType type, int64_t n, const Scalar& _a, const v } void cpublas_copy_impl(at::ScalarType type, int64_t n, const void *_x, int64_t incx, void *_y, int64_t incy){ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(type, "cpublas_copy_impl", + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::kHalf, at::kBFloat16, type, "cpublas_copy_impl", [&] { auto x = static_cast(_x); auto y = static_cast(_y); diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 814acad4c7f..6de64bfbf2c 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -640,7 +640,8 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) { auto indicesBufferAccessor = indicesBuffer.accessor(); int64_t i = -1; - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(values.scalar_type(), "coalesce", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, values.scalar_type(), + "coalesce", [&] { int64_t prev = -1; int64_t blockSize = values.stride(0); scalar_t* values_ptr = values.data_ptr(); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 611154fdee2..c23486336a1 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -676,7 +676,7 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen dstBuffer.add_(srcBuffer, value); } } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::Bool, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(at::ScalarType::Bool, at::ScalarType::BFloat16, at::ScalarType::Half, commonDtype, "add_dense_sparse", [&] { add_dense_sparse_worker_cpu(resultBuffer, value, sparse, indices, valuesBuffer); }); @@ -781,7 +781,7 @@ SparseTensor& mul_out_sparse_cpu(const Tensor& t_, const Tensor& src_, SparseTen s_i++; } } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX( + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, commonDtype, "mul_out_sparse", [&] { auto r_accessor = r_buffer.accessor(); auto t_accessor = t_values.accessor(); diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu 
b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu index 0d99e298ec9..9dbf562300f 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensorMath.cu @@ -503,8 +503,8 @@ SparseTensor& mul_out_sparse_cuda(const SparseTensor& t_, const SparseTensor& sr TORCH_CHECK(cuda::getApplyGrid(valueSize, grid, curDevice), "mul: Argument #0: tensor too large or too many dimensions"); Tensor resultNnz = at::empty({1}, CUDA(kLong)); - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Half, commonDtype, "mul_out_sparse_cuda", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, commonDtype, "mul_out_sparse_cuda", [&] { apply::valueSparseIntersectionKernel<<>>( TensorMulOp(), I_INFO(r_indices_), I_INFO(t_indices_), I_INFO(s_indices_), diff --git a/test/test_sparse.py b/test/test_sparse.py index 17f86f3f0a6..0ad1a91b56b 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -299,24 +299,18 @@ class TestSparse(TestCase): RuntimeError, lambda: self.sparse_tensor(indices, values, torch.Size([2, 4, 2, 1]))) - @dtypes(*floating_and_complex_types_and(torch.float16)) + @dtypes(*floating_and_complex_types_and(torch.float16, torch.bfloat16)) def test_to_dense(self, device, dtype): def test_tensor(x, res): x.to_dense() # Tests triple to_dense for memory corruption x.to_dense() x.to_dense() - # We dont have to_dense for half types, so we don't request - # exact_dtype if res.type is torch.float16. dense_x = x.to_dense() safe_dense_x = self.safeToDense(x) - if (res.dtype == torch.float16): - exact_dtype = False - else: - exact_dtype = True - dense_x = dense_x.to(res.dtype) - safe_dense_x = safe_dense_x.to(res.dtype) - self.assertEqual(res, dense_x, exact_dtype=exact_dtype) - self.assertEqual(res, safe_dense_x, exact_dtype=exact_dtype) + dense_x = dense_x.to(res.dtype) + safe_dense_x = safe_dense_x.to(res.dtype) + self.assertEqual(res, dense_x) + self.assertEqual(res, safe_dense_x) def fn(x): return x.to_dense() @@ -360,16 +354,9 @@ class TestSparse(TestCase): res = torch.empty((3, 4, 5, 0), dtype=dtype, device=device) test_tensor(x, res) - # half tensors on cpu don't implement to_dense, so need to convert to float - def _to_dense_half_safe(self, tensor): - if(tensor.dtype == torch.half and tensor.device.type == 'cpu'): - return tensor.to(torch.float).to_dense().to(torch.half) - else: - return tensor.to_dense() - @coalescedonoff @skipIfRocm - @dtypes(torch.float16, torch.float64, torch.int, torch.cfloat, torch.cdouble) + @dtypes(torch.float16, torch.bfloat16, torch.float64, torch.int, torch.cfloat, torch.cdouble) def test_to_sparse(self, device, dtype, coalesced): shape = [5, 2, 10, 4] max_nnz = 1 @@ -382,9 +369,9 @@ class TestSparse(TestCase): coalesced=coalesced) expected = expected.to(dtype) - d = self._to_dense_half_safe(expected) + d = expected.to_dense() result = d.to_sparse(dim) - self.assertEqual(d, self._to_dense_half_safe(result)) # == not implemented for sparse tensors yet + self.assertEqual(d, result.to_dense()) self.assertEqual(expected.size(), result.size()) self.assertEqual(dim, result.sparse_dim()) @@ -1990,8 +1977,7 @@ class TestSparse(TestCase): sparse_tensor.requires_grad_() @coalescedonoff - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, - include_bfloat16=False, include_complex=False)) + @dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) def test_log1p(self, device, dtype, coalesced): if coalesced: input_coalesced = 
torch.sparse_coo_tensor( @@ -2134,8 +2120,7 @@ class TestSparse(TestCase): op(sparse_tensor) @coalescedonoff - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, - include_bfloat16=False, include_complex=False)) + @dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_complex=False)) def test_asin_arcsin(self, device, dtype, coalesced): if coalesced: input_coalesced = torch.sparse_coo_tensor( diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py index 3ba12fe4d07..5ca01171824 100644 --- a/test/test_sparse_csr.py +++ b/test/test_sparse_csr.py @@ -1370,7 +1370,6 @@ class TestSparseCSR(TestCase): self.assertEqual(actual.col_indices(), expect.col_indices()) self.assertEqual(actual._nnz(), expect._nnz()) - @unittest.expectedFailure @ops(sparse_csr_unary_ufuncs, dtypes=OpDTypes.supported, allowed_dtypes=[torch.double, torch.cdouble]) def test_autograd_sparse_csr_unary(self, device, dtype, op): @@ -1486,7 +1485,7 @@ class TestSparseCSR(TestCase): args = [make_tensor(a.shape, device=device, dtype=dtype, noncontiguous=True, requires_grad=True) for a in sample.args] self.assertTrue(torch.autograd.gradcheck(fn, args, fast_mode=True)) - @dtypes(*get_all_dtypes(include_bool=False, include_half=False, include_bfloat16=False)) + @dtypes(*get_all_dtypes(include_bool=False)) def test_direct_coo_csr_conversion(self, device, dtype): for m, n in itertools.product([5, 2, 0], [5, 2, 0]): size = (m, n) From 5ea74b4996d4dd9ed52263f12b2e7777c7e0f661 Mon Sep 17 00:00:00 2001 From: Don Jang Date: Wed, 16 Feb 2022 21:03:47 -0800 Subject: [PATCH 124/199] [Static Runtime] Remove ProcessedNode::num_outputs_ (#72592) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72592 Only code paths that are not perf-critical read `ProcessedNode::num_outputs_` and also its static feature of the op that `ProcessedNode` instance is executing. Therefore, it's better to move `ProcessedNode::num_outputs_` into `ProcessedFunction::num_outputs_` and let `ProcessedNode` access it via `ProcessedNode::fn_` for its occasional use. Note that this prevents duplicating num_outputs_ per node & per Static Runtime instance since `ProcessedFunction` instances are shared across all runtime instances. 
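For orientation, a minimal, hedged sketch of the layout change described above, using made-up `FunctionSketch`/`NodeSketch` stand-ins rather than the real `ProcessedFunction`/`ProcessedNode` classes: the output count is stored once in the shared function entry and the node reads it through its function pointer; printing `sizeof` of the slimmed-down node struct is one quick way to sanity-check the kind of size reduction reported below.

```
// Illustrative sketch only; simplified stand-ins, not the Static Runtime classes.
#include <cstddef>
#include <cstdint>
#include <cstdio>

struct FunctionSketch {
  std::size_t num_outputs = 0;  // stored once, shared by every runtime instance
};

struct NodeSketch {
  const FunctionSketch* fn = nullptr;  // unowned pointer to the shared entry
  std::uint16_t outputs_offset = 0;    // index into the runtime's flat value buffer
  // No per-node output count; it is read through the shared entry on demand.
  std::size_t num_outputs() const { return fn->num_outputs; }
};

int main() {
  std::printf("sizeof(NodeSketch) = %zu bytes\n", sizeof(NodeSketch));
  return 0;
}
```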
It's confirmed that this change reduces the `sizeof(ProcessedNode)` by 14% from local instrumentation as follows: - Before -- sizeof(ProcessedNode): 56 - After -- sizeof(Processednode): 48 Test Plan: `buck test //caffe2/benchmarks/static_runtime:static_runtime_cpptest` Reviewed By: mikeiovine Differential Revision: D33984792 fbshipit-source-id: e29ffc97b799e679215f42e1e85cd3fcd7e88983 (cherry picked from commit 0f7003f4dfd6473a70355ca3c6f51498abf1d7be) --- torch/csrc/jit/runtime/static/impl.cpp | 23 ++++++++------------ torch/csrc/jit/runtime/static/impl.h | 30 +++++++++++++++----------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 54a33ed0b12..f8984129582 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -1734,7 +1734,8 @@ ProcessedFunction::ProcessedFunction( Node* node, bool enable_out_variant, bool check_memory_overlap) - : check_memory_overlap_(check_memory_overlap) { + : check_memory_overlap_(check_memory_overlap), + num_outputs_(node->outputs().size()) { if (enable_out_variant) { f_ = getOutOfPlaceOperation(node); if (f_) { @@ -1791,13 +1792,7 @@ ProcessedNode::ProcessedNode( fn_(fn), inputs_(std::move(inputs)), outputs_offset_(outputs_offset) { - TORCH_CHECK( - node->outputs().size() < (1 << (sizeof(num_outputs_) * 8)), - node->outputs().size(), - " outputs to ProcessedNode ", - node->kind().toQualString(), - " is too many to use 2-byte indexing"); - num_outputs_ = node->outputs().size(); + TORCH_CHECK(num_outputs() == node->outputs().size()); } std::vector ProcessedNode::inputs_ivalue_vec() const { @@ -1869,12 +1864,12 @@ bool ProcessedNode::verify_no_memory_overlap(bool force_check) const { } bool ProcessedNode::verify_outputs_dont_overlap_each_other() const { - for (const auto i : c10::irange(num_outputs_)) { + for (const auto i : c10::irange(num_outputs())) { if (!Output(i).isTensor()) { continue; } const auto& out0_t = Output(i).toTensor(); - for (const auto j : c10::irange(i + 1, num_outputs_)) { + for (const auto j : c10::irange(i + 1, num_outputs())) { if (!Output(j).isTensor()) { continue; } @@ -1894,7 +1889,7 @@ bool ProcessedNode::verify_inputs_dont_overlap_outputs(bool force_check) const { // skip memory overlap check for mutable or view ops with only one output bool skip_check = !schema || ((schema->is_mutable() || !fn_->checkMemoryOverlap()) && - num_outputs_ == 1); + num_outputs() == 1); if (!force_check && skip_check) { if (!schema) { VLOG(2) << "Detected that op schema is null"; @@ -1902,7 +1897,7 @@ bool ProcessedNode::verify_inputs_dont_overlap_outputs(bool force_check) const { } VLOG(2) << "schema->is_mutable: " << schema->is_mutable() << ", fn_->checkMemoryOverlap: " << fn_->checkMemoryOverlap() - << ", num_outputs_: " << num_outputs_; + << ", num_outputs_: " << num_outputs(); return true; } @@ -1912,7 +1907,7 @@ bool ProcessedNode::verify_inputs_dont_overlap_outputs(bool force_check) const { continue; } const auto& in_t = in->toTensor(); - for (const auto j : c10::irange(num_outputs_)) { + for (const auto j : c10::irange(num_outputs())) { const IValue& out = Output(j); if (!out.isTensor()) { continue; @@ -1949,7 +1944,7 @@ void ProcessedNode::verify_and_correct_memory_overlap() { continue; } const auto& in_t = in.toTensor(); - for (const auto j : c10::irange(num_outputs_)) { + for (const auto j : c10::irange(num_outputs())) { auto& output = Output(j); if (output.isTensor()) { 
check_and_correct_overlap_with(in_t, output); diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index c0b4f5b0b2d..27fcf2e5a24 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -752,10 +752,15 @@ class TORCH_API ProcessedFunction { return check_memory_overlap_; } + size_t num_outputs() const { + return num_outputs_; + } + private: std::function f_; Kind kind_{ProcessedFunction::Kind::kOutVariant}; bool check_memory_overlap_{false}; + size_t num_outputs_{0}; }; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) @@ -777,10 +782,9 @@ class TORCH_API ProcessedNode { ProcessedNode(const ProcessedNode& other) : node_(other.node_), fn_(other.fn_), - overlap_detected_(other.overlap_detected_), inputs_(other.inputs_), outputs_offset_(other.outputs_offset_), - num_outputs_(other.num_outputs_), + overlap_detected_(other.overlap_detected_), values_(other.values_), // It doesn't really make sense to copy block runners, // each processed node needs its own. This is OK to do @@ -797,10 +801,9 @@ class TORCH_API ProcessedNode { } node_ = other.node_; fn_ = other.fn_; - overlap_detected_ = other.overlap_detected_; inputs_ = other.inputs_; outputs_offset_ = other.outputs_offset_; - num_outputs_ = other.num_outputs_; + overlap_detected_ = other.overlap_detected_; values_ = other.values_; block_runners_ = nullptr; return *this; @@ -825,21 +828,23 @@ class TORCH_API ProcessedNode { // Output is readwrite IValue& Output(uint32_t i) { - DCHECK(i < num_outputs_); + DCHECK(i < num_outputs()); return values_[outputs_offset_ + i]; } C10_NODISCARD const IValue& Output(uint32_t i) const { - DCHECK(i < num_outputs_); + DCHECK(i < num_outputs()); return values_[outputs_offset_ + i]; } - C10_NODISCARD c10::ArrayRef outputs() const { - return c10::ArrayRef(values_ + outputs_offset_, num_outputs_); + size_t num_outputs() const { + DCHECK(fn_ != nullptr); + return fn_->num_outputs(); } - C10_NODISCARD auto num_outputs() const { - return num_outputs_; + C10_NODISCARD c10::ArrayRef outputs() const { + return c10::ArrayRef( + values_ + outputs_offset_, num_outputs()); } C10_NODISCARD uint16_t num_inputs() const { @@ -885,7 +890,7 @@ class TORCH_API ProcessedNode { } C10_NODISCARD uint16_t output_ivalue_index(uint16_t i) const { - DCHECK(i < num_outputs_); + DCHECK(i < num_outputs()); return outputs_offset_ + i; } // used in debug mode @@ -907,10 +912,9 @@ class TORCH_API ProcessedNode { Node* node_; const ProcessedFunction* fn_; - bool overlap_detected_{false}; ProcessedNodeInputs inputs_; uint16_t outputs_offset_; - uint16_t num_outputs_; + bool overlap_detected_{false}; IValue* values_ = nullptr; // unowned // For control flow; processed nodes may have sub-blocks which can // be executed by op implementations. 
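Tying the patch above together, a self-contained, hedged sketch (hypothetical names; plain pointers and `std::vector` in place of `IValue*` and `c10::ArrayRef`) of how a node's outputs become a borrowed view over the runtime-owned flat value buffer, computed from `outputs_offset_` plus the count read through the shared function entry:

```
#include <cassert>
#include <cstddef>
#include <vector>

struct Value { int payload = 0; };           // stand-in for IValue

struct SharedFn { std::size_t num_outputs = 0; };

struct Node {
  const SharedFn* fn = nullptr;              // shared, static per-op data
  std::size_t outputs_offset = 0;            // where this node's outputs start

  std::size_t num_outputs() const { return fn->num_outputs; }

  // A borrowed view over the buffer; nothing is copied or stored per node.
  const Value* outputs(const std::vector<Value>& values) const {
    return values.data() + outputs_offset;
  }
};

int main() {
  std::vector<Value> values(8);
  SharedFn fn{/*num_outputs=*/2};
  Node node{&fn, /*outputs_offset=*/4};
  assert(node.num_outputs() == 2);
  assert(node.outputs(values) == values.data() + 4);
  return 0;
}
```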
From 540cb5fee2d59cb7eea9d35d4ff006d889503277 Mon Sep 17 00:00:00 2001 From: Jordan Fix Date: Wed, 16 Feb 2022 22:31:51 -0800 Subject: [PATCH 125/199] [graph_manipulation] Unpack list of outputs (#72940) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72940 att Reviewed By: jackm321 Differential Revision: D34282062 fbshipit-source-id: 743710c18e1f38286d1b91c91868bb22c760f3ca (cherry picked from commit fd2bdd189d4587cd78949f27fce674701f42be18) --- torch/fx/passes/graph_manipulation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torch/fx/passes/graph_manipulation.py b/torch/fx/passes/graph_manipulation.py index 4c429e42421..9718e7b72f5 100644 --- a/torch/fx/passes/graph_manipulation.py +++ b/torch/fx/passes/graph_manipulation.py @@ -451,9 +451,9 @@ def serialize_module(fx_module: GraphModule, weights: Dict, name_prefix="") -> D get_output_arg_info, ) - # If there're multiple outputs then node_rep["args"][0] will be a tuple. - # In this case we want to unpack the tuple. - if isinstance(node_rep["args"][0], tuple): + # If there're multiple outputs then node_rep["args"][0] will be a tuple or + # list. In this case we want to unpack the tuple or list. + if isinstance(node_rep["args"][0], (tuple, list)): node_rep["args"] = node_rep["args"][0] else: node_rep["args"] = map_aggregate(node.args, get_arg_info) From cee84f4051fd6c348e48fdc5ae5b9ce7138f51ae Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Wed, 16 Feb 2022 23:05:27 -0800 Subject: [PATCH 126/199] fix model dump for the lowered module (#72866) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72866 https://github.com/pytorch/pytorch/pull/71597 adds a wrapper `torch.jit.LoweredWrapper` and it breaks the model dump. Fix the model_dump in the notebook ghstack-source-id: 149311636 Test Plan: CI and test with N509022 Before: {F701413403} After: {F701412963} Reviewed By: iseeyuan Differential Revision: D34247216 fbshipit-source-id: 695b02b03675fae596bb450441b327e4cdcffe9c (cherry picked from commit d46a82a4c125722ca17f67b576b2aec523de1f15) --- torch/utils/model_dump/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torch/utils/model_dump/__init__.py b/torch/utils/model_dump/__init__.py index 64287faf2a1..23e3682f645 100644 --- a/torch/utils/model_dump/__init__.py +++ b/torch/utils/model_dump/__init__.py @@ -120,7 +120,11 @@ def hierarchical_pickle(data): } if isinstance(data, torch.utils.show_pickle.FakeObject): typename = f"{data.module}.{data.name}" - if typename.startswith("__torch__.") or typename.startswith("torch.jit.LoweredModule."): + if ( + typename.startswith("__torch__.") or + typename.startswith("torch.jit.LoweredWrapper.") or + typename.startswith("torch.jit.LoweredModule.") + ): assert data.args == () return { "__module_type__": typename, From c22b8a42e6038ed2f6a161114cf3d8faac3f6e9a Mon Sep 17 00:00:00 2001 From: Facebook Community Bot Date: Thu, 17 Feb 2022 00:32:33 -0800 Subject: [PATCH 127/199] Automated submodule update: FBGEMM (#72805) Summary: This is an automated pull request to update the first-party submodule for [pytorch/FBGEMM](https://github.com/pytorch/FBGEMM). New submodule commit: https://github.com/pytorch/FBGEMM/commit/f4e0fcd9d5f7395fd7d97feb748c83168deb7b84 Pull Request resolved: https://github.com/pytorch/pytorch/pull/72805 Test Plan: Ensure that CI jobs succeed on GitHub before landing. 
Reviewed By: jasonjk-park Differential Revision: D34218445 fbshipit-source-id: a4ed3bbe0bb36eedddfedc0babce3b65684957d0 (cherry picked from commit fc38eef46b90e0390e0c3c35a077019e292976ff) --- third_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/fbgemm b/third_party/fbgemm index 1ddff63cd3a..365abe3ee87 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 1ddff63cd3a99bdd8f52e8147dbfe723522d2f48 +Subproject commit 365abe3ee878b2592e9a33f937d96df0048d99dd From e0e1e0b114d57542e65a054d05a81618e786fe18 Mon Sep 17 00:00:00 2001 From: zsef123 Date: Thu, 17 Feb 2022 06:28:46 -0800 Subject: [PATCH 128/199] Fix empty tensor handling in RReLU (#70496) Summary: Fixes https://github.com/pytorch/pytorch/issues/70489 Add handling if `numel == 0` Pull Request resolved: https://github.com/pytorch/pytorch/pull/70496 Reviewed By: zou3519, cpuhrsch Differential Revision: D34286069 Pulled By: jbschlosser fbshipit-source-id: a63797fe1ea05e5a192bc8e43327949b36ceebf2 (cherry picked from commit b410abe85e6554406550ed5cb36ffbc417214d66) --- aten/src/ATen/native/cuda/RreluWithNoise.cu | 7 +++++++ test/test_nn.py | 15 +++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/aten/src/ATen/native/cuda/RreluWithNoise.cu b/aten/src/ATen/native/cuda/RreluWithNoise.cu index 048118cf792..b73097758fd 100644 --- a/aten/src/ATen/native/cuda/RreluWithNoise.cu +++ b/aten/src/ATen/native/cuda/RreluWithNoise.cu @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { namespace native { @@ -132,6 +133,12 @@ Tensor& rrelu_with_noise_out_cuda(const Tensor& self, bool training, c10::optional generator, Tensor& output) { + at::native::resize_output(output, self.sizes()); + + if (self.numel() == 0) { + return output; + } + TensorArg self_arg{self, "self", 1}, noise_arg{noise, "noise", 2}, output_arg{output, "output", 3}; checkAllSameGPU("rrelu_with_noise_out_cuda", {self_arg, noise_arg, output_arg}); diff --git a/test/test_nn.py b/test/test_nn.py index cdfddcc2d21..abea2592aee 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -14707,6 +14707,21 @@ class TestNNDeviceType(NNTestCase): with self.assertRaises(RuntimeError): torch.nn.functional.one_hot(torch.tensor([3, 4, 1, 0], device=device), -2) + def test_nn_empty(self, device): + # One off tests to ensure scalars from nn.yaml are properly applied + def verify_scalars(input, output): + self.assertEqual(input.shape, output.shape) + self.assertEqual(0, output.numel()) + + for input_shape in [(0), (0, 2)]: + for module in [torch.nn.ELU, torch.nn.Hardtanh, torch.nn.LeakyReLU, torch.nn.LogSigmoid, + torch.nn.RReLU, torch.nn.Softshrink, torch.nn.Softplus, torch.nn.Sigmoid, + torch.nn.Tanh]: + input = torch.randn(input_shape, device=device, requires_grad=True) + m = module() + output = m(input) + verify_scalars(input, output) + def test_nn_scalars(self, device): # One off tests to ensure scalars from nn.yaml are properly applied def verify_scalars(input, output): From fc832d476df0eb03c94ffe022ef8fc0b87198a09 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 17 Feb 2022 08:13:00 -0800 Subject: [PATCH 129/199] gitignore tools/bazel executable (#72878) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72878 Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Reviewed By: anjali411 Differential Revision: D34252470 Pulled By: ezyang fbshipit-source-id: 5b4d6738c2fed7c1acc860fd9addaca8a24fa937 (cherry picked from commit 5aa28474a262859a0b543e14f53691650c5752ed) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 71e9d56255e..4a332afb8d0 100644 --- a/.gitignore +++ b/.gitignore @@ -255,6 +255,9 @@ cmake-build-debug # # Below files are not deleted by "setup.py clean". +# Downloaded bazel +tools/bazel + # Visual Studio Code files .vs /.vscode/* From ec8d6777255821bed73b471eadddde068cd60c0b Mon Sep 17 00:00:00 2001 From: Don Jang Date: Thu, 17 Feb 2022 08:47:33 -0800 Subject: [PATCH 130/199] [Static Runtime] Add a script to auto-generate out variant dispatchers (#72602) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72602 This change adds a script to auto-generate out variant dispatchers from `caffe2/aten/src/ATen/native/native_functions.yaml` for Static Runtime. Test Plan: The following diff, D33373928, adds out variant dispatchers generated by this diff. Reviewed By: mikeiovine Differential Revision: D33373919 fbshipit-source-id: 1f6d0766c561c405a2d61d0171064f39e3d15a79 (cherry picked from commit b3a1923331808f119b65776452786278dd66769f) --- tools/codegen/static_runtime/__init__.py | 0 tools/codegen/static_runtime/config.py | 138 ++++++++ .../static_runtime/gen_static_runtime_ops.py | 148 +++++++++ .../codegen/static_runtime/gen_structured.py | 310 ++++++++++++++++++ 4 files changed, 596 insertions(+) create mode 100644 tools/codegen/static_runtime/__init__.py create mode 100644 tools/codegen/static_runtime/config.py create mode 100644 tools/codegen/static_runtime/gen_static_runtime_ops.py create mode 100644 tools/codegen/static_runtime/gen_structured.py diff --git a/tools/codegen/static_runtime/__init__.py b/tools/codegen/static_runtime/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tools/codegen/static_runtime/config.py b/tools/codegen/static_runtime/config.py new file mode 100644 index 00000000000..959c16e8b37 --- /dev/null +++ b/tools/codegen/static_runtime/config.py @@ -0,0 +1,138 @@ +from tools.codegen.model import NativeFunctionsGroup + +from typing import Dict + + +def func_name_base_str(g: NativeFunctionsGroup) -> str: + return str(g.functional.func.name.name.base) + +is_hand_written_ops_ = frozenset(("add", "addmm", "all", "any", "argmin", "bmm", "clamp", + "cumsum", "div", "fmod", "leaky_relu", "log", "mul", "pow", + "remainder", "sigmoid", "sign", "sub", "tanh")) + +def is_hand_written(g: NativeFunctionsGroup) -> bool: + name_base = func_name_base_str(g) + return name_base in is_hand_written_ops_ + +def override_test_values(arg_map: Dict[str, str], op_name: str, index: int) -> None: + assert index == 0 or index == 1 + if op_name == "addmv": + if index == 0: + arg_map["self"] = "at::rand({2})" + arg_map["mat"] = "at::rand({2, 2})" + arg_map["vec"] = "at::rand({2})" + else: + arg_map["self"] = "at::rand({35})" + arg_map["mat"] = "at::rand({35, 35})" + arg_map["vec"] = "at::rand({35})" + return + if op_name == "acosh": + if index == 0: + arg_map["self"] = "at::rand({2, 2, 2}) + at::ones({2, 2, 2})" + else: + arg_map["self"] = "at::rand({5, 5, 5}) + at::ones({5, 5, 5})" + return + if op_name == "index_add": + if index == 0: + arg_map["self"] = "at::rand({2})" + arg_map["dim"] = "0" + arg_map["index"] = "at::randint(0, 1, {2}, at::kInt)" + arg_map["source"] = "at::rand({2})" + arg_map["alpha"] = "2" + else: + 
arg_map["self"] = "at::rand({16})" + arg_map["dim"] = "0" + arg_map["index"] = "at::randint(0, 10, {16}, at::kInt)" + arg_map["source"] = "at::rand({16})" + arg_map["alpha"] = "2" + return + if op_name == "adaptive_max_pool2d_backward": + if index == 0: + arg_map["grad_output"] = "at::randint(-3, 2, {2,2,2})" + arg_map["self"] = "at::randint(-3, 2, {2,2,2})" + arg_map["indices"] = "at::randint(0, 1, {2,2,2}, at::kLong)" + else: + arg_map["grad_output"] = "at::randint(-3, 3, {3,3,3})" + arg_map["self"] = "at::randint(-3, 2, {3,3,3})" + arg_map["indices"] = "at::randint(0, 1, {3,3,3}, at::kLong)" + return + if op_name == "adaptive_max_pool3d_backward": + if index == 0: + arg_map["grad_output"] = "at::randint(-3, 2, {2,2,2,2})" + arg_map["self"] = "at::randint(-3, 2, {2,2,2,2})" + arg_map["indices"] = "at::randint(0, 1, {2,2,2,2}, at::kLong)" + else: + arg_map["grad_output"] = "at::randint(-3, 3, {3,3,3,3})" + arg_map["self"] = "at::randint(-3, 2, {3,3,3,3})" + arg_map["indices"] = "at::randint(0, 1, {3,3,3,3}, at::kLong)" + return + if op_name == "gather": + if index == 0: + arg_map["self"] = "at::randint(1, 100, {2,2,2}, at::kInt)" + arg_map["dim"] = "1" + arg_map["index"] = "at::randint(0, 1, {2,2,2}, torch::kInt64)" + arg_map["sparse_grad"] = "false" + else: + arg_map["self"] = "at::randint(1, 100, {5,5,5}, at::kInt)" + arg_map["dim"] = "1" + arg_map["index"] = "at::randint(0, 4, {5,5,5}, torch::kInt64)" + arg_map["sparse_grad"] = "false" + return + if op_name == "nll_loss_backward": + if index == 0: + arg_map["grad_output"] = "at::rand({})" + arg_map["self"] = "at::rand({6})" + arg_map["target"] = "at::randint(0, 5, {6}, torch::kInt64)" + arg_map["weight"] = "at::rand({6})" + arg_map["reduction"] = "1" + arg_map["ignore_index"] = "1" + arg_map["total_weight"] = "at::rand({})" + else: + arg_map["grad_output"] = "at::rand({})" + arg_map["self"] = "at::rand({36})" + arg_map["target"] = "at::randint(0, 11, {36}, torch::kInt64)" + arg_map["weight"] = "at::rand({36})" + arg_map["reduction"] = "1" + arg_map["ignore_index"] = "1" + arg_map["total_weight"] = "at::rand({})" + return + if op_name in ["scatter", "scatter_add", "_scatter_reduce"]: + if index == 0: + arg_map["self"] = "at::randint(1, 100, {2,2,2}, torch::kInt64)" + arg_map["index"] = "at::randint(0, 1, {2,2,2}, torch::kInt64)" + arg_map["src"] = "at::randint(1, 100, {2,2,2}, torch::kInt64)" + else: + arg_map["self"] = "at::randint(1, 100, {5,5,5}, torch::kInt64)" + arg_map["index"] = "at::randint(0, 1, {5,5,5}, torch::kInt64)" + arg_map["src"] = "at::randint(1, 100, {5,5,5}, torch::kInt64)" + if "reduce" in arg_map: + arg_map["reduce"] = "\"sum\"" if op_name == "_scatter_reduce" else "\"add\"" + return + if op_name == "special_zeta": + if index == 0: + arg_map["self"] = "at::rand({2,2,2}, at::kDouble) + at::ones({2,2,2})" + arg_map["other"] = "at::rand({2,2,2}, at::kDouble) + at::ones({2,2,2})" + else: + arg_map["self"] = "at::rand({5,5,5}, at::kDouble) + at::ones({5,5,5})" + arg_map["other"] = "at::rand({5,5,5}, at::kDouble) + at::ones({5,5,5})" + return + if op_name == "_convert_indices_from_csr_to_coo": + if index == 0: + arg_map["crow_indices"] = "torch::tensor({1}, torch::kInt32)" + arg_map["col_indices"] = "torch::tensor({0, 1, 0}, torch::kInt32)" + arg_map["out_int32"] = "false" + else: + arg_map["crow_indices"] = "torch::tensor({0, 1}, torch::kInt32)" + arg_map["col_indices"] = "torch::tensor({0, 1, 0, 2, 1, 2}, torch::kInt32)" + arg_map["out_int32"] = "false" + return + if op_name == "_convert_indices_from_coo_to_csr": + if 
index == 0: + arg_map["self"] = "at::randint(0, 3, {2}, at::kInt)" + arg_map["size"] = "10" + arg_map["out_int32"] = "false" + else: + arg_map["self"] = "at::randint(0, 3, {12}, at::kInt)" + arg_map["size"] = "24" + arg_map["out_int32"] = "false" + return diff --git a/tools/codegen/static_runtime/gen_static_runtime_ops.py b/tools/codegen/static_runtime/gen_static_runtime_ops.py new file mode 100644 index 00000000000..ab2eedb867a --- /dev/null +++ b/tools/codegen/static_runtime/gen_static_runtime_ops.py @@ -0,0 +1,148 @@ +from tools.codegen import gen +from tools.codegen.context import native_function_manager +from tools.codegen.model import NativeFunctionsGroup +from tools.codegen.static_runtime import gen_structured + +import argparse +import itertools +import os +from typing import Sequence + +# Given a list of `grouped_native_functions` sorted by their op names, return a list of +# lists each of which groups ops that share the base name. For example, `mean` and +# `mean.dim` are grouped together by this function. +def group_functions_by_op_name(grouped_native_functions: + Sequence[NativeFunctionsGroup]) -> Sequence[Sequence[NativeFunctionsGroup]]: + if not grouped_native_functions: + return [] + groups = [] + current_op_name = None + current_group = None + + def is_supported(g: NativeFunctionsGroup) -> bool: + with native_function_manager(g): + return gen_structured.is_supported(g) + + eligible_ops = (g for g in grouped_native_functions if is_supported(g)) + groups = [list(group) for k, group in (itertools.groupby(eligible_ops, key=lambda g: g.functional.func.name.name.base))] + return groups + +def clang_format(cpp_file_path: str) -> None: + import subprocess + subprocess.run(["clang-format", "-i", cpp_file_path]) + +def write_cpp(cpp_ops: Sequence[str], file_path: str) -> None: + code = "\n".join(cpp_ops) + generated = f"""// @lint-ignore-every CLANGTIDY HOWTOEVEN +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch {{ +namespace jit {{ + +{code} + +}} // namespace jit +}} // namespace torch +""" + with open(file_path, "w") as f: + f.write(generated) + clang_format(file_path) + + +def write_test_cpp(cpp_ops: Sequence[str], file_path: str) -> None: + code = "\n".join(cpp_ops) + generated = f"""// @lint-ignore-every CLANGTIDY HOWTOEVEN +#include +#include +#include + +#include "test_utils.h" + +using namespace caffe2; +using namespace torch; +using namespace torch::jit; +using namespace torch::jit::test; +using c10::IValue; + +{code} + +""" + with open(file_path, "w") as f: + f.write(generated) + clang_format(file_path) + +def main() -> None: + parser = argparse.ArgumentParser(description='Generate ATen source files') + parser.add_argument( + '-s', + '--source-path', + help='path to source directory for ATen', + default='aten/src/ATen') + parser.add_argument( + '-p', + '--generated-ops-cpp-path', + help='path to directory to generate op dispatcher .cpp file', + default='torch/csrc/jit/runtime/static/generated_ops.cpp') + parser.add_argument( + '-t', + '--generated-ops-test-cpp-path', + help='path to directory to generate op dispatcher .cpp file', + default='benchmarks/static_runtime/test_generated_ops.cc') + options = parser.parse_args() + native_yaml_path = os.path.join(options.source_path, 
'native/native_functions.yaml') + parsed_yaml = gen.parse_native_yaml(native_yaml_path) + native_functions, backend_indices = parsed_yaml.native_functions, parsed_yaml.backend_indices + grouped_native_functions = gen.get_grouped_native_functions(native_functions) + structured_native_functions = [g for g in grouped_native_functions + if isinstance(g, NativeFunctionsGroup)] + supported_function_groups = group_functions_by_op_name(structured_native_functions) + + gen_out_variant_dispatcher = gen_structured.GenOutVariantDispatcher() + result = [gen_out_variant_dispatcher(groups) for groups in supported_function_groups] + + gen_out_variant_dispatcher_test_case = gen_structured.GenOutVariantDispatcherTestCase() + test_result = [gen_out_variant_dispatcher_test_case(groups) for groups in supported_function_groups] + + write_cpp(result, options.generated_ops_cpp_path) + write_test_cpp(test_result, options.generated_ops_test_cpp_path) + + print("total grouped native ops: %d" % len(grouped_native_functions)) + print("structured grouped native ops: %d" % len(structured_native_functions)) + supported_grouped_functions = sum([len(groups) for groups in supported_function_groups]) + print("generated grouped native ops: %d" % supported_grouped_functions) + +if __name__ == '__main__': + main() diff --git a/tools/codegen/static_runtime/gen_structured.py b/tools/codegen/static_runtime/gen_structured.py new file mode 100644 index 00000000000..2db829e4fe7 --- /dev/null +++ b/tools/codegen/static_runtime/gen_structured.py @@ -0,0 +1,310 @@ +import tools.codegen.api.cpp as cpp +from tools.codegen.context import native_function_manager +from tools.codegen.model import (Argument, BaseTy, FunctionSchema, OptionalType, + SelfArgument, + BaseType, NativeFunctionsGroup, TensorOptionsArguments, Type) +from tools.codegen.static_runtime import config + +import math +from typing import List, Optional, Sequence, Tuple, Union + + +def has_alias(arguments: Sequence[Union[Argument, SelfArgument, TensorOptionsArguments]]) -> bool: + for arg in arguments: + annotation = getattr(arg, "annotation", None) + if not annotation: + continue + alias_set = getattr(annotation, "alias_set", ()) + if alias_set: + return True + return False + +def is_supported(g: NativeFunctionsGroup) -> bool: + if not g.structured: + return False + if config.is_hand_written(g): + return False + if has_alias(g.out.func.arguments.non_out): + # This op may create an alias of inputs. + return False + if len(g.out.func.arguments.out) > 1: + # More than 1 output values. + return False + if "at::Tensor &" != cpp.returns_type(g.out.func.returns).cpp_type(): + # Returns a non-Tensor value. + return False + for arg in g.out.func.schema_order_arguments(): + maybe_method = ivalue_type_conversion_method(arg.type) + if not maybe_method: + # Type converting is unsupported yet. + return False + return True + +def ivalue_type_conversion_method(arg_type: Union[BaseType, OptionalType, Type]) -> Optional[Tuple[bool, str]]: + """ + Return the method call expression of `c10::ivalue' to convert its contained value to + the expected value of `arg_type` type. For example, for `arg_type` == BaseTy.Tensor, + this function returns ".toTensor()", so that it can be appended to the ivalue's + variable name to get the value of the expected type. 
+ """ + type_conversion_methods = { + BaseTy.Tensor: ((True, "toTensor()"), (False, "toOptional()")), + BaseTy.int: ((False, "toInt()"), (False, "toOptional()")), + BaseTy.bool: ((False, "toBool()"), (False, "toOptional()")), + BaseTy.Scalar: ((False, "toScalar()"), (False, "toOptional()")), + BaseTy.ScalarType: ((False, "toScalarType()"), (False, "toOptional()")), + BaseTy.str: ((False, "toStringView()"), (False, "toOptional()"))} + + base_ty_object = None + if isinstance(arg_type, BaseType): + base_ty_object = arg_type.name + elif isinstance(arg_type, OptionalType): + assert isinstance(arg_type.elem, BaseType) + base_ty_object = arg_type.elem.name + else: + return None + + if base_ty_object not in type_conversion_methods: + return None + methods = type_conversion_methods[base_ty_object] + if isinstance(arg_type, BaseType): + return methods[0] + return methods[1] + +should_use_int_tensor_ops_ = frozenset(("bitwise_not", "bitwise_and", "bitwise_or", "bitwise_xor", "gcd", + "lcm", "scatter", "gather", "_convert_indices_from_coo_to_csr", + "_convert_indices_from_csr_to_coo")) + +def should_use_int_tensor(op_name: str) -> bool: + return op_name in should_use_int_tensor_ops_ + +test_tensor_dim_ops_1_ = frozenset(("addmv", "index_add", "_convert_indices_from_coo_to_csr", + "_convert_indices_from_csr_to_coo", "nll_loss_backward")) +test_tensor_dim_ops_2_ = frozenset(("addmm", "mm")) + +def test_tensor_dim(op_name: str) -> int: + if op_name in test_tensor_dim_ops_1_: + return 1 + if op_name in test_tensor_dim_ops_2_: + return 2 + return 3 + +def test_value_expression(arg_type: Union[BaseType, OptionalType, Type], index: int, op_name: str) -> str: + num_tensors = 16 if index == 0 else 64 + num_dim = test_tensor_dim(op_name) + size_per_dim = math.ceil(num_tensors / float(num_dim)) + size_per_dim += size_per_dim % 2 + tensor_size_ex = "{%s}" % (",".join([f"{size_per_dim}"] * num_dim)) + if should_use_int_tensor(op_name): + tensor_expression = f"at::randint(1, 100, {tensor_size_ex}, at::kInt)" + else: + tensor_expression = f"at::rand({tensor_size_ex})" + + value_expressions = { + BaseTy.Tensor: tensor_expression, + BaseTy.int: "1", + BaseTy.bool: "false", + BaseTy.Scalar: "2", + BaseTy.ScalarType: "at::ScalarType::Float", + BaseTy.str: "\"floor\""} + + base_ty_object = None + if isinstance(arg_type, BaseType): + base_ty_object = arg_type.name + else: + assert isinstance(arg_type, OptionalType) and isinstance(arg_type.elem, BaseType) + base_ty_object = arg_type.elem.name + assert base_ty_object in value_expressions, "not expected type" + value_expression = value_expressions[base_ty_object] + return value_expression + +def generate_test_value_definitions(g: NativeFunctionsGroup, index: int) -> str: + schema = g.functional.func + assert not schema.is_out_fn() + schema_name = schema.name.name.base + arg_map = {} + for arg in schema.schema_order_arguments(): + test_value_exp = test_value_expression(arg.type, index, schema_name) + arg_map[arg.name] = test_value_exp + config.override_test_values(arg_map, schema_name, index) + arg_populations = [] + for arg_name, arg_value in arg_map.items(): + arg_populations.append(f'auto {arg_name}{index} = {arg_value}') + return ";\n ".join(arg_populations) + ";" + +def generate_test_value_names(g: NativeFunctionsGroup, index: int) -> str: + schema = g.functional.func + assert not schema.is_out_fn() + return ",".join(f"{arg.name}{index}" for arg in schema.schema_order_arguments()) + +generate_test_ir_arguments_base_ty_to_type_str_ = { + BaseTy.Tensor: 'Tensor', 
BaseTy.int: 'int', BaseTy.float: 'float', + BaseTy.str: 'str', BaseTy.Scalar: 'int', BaseTy.ScalarType: 'int', + BaseTy.bool: 'bool'} + +def generate_test_ir_arguments(g: NativeFunctionsGroup) -> List[Tuple[str, Optional[str]]]: + def ir_argument(arg: Argument) -> Tuple[str, Optional[str]]: + t = arg.type + add_optional = False + if isinstance(t, OptionalType): + t = t.elem + add_optional = True + assert isinstance(t, BaseType) + type_str = None + if t.name in generate_test_ir_arguments_base_ty_to_type_str_: + type_str = generate_test_ir_arguments_base_ty_to_type_str_[t.name] + if type_str and add_optional: + type_str = f'{type_str}?' + return ("%" + arg.name, type_str) + + schema = g.functional.func + assert not schema.is_out_fn() + return [ir_argument(arg) for arg in schema.schema_order_arguments()] + +def generate_arg_extraction(g: NativeFunctionsGroup) -> str: + schema = g.functional.func + assert not schema.is_out_fn() + arg_populations = [] + for i, arg in enumerate(schema.schema_order_arguments()): + maybe_method = ivalue_type_conversion_method(arg.type) + assert maybe_method + is_reference, type_conversion_method = maybe_method + reference = "&" if is_reference else "" + arg_populations.append(f'const auto{reference} {arg.name} = p_node->Input({i}).{type_conversion_method}') + return ";\n ".join(arg_populations) + ";" + +def generate_non_out_variant_call(g: NativeFunctionsGroup) -> str: + schema = g.functional.func + assert not schema.is_out_fn() + arg_names = (arg.name for arg in schema.schema_order_arguments()) + return f'at::cpu::{cpp.name(schema)}({",".join(arg_names)})' + +def generate_out_variant_call(g: NativeFunctionsGroup) -> str: + schema = g.out.func + assert schema.is_out_fn() + arg_names = [out_arg.name for out_arg in schema.arguments.out] + for arg in schema.arguments.non_out: + if isinstance(arg, SelfArgument): + arg_names.append(arg.argument.name) + else: + assert isinstance(arg, Argument) + arg_names.append(arg.name) + cpp_func_name = cpp.name(schema) + cpp_arg_names = ",".join(arg_names) + return f'at::cpu::{cpp_func_name}({cpp_arg_names})' + + +def should_check_resize(schema: FunctionSchema) -> bool: + schema_str = str(schema) + type_variant_op_name = schema_str[: schema_str.find("(")] + return type_variant_op_name not in ("isin.Scalar_Tensor", "index_add") + + +def op_name_from_group(g: NativeFunctionsGroup) -> str: + return g.functional.func.name.name.base + + +class GenOutVariantDispatcher: + def __call__(self, groups: Sequence[NativeFunctionsGroup]) -> str: + if not groups: + return "" + generated_type_variants = [] + for g in groups: + with native_function_manager(g): + assert is_supported(g) + assert isinstance(g, NativeFunctionsGroup) + generated_type_variant = self.gen_structured(g) + generated_type_variants.append(generated_type_variant) + op_name = op_name_from_group(groups[0]) + body = "\n".join(generated_type_variants) + generated = f""" +REGISTER_OPERATOR_FUNCTOR( + aten::{op_name}, + aten_{op_name}, + [](Node* n) -> SROperator {{ + {body} + LogAndDumpSchema(n); + return nullptr; + }}); +""" + return generated + + def gen_structured(self, g: NativeFunctionsGroup) -> str: + functional = g.functional + schema = str(functional.func) + op_name = op_name_from_group(g) + populated_argument = generate_arg_extraction(g) + functional_variant_call = generate_non_out_variant_call(g) + assert len(g.out.func.arguments.out) == 1 + out_variable_name = str(g.out.func.arguments.out[0].name) + out_variant_call = generate_out_variant_call(g) + generated = f""" + if 
(n->matches(torch::schema("aten::{schema}"))) {{ + return [](ProcessedNode* p_node) {{ + {populated_argument} + if (p_node->Output(0).isNone()) {{ + p_node->Output(0) = {functional_variant_call}; + return; + }} + auto& {out_variable_name} = p_node->Output(0).toTensor(); + fastResizeToZero({out_variable_name}); + {out_variant_call}; + }}; + }}""" + return generated + + +class GenOutVariantDispatcherTestCase: + def __call__(self, groups: Sequence[NativeFunctionsGroup]) -> str: + if not groups: + return "" + generated_type_variants = [] + for g in groups: + with native_function_manager(g): + assert is_supported(g) + assert isinstance(g, NativeFunctionsGroup) + generated_type_variant = self.gen_structured_test_case(g) + generated_type_variants.append(generated_type_variant) + return "\n".join(generated_type_variants) + + def gen_structured_test_case(self, g: NativeFunctionsGroup) -> str: + functional = g.functional + schema = str(functional.func) + assert schema.find("(") > 0 + type_variant_op_name = schema[: schema.find("(")].replace(".", "_") + op_name = op_name_from_group(g) + assert type_variant_op_name.startswith(op_name) + + arg_types = generate_test_ir_arguments(g) + arg_declarations = ", ".join((arg_name if arg_type is None + else f"{arg_name}: {arg_type}" + for arg_name, arg_type in arg_types)) + arg_names = ", ".join((arg_name for arg_name, _ in arg_types)) + assert (len(functional.func.returns) == 1 and isinstance(functional.func.returns[0].type, BaseType) and + functional.func.returns[0].type.name is BaseTy.Tensor) + test_value_definitions = generate_test_value_definitions(g, 0) + test_value_names = generate_test_value_names(g, 0) + test_value_definitions2 = generate_test_value_definitions(g, 1) + test_value_names2 = generate_test_value_names(g, 1) + check_resize = "true" if should_check_resize(functional.func) else "false" + generated = f""" +TEST(StaticRuntime, autogen_{type_variant_op_name}) {{ + const std::string script = R"IR( + graph({arg_declarations}): + %bias: None = prim::Constant() + %ret = aten::{op_name}({arg_names}) + %cloned = aten::clone(%ret, %bias) + return (%cloned) + )IR"; + + {test_value_definitions} + std::vector args{{{test_value_names}}}; + testStaticRuntime(script, args, {{}}, /*use_allclose=*/false, /*use_equalnan=*/false, /*check_resize=*/{check_resize}); + + {test_value_definitions2} + std::vector args2{{{test_value_names2}}}; + testStaticRuntime(script, args, args2, /*use_allclose=*/false, /*use_equalnan=*/false, /*check_resize=*/{check_resize}); + +}} +""" + return generated From d1c5f9e43993157f1c66e835c98624e4daef8564 Mon Sep 17 00:00:00 2001 From: Mike Iovine Date: Thu, 17 Feb 2022 10:18:33 -0800 Subject: [PATCH 131/199] [JIT][SR] Introduce prim::IfThenElse (#72587) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72587 This pattern frequently appears in a few graphs: ``` %result = prim::If(%condition) block0(): -> (%a) block1(): -> (%b) ``` This is slow, particularly in static runtime. Static runtime creates memory planners/block runners for each sub-block, which eats up a lot of memory and introduces a lot of extra overhead for this relatively simple operation. 
This diff introduces a new op that replaces nodes like the above with a single op meant to act like a ternary operator: ``` %result = prim::IfThenElse(%condition, %a, %b) ``` Test Plan: New unit tests Reviewed By: eellison Differential Revision: D34091789 fbshipit-source-id: eb6a8c460c39b4c019a1f4ab1f3f1e5b6edc400c (cherry picked from commit 0f1b335e5b83f402bda2dcdd9ecb411e0b67c651) --- aten/src/ATen/core/interned_strings.h | 1 + .../static_runtime/test_static_runtime.cc | 16 ++++++ test/cpp/jit/CMakeLists.txt | 1 + test/cpp/jit/test_add_if_then_else.cpp | 53 ++++++++++++++++++ tools/build_variables.bzl | 1 + torch/csrc/jit/passes/add_if_then_else.cpp | 55 +++++++++++++++++++ torch/csrc/jit/passes/add_if_then_else.h | 11 ++++ .../runtime/profiling_graph_executor_impl.cpp | 7 +++ .../runtime/profiling_graph_executor_impl.h | 1 + torch/csrc/jit/runtime/register_prim_ops.cpp | 11 ++++ torch/csrc/jit/runtime/static/impl.cpp | 5 +- torch/csrc/jit/runtime/static/impl.h | 3 +- torch/csrc/jit/runtime/static/native_ops.cpp | 12 ++++ 13 files changed, 175 insertions(+), 2 deletions(-) create mode 100644 test/cpp/jit/test_add_if_then_else.cpp create mode 100644 torch/csrc/jit/passes/add_if_then_else.cpp create mode 100644 torch/csrc/jit/passes/add_if_then_else.h diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index b2d6a43731f..88f275093d1 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -96,6 +96,7 @@ namespace c10 { _(prim, With) \ _(prim, Enter) \ _(prim, Exit) \ + _(prim, IfThenElse) \ _(aten, Bool) \ _(aten, Int) \ _(aten, FloatImplicit) \ diff --git a/benchmarks/static_runtime/test_static_runtime.cc b/benchmarks/static_runtime/test_static_runtime.cc index bc923e707e1..c3e9a050ff1 100644 --- a/benchmarks/static_runtime/test_static_runtime.cc +++ b/benchmarks/static_runtime/test_static_runtime.cc @@ -2720,3 +2720,19 @@ TEST(StaticRuntime, ToList) { )JIT"; testStaticRuntime(src, {at::randn({2, 2})}); } + +TEST(StaticRuntime, IfThenElse) { + const auto src = R"IR( + graph(%cond: bool, %a: Tensor, %b: Tensor): + %none: NoneType = prim::Constant() + %c: Tensor = prim::IfThenElse(%cond, %a, %b) + %d: Tensor = aten::clone(%c, %none) + return (%d) + )IR"; + + std::vector args1{true, at::randn({1}), at::randn({1})}; + std::vector args2{false, at::randn({1}), at::randn({1})}; + + testStaticRuntime(src, args1); + testStaticRuntime(src, args2); +} diff --git a/test/cpp/jit/CMakeLists.txt b/test/cpp/jit/CMakeLists.txt index cfdbb28a676..7358af08582 100644 --- a/test/cpp/jit/CMakeLists.txt +++ b/test/cpp/jit/CMakeLists.txt @@ -39,6 +39,7 @@ endif() # Build the cpp gtest binary containing the cpp-only tests. 
set(JIT_TEST_SRCS + ${JIT_TEST_ROOT}/test_add_if_then_else.cpp ${JIT_TEST_ROOT}/test_alias_analysis.cpp ${JIT_TEST_ROOT}/test_argument_spec.cpp ${JIT_TEST_ROOT}/test_autodiff.cpp diff --git a/test/cpp/jit/test_add_if_then_else.cpp b/test/cpp/jit/test_add_if_then_else.cpp new file mode 100644 index 00000000000..4850e1ab425 --- /dev/null +++ b/test/cpp/jit/test_add_if_then_else.cpp @@ -0,0 +1,53 @@ +#include + +#include +#include +#include + +namespace torch { +namespace jit { + +TEST(AddIfThenElseOpTest, AddIfThenElseOpSimple) { + const auto src = R"IR( + graph(%cond: bool, %a: Tensor, %b: Tensor): + %result: Tensor = prim::If(%cond) + block0(): + -> (%a) + block1(): + -> (%b) + return (%result) + )IR"; + + auto graph = std::make_shared(); + parseIR(src, graph.get()); + EXPECT_TRUE(AddIfThenElseOp(graph)); + + testing::FileCheck() + .check_count("= prim::IfThenElse", 1, /*exactly*/ true) + ->check_count("= prim::If", 0, /*exactly*/ true) + ->run(*graph); +} + +TEST(AddIfThenElseOpTest, NoIfThenElseOpMultipleOutputs) { + const auto src = R"IR( + graph(%cond: bool, %a: Tensor, %b: Tensor): + %result1: Tensor, %result2: Tensor = prim::If(%cond) + block0(): + -> (%a, %b) + block1(): + -> (%b, %a) + return (%result1, %result2) + )IR"; + + auto graph = std::make_shared(); + parseIR(src, graph.get()); + EXPECT_FALSE(AddIfThenElseOp(graph)); + + testing::FileCheck() + .check_count("= prim::IfThenElse", 0, /*exactly*/ true) + ->check_count("= prim::If", 1, /*exactly*/ true) + ->run(*graph); +} + +} // namespace jit +} // namespace torch diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index c6a7e5a0791..67f2def297c 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -213,6 +213,7 @@ core_sources_full_mobile_no_backend_interface = [ "torch/csrc/jit/operator_upgraders/utils.cpp", "torch/csrc/jit/operator_upgraders/upgraders.cpp", "torch/csrc/jit/operator_upgraders/upgraders_entry.cpp", + "torch/csrc/jit/passes/add_if_then_else.cpp", "torch/csrc/jit/passes/annotate_warns.cpp", "torch/csrc/jit/passes/bailout_graph.cpp", "torch/csrc/jit/passes/batch_mm.cpp", diff --git a/torch/csrc/jit/passes/add_if_then_else.cpp b/torch/csrc/jit/passes/add_if_then_else.cpp new file mode 100644 index 00000000000..72a085fd021 --- /dev/null +++ b/torch/csrc/jit/passes/add_if_then_else.cpp @@ -0,0 +1,55 @@ +#include +#include + +namespace torch { +namespace jit { + +namespace { + +bool hasNoNodes(Block* block) { + auto nodes = block->nodes(); + return nodes.begin() == nodes.end(); +} + +bool hasTrivialSubBlocks(Node* node) { + const auto blocks = node->blocks(); + DCHECK_EQ(blocks.size(), 2); + + return hasNoNodes(blocks[0]) && hasNoNodes(blocks[1]); +} + +} // namespace + +bool AddIfThenElseOp(std::shared_ptr& graph) { + std::vector to_replace; + DepthFirstGraphNodeIterator graph_it(graph); + for (auto* node = graph_it.next(); node != nullptr; node = graph_it.next()) { + if (node->kind() != prim::If) { + continue; + } + if (node->outputs().size() != 1) { + continue; + } + if (hasTrivialSubBlocks(node)) { + to_replace.push_back(node); + } + } + + for (auto* node : to_replace) { + auto* if_then_else_node = graph->create(prim::IfThenElse, 1); + if_then_else_node->addInput(node->input()); + auto blocks = node->blocks(); + if_then_else_node->addInput(blocks[0]->return_node()->input()); + if_then_else_node->addInput(blocks[1]->return_node()->input()); + + if_then_else_node->insertBefore(node); + if_then_else_node->output()->copyMetadata(node->output()); + + 
node->output()->replaceAllUsesWith(if_then_else_node->output()); + node->destroy(); + } + return !to_replace.empty(); +} + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/passes/add_if_then_else.h b/torch/csrc/jit/passes/add_if_then_else.h new file mode 100644 index 00000000000..c6b3f9376d6 --- /dev/null +++ b/torch/csrc/jit/passes/add_if_then_else.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +namespace torch { +namespace jit { + +TORCH_API bool AddIfThenElseOp(std::shared_ptr& graph); + +} // namespace jit +} // namespace torch diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp index c0fc02e34d4..66a71a08596 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -650,6 +651,7 @@ const ExecutionPlan& ProfilingGraphExecutorImpl::getOptimizedPlanFor( // replaces a fallback graph inserted by // specialize_autogradzero if one exists replaceFallbackGraphWithFallbackFunction(copy->block()); + runFinalOptimizations(copy); GRAPH_DUMP("Optimized Graph: ", copy); optimized_plan_ = ExecutionPlan(copy, function_name_, *remaining_bailout_depth_); @@ -749,5 +751,10 @@ void ProfilingGraphExecutorImpl::replaceFallbackGraphWithFallbackFunction( } } +void ProfilingGraphExecutorImpl::runFinalOptimizations( + std::shared_ptr& graph) { + AddIfThenElseOp(graph); +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h index 560eaca2cc3..117873934db 100644 --- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.h +++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.h @@ -39,6 +39,7 @@ struct TORCH_API ProfilingGraphExecutorImpl : public GraphExecutorImplBase { std::shared_ptr& graph, size_t remaining_depth); void replaceFallbackGraphWithFallbackFunction(Block* b); + void runFinalOptimizations(std::shared_ptr& graph); std::unique_ptr pr_; c10::optional profiling_plan_; // plan to run in order to profiling the code diff --git a/torch/csrc/jit/runtime/register_prim_ops.cpp b/torch/csrc/jit/runtime/register_prim_ops.cpp index 0bf4f22aa7f..de8bc3ae86e 100644 --- a/torch/csrc/jit/runtime/register_prim_ops.cpp +++ b/torch/csrc/jit/runtime/register_prim_ops.cpp @@ -700,6 +700,17 @@ static const std::vector opGenArgs{ push(stack, at::stack(inputs, dim)); }, aliasAnalysisFromSchema()), + OperatorGeneratorArgs( + TORCH_SELECTIVE_SCHEMA( + "prim::IfThenElse(bool cond, Any(a) x, Any(b) y) -> Any(a|b)"), + [](Stack& stack) { + const auto cond = stack[stack.size() - 3].toBool(); + stack[stack.size() - 3] = + std::move(stack[stack.size() - (cond ? 
2 : 1)]); + stack.pop_back(); + stack.pop_back(); + }, + aliasAnalysisFromSchema()), OperatorGeneratorArgs( TORCH_SELECTIVE_SCHEMA( "aten::eq.enum(AnyEnumType a, AnyEnumType b) -> bool"), diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index f8984129582..595e428e535 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -173,6 +174,7 @@ void OptimizeGraph( UseVariadicGroupedAccessor(graph); EliminateNoOps( graph, /* custom_ops */ {fromQualString("fb::scale_gradient")}); + AddIfThenElseOp(graph); GRAPH_DUMP("Final graph after optimizations: ", graph); } @@ -1846,8 +1848,9 @@ static bool checkNoMemoryOverlap(const at::Tensor& a, const at::Tensor& b) { } bool ProcessedNode::verify_no_memory_overlap(bool force_check) const { - const static std::array special_case_ops = { + const static std::array special_case_ops = { fromQualString("prim::TypeCheck"), + fromQualString("prim::IfThenElse"), fromQualString("static_runtime::select_tensor"), fromQualString("static_runtime::VarTupleUnpack"), fromQualString("static_runtime::dict_unpack"), diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index 27fcf2e5a24..6f3b0d9018a 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -58,10 +58,11 @@ TORCH_API inline bool doesNotHeapAllocateWhenStoredInIValue(const Type& type) { } TORCH_API inline bool borrowsOutputs(c10::Symbol kind) { - static const std::array symbols_with_borrowed_outputs = { + static const std::array symbols_with_borrowed_outputs = { c10::Symbol::fromQualString("static_runtime::select_tensor"), c10::Symbol::fromQualString("static_runtime::dict_unpack"), c10::Symbol::fromQualString("static_runtime::VarTupleUnpack"), + c10::Symbol::fromQualString("prim::IfThenElse"), }; return std::find( symbols_with_borrowed_outputs.begin(), diff --git a/torch/csrc/jit/runtime/static/native_ops.cpp b/torch/csrc/jit/runtime/static/native_ops.cpp index 33e2e27a7de..5d71e6b8135 100644 --- a/torch/csrc/jit/runtime/static/native_ops.cpp +++ b/torch/csrc/jit/runtime/static/native_ops.cpp @@ -946,5 +946,17 @@ REGISTER_NATIVE_OPERATOR_FUNCTOR( }; }); +// See [Borrowed IValue Outputs] +REGISTER_NATIVE_OPERATOR_FUNCTOR( + prim::IfThenElse, + prim_IfThenElse, + [](Node*) -> SROperator { + return [](ProcessedNode* pnode) { + const auto condition = pnode->Input(0).toBool(); + pnode->Output(0) = condition ? 
createBorrowedIValue(pnode->Input(1)) + : createBorrowedIValue(pnode->Input(2)); + }; + }); + } // namespace jit } // namespace torch From 6448f1bceef102d13cdee074ae184b27a10aa98d Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Thu, 17 Feb 2022 10:38:06 -0800 Subject: [PATCH 132/199] Revert D34283475: ci: Add documentation for github actions Test Plan: revert-hammer Differential Revision: D34283475 (https://github.com/pytorch/pytorch/commit/5cf2228405f20cd73ec40c188c8851c9897777db) Original commit changeset: a4ac9711c19a Original Phabricator Diff: D34283475 (https://github.com/pytorch/pytorch/commit/5cf2228405f20cd73ec40c188c8851c9897777db) fbshipit-source-id: c8453b8edf711cf877313016c38016b15a47fd3f (cherry picked from commit 20c97b6293517c773b49395bdf53b53deaa7f89f) --- .github/README.md | 78 ---------------------------------------- .github/requirements.txt | 1 - 2 files changed, 79 deletions(-) delete mode 100644 .github/README.md delete mode 100644 .github/requirements.txt diff --git a/.github/README.md b/.github/README.md deleted file mode 100644 index 7ae4dcbaad0..00000000000 --- a/.github/README.md +++ /dev/null @@ -1,78 +0,0 @@ -# pytorch/.github - -This directory contains workflows and scripts to support our CI infrastructure that runs on Github Actions. - - -## Workflows / Templates - -Our current Github Actions setup uses templates written in [Jinja](https://jinja.palletsprojects.com/en/3.0.x/) that are located in the -`.github/templates` directory to generate workflow files found in the `.github/workflows/` directory. - -These templates contain a couple of utility templates used to discern common utilities that can be -used amongst different templates. - -### (Re)Generating workflow files - -You will need `jinja2` in order to regenerate the workflow files which can be installed using: -```bash -pip install -r .github/requirements.txt -``` - -Workflows can be generated / regenerated using the following command: -```bash -.github/regenerate.sh -``` - -### Adding a new generated workflow - -New generated workflows can be added in the `.github/scripts/generate_ci_workflows.py` script. You can reference -examples from that script in order to add the workflow to the stream that is relevant to what you particularly -care about. - -Different parameters can be used to acheive different goals, i.e. running jobs on a cron, running only on trunk, etc. - -#### ciflow (specific) - -ciflow is the way we can get `non-default` workflows to run on specific PRs. Within the `generate_ci_workflows.py` script -you will notice a multitude of `LABEL_CIFLOW_` variables which correspond to labels on Github. 
Workflows that -do not run on ``LABEL_CIFLOW_DEFAULT` can be triggered on PRs by applying the label found in `generate_ci_workflows.py` - -Example: -```python - CIWorkflow( - arch="linux", - build_environment="periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck", - docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", - test_runner_type=LINUX_CUDA_TEST_RUNNER, - num_test_shards=2, - distributed_test=False, - timeout_after=360, - # Only run this on master 4 times per day since it does take a while - is_scheduled="0 */4 * * *", - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_SLOW_GRADCHECK, LABEL_CIFLOW_SLOW, LABEL_CIFLOW_SCHEDULED}, - ), - ), -``` - -This workflow does not get triggered by default since it does not contain the `LABEL_CIFLOW_DEFAULT` label in its CIFlowConfig but applying -the `LABEL_CIFLOW_SLOW_GRADCHECK` on your PR will trigger this specific workflow to run. - -#### ciflow (trunk) - -The label `ciflow/trunk` can be used to run `trunk` only workflows. This is especially useful if trying to re-land a PR that was -reverted for failing a `non-default` workflow. - -## Infra - -Currently most of our self hosted runners are hosted on AWS, for a comprehensive list of available runner types you -can reference `.github/scale-config.yml`. - -Exceptions to AWS for self hosted: -* ROCM runners - -### Adding new runner types - -New runner types can be added by committing changes to `.github/scale-config.yml`. Example: https://github.com/pytorch/pytorch/pull/70474 - -> NOTE: New runner types can only be used once the changes to `.github/scale-config.yml` have made their way into the default branch diff --git a/.github/requirements.txt b/.github/requirements.txt deleted file mode 100644 index 7f7afbf3bf5..00000000000 --- a/.github/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -jinja2 From 0942af7c4b22d5219fb30799ac4c16db6940f62b Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Thu, 17 Feb 2022 09:49:44 -0800 Subject: [PATCH 133/199] [ci] switch arm mac jobs to periodic, delete fulljit jobs As discussed in #72933 Pull Request resolved: https://github.com/pytorch/pytorch/pull/72996 --- .github/generated-ciflow-ruleset.json | 18 +- .github/scripts/generate_ci_workflows.py | 34 +--- .github/templates/ios_ci_workflow.yml.j2 | 6 +- .../generated-ios-12-5-1-arm64-coreml.yml | 7 +- .../generated-ios-12-5-1-arm64-custom-ops.yml | 7 +- .../generated-ios-12-5-1-arm64-full-jit.yml | 144 -------------- .../generated-ios-12-5-1-arm64-metal.yml | 7 +- .../workflows/generated-ios-12-5-1-arm64.yml | 7 +- .../generated-ios-12-5-1-x86-64-full-jit.yml | 177 ------------------ 9 files changed, 28 insertions(+), 379 deletions(-) delete mode 100644 .github/workflows/generated-ios-12-5-1-arm64-full-jit.yml delete mode 100644 .github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index c13e357b645..81abc2237bc 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -7,11 +7,9 @@ "ios-12-5-1-arm64", "ios-12-5-1-arm64-coreml", "ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", "ios-12-5-1-arm64-metal", "ios-12-5-1-x86-64", "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit", "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", @@ -163,11 +161,9 @@ "ios-12-5-1-arm64", "ios-12-5-1-arm64-coreml", 
"ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", "ios-12-5-1-arm64-metal", "ios-12-5-1-x86-64", - "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit" + "ios-12-5-1-x86-64-coreml" ], "ciflow/libtorch": [ "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", @@ -210,11 +206,9 @@ "ios-12-5-1-arm64", "ios-12-5-1-arm64-coreml", "ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", "ios-12-5-1-arm64-metal", "ios-12-5-1-x86-64", "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit", "macos-10-15-py3-arm64", "macos-10-15-py3-lite-interpreter-x86-64", "macos-11-py3-x86-64" @@ -236,6 +230,10 @@ "linux-xenial-py3.7-clang7-asan" ], "ciflow/scheduled": [ + "ios-12-5-1-arm64", + "ios-12-5-1-arm64-coreml", + "ios-12-5-1-arm64-custom-ops", + "ios-12-5-1-arm64-metal", "linux-docs-push", "periodic-libtorch-linux-bionic-cuda11.5-py3.7-gcc7", "periodic-libtorch-linux-xenial-cuda11.1-py3.7-gcc7", @@ -255,14 +253,8 @@ "ciflow/trunk": [ "caffe2-linux-xenial-py3.7-gcc5.4", "docker-builds", - "ios-12-5-1-arm64", - "ios-12-5-1-arm64-coreml", - "ios-12-5-1-arm64-custom-ops", - "ios-12-5-1-arm64-full-jit", - "ios-12-5-1-arm64-metal", "ios-12-5-1-x86-64", "ios-12-5-1-x86-64-coreml", - "ios-12-5-1-x86-64-full-jit", "libtorch-linux-xenial-cuda10.2-py3.7-gcc7", "libtorch-linux-xenial-cuda11.3-py3.7-gcc7", "linux-bionic-cuda10.2-py3.9-gcc7", diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 4bcfed7c88d..0d5a86a11d3 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -727,9 +727,10 @@ IOS_WORKFLOWS = [ ios_arch="arm64", ios_platform="OS", test_runner_type=MACOS_TEST_RUNNER_10_15, + is_scheduled="45 4,10,16,22 * * *", exclude_test=True, ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, + labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, ), ), CIWorkflow( @@ -738,20 +739,10 @@ IOS_WORKFLOWS = [ ios_arch="arm64", ios_platform="OS", test_runner_type=MACOS_TEST_RUNNER_10_15, + is_scheduled="45 4,10,16,22 * * *", exclude_test=True, ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-arm64-full-jit", - ios_arch="arm64", - ios_platform="OS", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, + labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, ), ), CIWorkflow( @@ -760,9 +751,10 @@ IOS_WORKFLOWS = [ ios_arch="arm64", ios_platform="OS", test_runner_type=MACOS_TEST_RUNNER_10_15, + is_scheduled="45 4,10,16,22 * * *", exclude_test=True, ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, + labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, ), ), CIWorkflow( @@ -771,9 +763,10 @@ IOS_WORKFLOWS = [ ios_arch="arm64", ios_platform="OS", test_runner_type=MACOS_TEST_RUNNER_10_15, + is_scheduled="45 4,10,16,22 * * *", exclude_test=True, ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, + labels={LABEL_CIFLOW_SCHEDULED, LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, ), ), CIWorkflow( @@ -798,17 +791,6 @@ IOS_WORKFLOWS = [ labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, ), ), - CIWorkflow( - arch="macos", - build_environment="ios-12-5-1-x86-64-full-jit", - ios_arch="x86_64", - ios_platform="SIMULATOR", - test_runner_type=MACOS_TEST_RUNNER_10_15, - exclude_test=True, - ciflow_config=CIFlowConfig( - 
labels={LABEL_CIFLOW_IOS, LABEL_CIFLOW_MACOS}, - ), - ), ] MACOS_WORKFLOWS = [ diff --git a/.github/templates/ios_ci_workflow.yml.j2 b/.github/templates/ios_ci_workflow.yml.j2 index f837a500a26..79f5b104921 100644 --- a/.github/templates/ios_ci_workflow.yml.j2 +++ b/.github/templates/ios_ci_workflow.yml.j2 @@ -9,13 +9,13 @@ name: !{{ build_environment }} on: {%- if is_default %} pull_request: -{%- endif -%} - +{%- endif %} {%- if is_scheduled %} schedule: - cron: !{{ is_scheduled }} -{%- else %} +{%- endif %} push: +{%- if not is_scheduled %} branches: - master - release/* diff --git a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml index d8bc3694ede..7640a34c634 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-coreml.yml @@ -4,15 +4,14 @@ name: ios-12-5-1-arm64-coreml on: + schedule: + - cron: 45 4,10,16,22 * * * push: - branches: - - master - - release/* tags: - 'ciflow/all/*' - 'ciflow/ios/*' - 'ciflow/macos/*' - - 'ciflow/trunk/*' + - 'ciflow/scheduled/*' workflow_dispatch: env: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml index 61716b86f99..75bc1f77252 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-custom-ops.yml @@ -4,15 +4,14 @@ name: ios-12-5-1-arm64-custom-ops on: + schedule: + - cron: 45 4,10,16,22 * * * push: - branches: - - master - - release/* tags: - 'ciflow/all/*' - 'ciflow/ios/*' - 'ciflow/macos/*' - - 'ciflow/trunk/*' + - 'ciflow/scheduled/*' workflow_dispatch: env: diff --git a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml deleted file mode 100644 index 601e3cbb168..00000000000 --- a/.github/workflows/generated-ios-12-5-1-arm64-full-jit.yml +++ /dev/null @@ -1,144 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-arm64-full-jit - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-arm64-full-jit - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: OS - IOS_ARCH: arm64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-arm64-full-jit-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean 
PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - -concurrency: - group: ios-12-5-1-arm64-full-jit-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml index a53ee7d40bf..2a9da911d79 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64-metal.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64-metal.yml @@ -4,15 +4,14 @@ name: ios-12-5-1-arm64-metal on: + schedule: + - cron: 45 4,10,16,22 * * * push: - branches: - - master - - release/* tags: - 'ciflow/all/*' - 'ciflow/ios/*' - 'ciflow/macos/*' - - 'ciflow/trunk/*' + - 'ciflow/scheduled/*' workflow_dispatch: env: diff --git a/.github/workflows/generated-ios-12-5-1-arm64.yml b/.github/workflows/generated-ios-12-5-1-arm64.yml index 763356596b8..3463fc5c48a 100644 --- a/.github/workflows/generated-ios-12-5-1-arm64.yml +++ b/.github/workflows/generated-ios-12-5-1-arm64.yml @@ -4,15 +4,14 @@ name: ios-12-5-1-arm64 on: + schedule: + - cron: 45 4,10,16,22 * * * push: - branches: - - master - - release/* tags: - 'ciflow/all/*' - 'ciflow/ios/*' - 'ciflow/macos/*' - - 'ciflow/trunk/*' + - 'ciflow/scheduled/*' workflow_dispatch: env: diff --git a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml b/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml deleted file mode 100644 index 5562903e39a..00000000000 --- a/.github/workflows/generated-ios-12-5-1-x86-64-full-jit.yml +++ /dev/null @@ -1,177 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/ios_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: ios-12-5-1-x86-64-full-jit - -on: - push: - branches: - - master - - release/* - tags: - - 'ciflow/all/*' - - 'ciflow/ios/*' - - 'ciflow/macos/*' - - 'ciflow/trunk/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: ios-12-5-1-x86-64-full-jit - IN_CI: 1 - IS_GHA: 1 - IOS_PLATFORM: SIMULATOR - IOS_ARCH: x86_64 - - -jobs: - - build: - # NOTE: These builds will not run successfully without running on `pytorch/pytorch` due to the limitations - # of accessing secrets from forked pull requests and IOS' dependency on secrets for their build/test - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - runs-on: macos-10.15 - timeout-minutes: 240 - env: - JOB_BASE_NAME: ios-12-5-1-x86-64-full-jit-build - IOS_CERT_KEY_2022: ${{ secrets.IOS_CERT_KEY_2022 }} - IOS_CERT_SECRET: ${{ secrets.IOS_CERT_SECRET }} - IOS_DEV_TEAM_ID: ${{ secrets.IOS_DEV_TEAM_ID }} - IOS_SIGN_KEY_2022: ${{ secrets.IOS_SIGN_KEY_2022 }} - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Populate CI build 
options - run: | - # Most builds use the lite interpreter, if certain builds shouldn't - # build the lite interpreter this env variable should get over-written - # in the following case statement - echo "BUILD_LITE_INTERPRETER=1" >> "${GITHUB_ENV}" - - case ${BUILD_ENVIRONMENT} in - *metal*) - echo "USE_PYTORCH_METAL=1" >> "${GITHUB_ENV}" - ;; - *full_jit*) - echo "BUILD_LITE_INTERPRETER=0" >> "${GITHUB_ENV}" - ;; - *custom*) - echo "SELECTED_OP_LIST=${GITHUB_WORKSPACE}/ios/TestApp/custom_build/mobilenetv2.yaml" >> "${GITHUB_ENV}" - ;; - *coreml*) - echo "USE_COREML_DELEGATE=1" >> "${GITHUB_ENV}" - ;; - esac - - name: Install brew dependencies - run: | - # Install dependencies - brew install libtool - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 -o "${RUNNER_TEMP}/conda.sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - conda install -y \ - cffi \ - cmake \ - mkl \ - mkl-include \ - ninja \ - numpy \ - pyyaml \ - requests \ - setuptools \ - typing_extensions - - name: Run Fastlane - run: | - set -x - cd ios/TestApp - # install fastlane - sudo gem install bundler && bundle install - # install certificates - echo "${IOS_CERT_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o Certificates.p12 - rm cert.txt - bundle exec fastlane install_root_cert - bundle exec fastlane install_dev_cert - # install the provisioning profile - PROFILE=PyTorch_CI_2022.mobileprovision - PROVISIONING_PROFILES=~/Library/MobileDevice/Provisioning\ Profiles - mkdir -pv "${PROVISIONING_PROFILES}" - cd "${PROVISIONING_PROFILES}" - echo "${IOS_SIGN_KEY_2022}" >> cert.txt - base64 --decode cert.txt -o ${PROFILE} - rm cert.txt - - name: Build - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - export TCLLIBPATH="/usr/local/lib" - python -VV - export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"} - scripts/build_ios.sh - - name: Run Build Test - run: | - PROFILE=PyTorch_CI_2022 - # run the ruby build script - if ! [ -x "$(command -v xcodebuild)" ]; then - echo 'Error: xcodebuild is not installed.' 
- exit 1 - fi - if [ "${IOS_PLATFORM}" != "SIMULATOR" ]; then - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" -c "${PROFILE}" -t "${IOS_DEV_TEAM_ID}" - else - ruby scripts/xcode_build.rb -i build_ios/install -x ios/TestApp/TestApp.xcodeproj -p "${IOS_PLATFORM}" - fi - - name: Run Simulator Tests - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - pip3 install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html - # generate models for differnet backends - cd "${GITHUB_WORKSPACE}/ios/TestApp/benchmark" - mkdir -p ../models - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - pip install coremltools==5.0b5 - pip install six==1.16.0 - python coreml_backend.py - else - python trace_model.py - fi - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - echo "Setting up the TestApp for LiteInterpreter" - ruby setup.rb --lite 1 - else - echo "Setting up the TestApp for Full JIT" - ruby setup.rb - fi - cd "${GITHUB_WORKSPACE}/ios/TestApp" - instruments -s -devices - if [ "${BUILD_LITE_INTERPRETER}" == 1 ]; then - if [ "${USE_COREML_DELEGATE}" == 1 ]; then - fastlane scan --only_testing TestAppTests/TestAppTests/testCoreML - else - fastlane scan --only_testing TestAppTests/TestAppTests/testLiteInterpreter - fi - else - fastlane scan --only_testing TestAppTests/TestAppTests/testFullJIT - fi - -concurrency: - group: ios-12-5-1-x86-64-full-jit-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true From 36fa50be6003d96fd913133cef7cc4f3c777d01d Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Thu, 17 Feb 2022 19:20:46 +0000 Subject: [PATCH 134/199] Add `tools/` to OSS CI merge rules These tools are mostly for dev infra usage. Also add @ezyang to the list of approvers. 
[skip ci] Pull Request resolved: https://github.com/pytorch/pytorch/pull/73026 --- .github/merge_rules.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/merge_rules.json b/.github/merge_rules.json index 6df1db142f5..75ee92711ab 100644 --- a/.github/merge_rules.json +++ b/.github/merge_rules.json @@ -22,8 +22,8 @@ }, { "name": "OSS CI", - "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**"], - "approved_by": ["seemethere", "malfet", "suo", "janeyx99"], + "patterns": [".github/**", ".circleci/**", ".jenkins/**", "scripts/**", "tools/**"], + "approved_by": ["seemethere", "malfet", "suo", "janeyx99", "ezyang"], "mandatory_app_id": 12274 }, { From d4f3d07ae251876054f5681afc44e0d6057d851b Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Thu, 17 Feb 2022 10:07:02 -0800 Subject: [PATCH 135/199] [ci] don't compute ignored issues in generate-test-matrix This doesn't have anything to do with controlling which test jobs are generated; it can be done dynamically in each job Pull Request resolved: https://github.com/pytorch/pytorch/pull/73020 --- .../scripts/generate_pytorch_test_matrix.py | 13 --------- .github/templates/linux_ci_workflow.yml.j2 | 6 ++--- .github/templates/macos_ci_workflow.yml.j2 | 4 +-- .github/templates/windows_ci_workflow.yml.j2 | 4 +-- ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 6 ++--- .../generated-linux-bionic-py3.7-clang9.yml | 6 ++--- .../generated-linux-bionic-rocm4.5-py3.7.yml | 6 ++--- ...rated-linux-vulkan-bionic-py3.7-clang9.yml | 6 ++--- ...rated-linux-xenial-cuda11.3-py3.7-gcc7.yml | 6 ++--- ...nerated-linux-xenial-py3.7-clang7-asan.yml | 6 ++--- ...nerated-linux-xenial-py3.7-clang7-onnx.yml | 6 ++--- .../generated-linux-xenial-py3.7-gcc5.4.yml | 6 ++--- .../generated-linux-xenial-py3.7-gcc7.yml | 6 ++--- .../generated-macos-11-py3-x86-64.yml | 4 +-- ...rallelnative-linux-xenial-py3.7-gcc5.4.yml | 6 ++--- ...iodic-linux-bionic-cuda11.5-py3.7-gcc7.yml | 6 ++--- ...enial-cuda10.2-py3-gcc7-slow-gradcheck.yml | 6 ++--- ...linux-xenial-cuda11.1-py3.7-gcc7-debug.yml | 6 ++--- ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 4 +-- ...rated-periodic-win-vs2019-cuda11.5-py3.yml | 4 +-- ...-pytorch-xla-linux-bionic-py3.7-clang8.yml | 6 ++--- .../generated-win-vs2019-cpu-py3.yml | 4 +-- .../generated-win-vs2019-cuda11.3-py3.yml | 4 +-- tools/stats/import_test_stats.py | 27 ++++++++++--------- 24 files changed, 51 insertions(+), 107 deletions(-) diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py index bd0f77d4aa4..4cbc20691a1 100755 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ b/.github/scripts/generate_pytorch_test_matrix.py @@ -9,7 +9,6 @@ dictated by just sharding. import json import os -import re from typing import Dict from typing_extensions import TypedDict @@ -23,17 +22,6 @@ class Config(TypedDict): runner: str -def get_disabled_issues() -> str: - pr_body = os.getenv('PR_BODY', '') - # The below regex is meant to match all *case-insensitive* keywords that - # GitHub has delineated would link PRs to issues, more details here: - # https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue. - # E.g., "Close #62851", "fixES #62851" and "RESOLVED #62851" would all match, but not - # "closes #62851" --> extra space, "fixing #62851" --> not a keyword, nor "fix 62851" --> no # - regex = '(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) 
#([0-9]+)' - issue_numbers = [x[4] for x in re.findall(regex, pr_body)] - return ','.join(issue_numbers) - # When the user specifies labels that are NOT ciflow/default, the expectation is # that the workflows should be triggered as if they are on trunk. For example, when # ciflow/all is specified, we should run the full test suite for Windows CUDA @@ -128,7 +116,6 @@ def main() -> None: print(json.dumps({'matrix': matrix, 'render-matrix': render_matrix}, indent=2)) print(f'::set-output name=matrix::{json.dumps(matrix)}') print(f'::set-output name=render-matrix::{json.dumps(render_matrix)}') - print(f'::set-output name=ignore-disabled-issues::{get_disabled_issues()}') if __name__ == "__main__": diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 2d978bab9dc..02a1a8fb026 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -196,11 +196,9 @@ jobs: MULTIGPU_RUNNER_TYPE: !{{ multigpu_runner_type }} DISTRIBUTED_GPU_RUNNER_TYPE: !{{ distributed_gpu_runner_type }} NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -225,7 +223,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: {%- if 'rocm' in test_runner_type %} !{{ common.setup_rocm_linux() }} @@ -322,7 +320,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/templates/macos_ci_workflow.yml.j2 b/.github/templates/macos_ci_workflow.yml.j2 index bce1f88b6fe..413df391183 100644 --- a/.github/templates/macos_ci_workflow.yml.j2 +++ b/.github/templates/macos_ci_workflow.yml.j2 @@ -95,11 +95,9 @@ jobs: TEST_RUNNER_TYPE: !{{ test_runner_type }} ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} NUM_TEST_SHARDS: !{{ num_test_shards }} - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -123,7 +121,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: !{{ common.checkout(submodules="false") }} - uses: actions/download-artifact@v2 diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 21f067101d9..53f50aa37de 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -138,14 +138,12 @@ jobs: TEST_RUNNER_TYPE: !{{ test_runner_type }} NUM_TEST_SHARDS: !{{ num_test_shards }} NUM_TEST_SHARDS_ON_PULL_REQUEST: !{{ num_test_shards_on_pull_request }} - PR_BODY: ${{ github.event.pull_request.body }} NOGPU_RUNNER_TYPE: windows.4xlarge 
ENABLE_FORCE_ON_CPU_TEST: !{{ enable_force_on_cpu_test }} RUN_SMOKE_TESTS_ONLY_ON_PR: !{{ only_run_smoke_tests_on_pull_request }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -166,7 +164,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} http_proxy: "!{{ common.squid_proxy }}" https_proxy: "!{{ common.squid_proxy }}" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} needs: [build, generate-test-matrix] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index bd292201ab8..c78dc63a446 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -269,11 +269,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -298,7 +296,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -434,7 +432,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml index 7379f4b357e..aa28d07c1b2 100644 --- a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml @@ -270,11 +270,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -299,7 +297,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -435,7 +433,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml index 314415b1ef1..5dd6543e1f5 100644 
--- a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml +++ b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml @@ -269,11 +269,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.rocm.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.rocm.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -298,7 +296,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Clean workspace run: | @@ -418,7 +416,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml index bdfe986a0f7..e6ea8bde928 100644 --- a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml @@ -270,11 +270,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -299,7 +297,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -435,7 +433,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml index 4a32d2662a9..eaa71fb310b 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml @@ -269,11 +269,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -298,7 +296,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -434,7 +432,7 @@ jobs: -e 
JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml index 5c7c4d17db5..6a002fda318 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml @@ -270,11 +270,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -299,7 +297,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -435,7 +433,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml index 2512c071ab5..6ebf5dbdd0b 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml @@ -270,11 +270,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -299,7 +297,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -435,7 +433,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml index 3d0cb725a48..b1c63e596df 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml @@ -269,11 +269,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 
steps: @@ -298,7 +296,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -434,7 +432,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml index f8bfb6cc763..fd394cc99fa 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml @@ -269,11 +269,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -298,7 +296,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -434,7 +432,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml index 75501c91948..de79f0b3df9 100644 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ b/.github/workflows/generated-macos-11-py3-x86-64.yml @@ -93,11 +93,9 @@ jobs: TEST_RUNNER_TYPE: macos-11 ENABLE_DISTRIBUTED_TEST: '' NUM_TEST_SHARDS: 2 - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -121,7 +119,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Checkout PyTorch uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml index 3bfa5daa7f4..5c6698e2e59 100644 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml @@ -268,11 +268,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ 
steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -297,7 +295,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -433,7 +431,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml index 1b3a4fad1d6..2debf29f0cc 100644 --- a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml @@ -267,11 +267,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -296,7 +294,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -432,7 +430,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml index 7d0db219cc8..2f03e0f409e 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml @@ -269,11 +269,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -298,7 +296,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -434,7 +432,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git 
a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml index bf3144e1892..13a1cf744e3 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml @@ -268,11 +268,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -297,7 +295,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -433,7 +431,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index bb63145beda..22da8807cfd 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -140,14 +140,12 @@ jobs: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu NUM_TEST_SHARDS: 2 NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - PR_BODY: ${{ github.event.pull_request.body }} NOGPU_RUNNER_TYPE: windows.4xlarge ENABLE_FORCE_ON_CPU_TEST: '' RUN_SMOKE_TESTS_ONLY_ON_PR: False outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -168,7 +166,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} needs: [build, generate-test-matrix] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml index cfac6389895..2cb44ccf28f 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml @@ -140,14 +140,12 @@ jobs: TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu NUM_TEST_SHARDS: 2 NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - PR_BODY: ${{ github.event.pull_request.body }} NOGPU_RUNNER_TYPE: windows.4xlarge ENABLE_FORCE_ON_CPU_TEST: 1 RUN_SMOKE_TESTS_ONLY_ON_PR: False outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: 
python:3.9 steps: @@ -168,7 +166,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} needs: [build, generate-test-matrix] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} diff --git a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml index 3a49e0b9c39..2186e05ecbe 100644 --- a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml +++ b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml @@ -235,11 +235,9 @@ jobs: MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu NOGPU_RUNNER_TYPE: linux.2xlarge - PR_BODY: ${{ github.event.pull_request.body }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -264,7 +262,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information shell: bash @@ -400,7 +398,7 @@ jobs: -e JOB_BASE_NAME \ -e TEST_CONFIG \ -e NUM_TEST_SHARDS \ - -e PYTORCH_IGNORE_DISABLED_ISSUES \ + -e PR_BODY \ -e PYTORCH_RETRY_TEST_CASES \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 6d94c2f839c..2774ac4b66e 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -133,14 +133,12 @@ jobs: TEST_RUNNER_TYPE: windows.4xlarge NUM_TEST_SHARDS: 2 NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - PR_BODY: ${{ github.event.pull_request.body }} NOGPU_RUNNER_TYPE: windows.4xlarge ENABLE_FORCE_ON_CPU_TEST: '' RUN_SMOKE_TESTS_ONLY_ON_PR: False outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -161,7 +159,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} needs: [build, generate-test-matrix] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index a7aa492a031..294d56f7409 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -142,14 +142,12 @@ jobs: TEST_RUNNER_TYPE: 
windows.8xlarge.nvidia.gpu NUM_TEST_SHARDS: 2 NUM_TEST_SHARDS_ON_PULL_REQUEST: 0 - PR_BODY: ${{ github.event.pull_request.body }} NOGPU_RUNNER_TYPE: windows.4xlarge ENABLE_FORCE_ON_CPU_TEST: 1 RUN_SMOKE_TESTS_ONLY_ON_PR: True outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - ignore-disabled-issues: ${{ steps.set-matrix.outputs.ignore-disabled-issues }} container: image: python:3.9 steps: @@ -170,7 +168,7 @@ jobs: TEST_CONFIG: ${{ matrix.config }} http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PYTORCH_IGNORE_DISABLED_ISSUES: ${{ needs.generate-test-matrix.outputs.ignore-disabled-issues }} + PR_BODY: ${{ github.event.pull_request.body }} needs: [build, generate-test-matrix] strategy: matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} diff --git a/tools/stats/import_test_stats.py b/tools/stats/import_test_stats.py index f6250a182be..375f7181b45 100644 --- a/tools/stats/import_test_stats.py +++ b/tools/stats/import_test_stats.py @@ -8,19 +8,20 @@ import re from typing import Any, Callable, Dict, List, Optional, cast from urllib.request import urlopen -# PYTORCH_IGNORE_DISABLED_ISSUES should only be set during CI (along with IN_CI) as a -# comma-separated list of issue numbers. The intent is to re-enable any disabled tests -# associated with the issues in this list. -# -# There is normally no reason to use this locally as the disabled tests list should not -# affect your local development and every test should be enabled. If for whatever reason -# you would like to use this during local development, please note the following caveat: -# -# Whenever you set OR reset PYTORCH_IGNORE_DISABLED_ISSUES, you should delete the existing -# .pytorch-disabled-tests.json and redownload/parse the file for your change to apply, as -# PYTORCH_IGNORE_DISABLED_ISSUES is used during the parsing stage. To download the files, -# run test/run_test.py with IN_CI=1. -IGNORE_DISABLED_ISSUES: List[str] = os.getenv('PYTORCH_IGNORE_DISABLED_ISSUES', '').split(',') +def get_disabled_issues() -> List[str]: + pr_body = os.getenv('PR_BODY', '') + # The below regex is meant to match all *case-insensitive* keywords that + # GitHub has delineated would link PRs to issues, more details here: + # https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue. + # E.g., "Close #62851", "fixES #62851" and "RESOLVED #62851" would all match, but not + # "closes #62851" --> extra space, "fixing #62851" --> not a keyword, nor "fix 62851" --> no # + regex = '(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) 
#([0-9]+)' + issue_numbers = [x[4] for x in re.findall(regex, pr_body)] + print("Ignoring disabled issues: ", issue_numbers) + return issue_numbers + + +IGNORE_DISABLED_ISSUES: List[str] = get_disabled_issues() SLOW_TESTS_FILE = '.pytorch-slow-tests.json' DISABLED_TESTS_FILE = '.pytorch-disabled-tests.json' From 2c916ef19807e75cd61fb633a3c446819ad5afde Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Thu, 17 Feb 2022 12:00:27 -0800 Subject: [PATCH 136/199] More update on the guidance (#72818) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72818 ghstack-source-id: 149395630 Test Plan: CI Reviewed By: raziel Differential Revision: D34226823 fbshipit-source-id: e31b71110e8e94bd9fabe25a388f0d4a9b9d0ca7 (cherry picked from commit 57e9b034aabf3efa67ca418e9c6360dc841658b1) --- torch/csrc/jit/operator_upgraders/README.md | 215 ++++++++++++-------- 1 file changed, 131 insertions(+), 84 deletions(-) diff --git a/torch/csrc/jit/operator_upgraders/README.md b/torch/csrc/jit/operator_upgraders/README.md index bcec71a2a3b..42309ef3e43 100644 --- a/torch/csrc/jit/operator_upgraders/README.md +++ b/torch/csrc/jit/operator_upgraders/README.md @@ -1,42 +1,66 @@ # Guidance for Operator Developer -PyTorch’s operators sometimes require changes to maintain the high quality user experience (UX) that PyTorch is known for. These changes can be backward compatibility (BC) breaking, where older programs will no longer run as expected on the latest version of PyTorch (an old writer / new reader problem) or forward compatibility (FC) breaking, where new programs will not run on older versions of PyTorch (a new writer / old reader problem). An upgrader is a method to use the new operator to mimic the old operator behavior. When a new runtime loads an old model with the old operator, the upgrader will replace the old operator in the model with the new operator. The replacement will only happen for old models, and it does not need to consider the new models. Please refer to the documentation [PyTorch Operator Versioning](https://github.com/pytorch/rfcs/blob/master/RFC-0017-PyTorch-Operator-Versioning.md) for more details. +PyTorch’s operators sometimes require changes for different reasons (e.g. from improving their usability to fixing bugs). These changes can be backward compatibility (BC) breaking, where older programs will no longer run as expected (or at all) on the latest version of PyTorch (an old program / new runtime problem), or forward compatibility (FC) breaking, where new programs will not run on older versions of PyTorch (a new program / old runtime problem). This guidance focuses on the requirements for maintaining backward compatibility when making changes to an operator. +In order to do this, we introduce the concept of the *upgrader*: a method to adapt the new operator to mimic the old operator behavior. +When a new runtime reads an old program containing the old operator definition, the upgrader will adapt the old operator definition to comply with the new operator implementation. As you would expect, an upgrader is only applied when an old operator definition is encountered (i.e. if there are no "old" operators in the program, no upgrader would be used). +For more details on the reasoning behind this new requirement, please refer to the [PyTorch Operator Versioning RFC](https://github.com/pytorch/rfcs/blob/master/RFC-0017-PyTorch-Operator-Versioning.md).
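To make the mechanism concrete before the step-by-step instructions that follow, here is a minimal sketch (not part of this patch) of the shape an upgrader takes. It loosely mirrors the historical `aten::div.Tensor` change, where integer division moved from truncation to true division, so the upgrader reproduces the old truncating behavior when a new runtime loads an old model; the function name and version range here are placeholders. The `linspace` upgrader walked through later in this README follows the same pattern.

```python
import torch
from torch import Tensor

# Sketch of an upgrader: a NEW runtime substitutes this body for the OLD
# operator when it loads a model serialized before the version bump.
# The name encodes the operator version range it covers (placeholder here).
def div_Tensor_0_3(self: Tensor, other: Tensor) -> Tensor:
    if self.is_floating_point() or other.is_floating_point():
        # Floating-point inputs already behaved like true division.
        return self.true_divide(other)
    # Old behavior: integer inputs were divided with truncation.
    return self.divide(other, rounding_mode='trunc')
```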
-After you change to operator either the operator schema is BC-breaking way or the semantics of the operator, you will need to write an “upgrader” to make the change non-BC breaking iff they are used in TorchScript or mobile. In general, you can know your operator is BC breaking, if it fails `test/forward_backward_compatibility/check_forward_backward_compatibility.py ` +If the change to the operator is BC-breaking in either the schema or the semantics, you are responsible for writing an upgrader to prevent the change from becoming BC breaking. -The steps to write upgrader: +You can determine if your change in the operator is BC breaking, if it fails `test/forward_backward_compatibility/check_forward_backward_compatibility.py `. + +### Some examples BC breaking changes + +When making changes to the operators, the first thing to identify is if it's BC/FC breaking. Again, we only targetting for BC breaking changes on this guidance. Here are some examples to help understanding what a BC changes may look like: + +#### Backward Compatibility Breakage: + +- Return types are more generic than the older version + - Old: `foo(Tensor self, int a) -> int` + - New: `foo(Tensor self, int a) -> Scalar` +- Argument types are more specific than the older version + - Old: `foo(Tensor self, Scalar a) -> int` + - New: `foo(Tensor self, int a) -> int` +- Added new arguments don’t have associated default values + - Old: `foo(Tensor self, int a) -> int` + - New: `foo(Tensor self, int a, int b) -> int` +- Internal implementation change even when the schema remains the same +- Deprecating an operator + + +### The steps to write upgrader: ### 1.Preparation [Build PyTorch from souce](https://github.com/pytorch/pytorch#from-source) and prepare a test model before making changes to the operator, following the process below. A test model before making the operator changes is needed to test the upgrader. Otherwise, after the change to operator, the new runtime will no longer be able to produce a model with the historic operator and can't test it anymore. 1. Add a test module in `test/jit/fixtures_srcs/fixtures_src.py`. In `test/jit/fixtures_srcs/generate_models.py`, - ``` - class TestVersionedLinspaceV7(torch.nn.Module): - def __init__(self): - super(TestVersionedLinspaceV7, self).__init__() + ``` + class TestVersionedLinspaceV7(torch.nn.Module): + def __init__(self): + super(TestVersionedLinspaceV7, self).__init__() - def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]): - c = torch.linspace(a, b, steps=5) - d = torch.linspace(a, b) - return c, d - ``` + def forward(self, a: Union[int, float, complex], b: Union[int, float, complex]): + c = torch.linspace(a, b, steps=5) + d = torch.linspace(a, b) + return c, d + ``` Please make sure the module uses the changed operator and follow the name schema ` TestVersioned{${OpnameOverloadedname}}V${kProducedFileFormatVersion}`. [`kProducedFileFormatVersion`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L82) can be found in `versions.h`. The example operator usage can be found on [PyTorch Docs](https://pytorch.org/docs/stable/index.html), like [linspace operator](https://pytorch.org/docs/stable/generated/torch.linspace.html) 2. Register its corresponding changed operator in ALL_MODULES like following. Use an instance as the key and the changed operator as the value. It will ensure the test model covers everything needed. 
It's important to check in a valid test model before making the change to the runtime, as it will be really challenging to switch to the revision of the source code and regenerate the test model after the change is merged. - ``` - # key: test module instance, value: changed operator name - ALL_MODULES = { - TestVersionedLinspaceV7(): "aten::linspace", - } - ``` + ``` + # key: test module instance, value: changed operator name + ALL_MODULES = { + TestVersionedLinspaceV7(): "aten::linspace", + } + ``` This module should include the changed operator. If the operator isn't covered in the model, the model export process will fail. 3. Export the model to `test/jit/fixtures` by running - ``` - python test/jit/fixtures_src/generate_models.py - ``` + ``` + python test/jit/fixtures_src/generate_models.py + ``` 4. Commit the change and submit a pull request. @@ -49,83 +73,84 @@ The steps to write upgrader: 2. If it's impossible to write an upgrader valid for `linspace` before versioning bumping to 8, check the date when the version is bumped to 8 at [`versions.h`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L82). If it has been 180 days, write an upgrader `linspace_out_8_{kProducedFileFormatVersion}` for `linspace.out` after bumping to 8, and deprecate the old upgrader. If it hasn't been 180 days, wait until 180 days and do the same changes as above. To write an upgrader, you would need to know how the new runtime with the new `linspace` operator can handle an old model with the old `linspace` operator. When `linspace` is bumped to 8, the change is to make `step` a required argument, instead of an optional argument. The old schema is: - ``` - linspace(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], dtype: Optional[int], layout: Optional[int], - device: Optional[Device], pin_memory: Optional[bool]): - ``` + ``` + linspace(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], dtype: Optional[int], layout: Optional[int], + device: Optional[Device], pin_memory: Optional[bool]): + ``` And the new schema is: - ``` - linspace(start: Union[int, float, complex], end: Union[int, float, complex], steps: int, dtype: Optional[int], layout: Optional[int], - device: Optional[Device], pin_memory: Optional[bool]): - ``` + ``` + linspace(start: Union[int, float, complex], end: Union[int, float, complex], steps: int, dtype: Optional[int], layout: Optional[int], + device: Optional[Device], pin_memory: Optional[bool]): + ``` An upgrader will only be applied to an old model and it won't be applied to a new model. 
The upgrader can be written with the following logic: - ``` - def linspace_0_7(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], *, dtype: Optional[int], layout: Optional[int], - device: Optional[Device], pin_memory: Optional[bool]): - if (steps is None): - return torch.linspace(start=start, end=end, steps=100, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) - return torch.linspace(start=start, end=end, steps=steps, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) - ``` + ``` + def linspace_0_7(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], *, dtype: Optional[int], layout: Optional[int], + device: Optional[Device], pin_memory: Optional[bool]): + if (steps is None): + return torch.linspace(start=start, end=end, steps=100, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) + return torch.linspace(start=start, end=end, steps=steps, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) + ``` The actual upgrader needs to be written as [TorchScript](https://pytorch.org/docs/stable/jit.html), and the below example is the actual upgrader of the operator `linspace.out `and the operator ` linspace` exported at version from 0 to 7. - ``` - static std::unordered_map kUpgradersEntryMap( - { - {"linspace_0_7", R"SCRIPT( - def linspace_0_7(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], *, dtype: Optional[int], layout: Optional[int], - device: Optional[Device], pin_memory: Optional[bool]): - if (steps is None): - return torch.linspace(start=start, end=end, steps=100, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) - return torch.linspace(start=start, end=end, steps=steps, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) - )SCRIPT"}, - } - ``` + ``` + static std::unordered_map kUpgradersEntryMap( + { + {"linspace_0_7", R"SCRIPT( + def linspace_0_7(start: Union[int, float, complex], end: Union[int, float, complex], steps: Optional[int], *, dtype: Optional[int], layout: Optional[int], + device: Optional[Device], pin_memory: Optional[bool]): + if (steps is None): + return torch.linspace(start=start, end=end, steps=100, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) + return torch.linspace(start=start, end=end, steps=steps, dtype=dtype, layout=layout, device=device, pin_memory=pin_memory) + )SCRIPT"}, + } + ``` With the upgrader, when a new runtime loads an old model, it will first check the operator version of the old model. If it's older than the current runtime, it will replace the operator from the old model with the upgrader above. 3. Bump [`kMaxSupportedFileFormatVersion`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L15) the [`kProducedFileFormatVersion`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L82) by 1 and provide the reasons under [`versions.h`](https://github.com/pytorch/pytorch/blob/master/caffe2/serialize/versions.h#L73-L81) - ``` + ``` - constexpr uint64_t kMaxSupportedFileFormatVersion = 0x9L; + constexpr uint64_t kMaxSupportedFileFormatVersion = 0x9L; - ... - // We describe new operator version bump reasons here: - // 1) [01/24/2022] - // We bump the version number to 8 to update aten::linspace - // and aten::linspace.out to error out when steps is not - // provided. 
(see: https://github.com/pytorch/pytorch/issues/55951) - // 2) [01/30/2022] - // Bump the version number to 9 to update aten::logspace and - // and aten::logspace.out to error out when steps is not - // provided. (see: https://github.com/pytorch/pytorch/issues/55951) - constexpr uint64_t kProducedFileFormatVersion = 0x9L; - ``` + ... + // We describe new operator version bump reasons here: + // 1) [01/24/2022] + // We bump the version number to 8 to update aten::linspace + // and aten::linspace.out to error out when steps is not + // provided. (see: https://github.com/pytorch/pytorch/issues/55951) + // 2) [01/30/2022] + // Bump the version number to 9 to update aten::logspace and + // and aten::logspace.out to error out when steps is not + // provided. (see: https://github.com/pytorch/pytorch/issues/55951) + constexpr uint64_t kProducedFileFormatVersion = 0x9L; + ``` 4. In `torch/csrc/jit/operator_upgraders/version_map.cpp`, add changes like below. You will need to make sure that the entry is **SORTED** by the bumped to version number. - ``` - {{${operator_name.overloaded_name}, - {{${bump_to_version}, - "${upgrader_name}", - "${old operator schema}"}}}, - ``` + ``` + {{${operator_name.overloaded_name}, + {{${bump_to_version}, + "${upgrader_name}", + "${old operator schema}"}}}, + ``` For the example operator `linspace`, if there are two version bumps, one is bumped to 8 and one is bumped to 12, the sorted result is: - ``` - {{"aten::linspace", - {{12, - "linspace_0_11", - "aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"}}}, - {{8, - "linspace_0_7", - "aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"}}}, - ``` + ``` + {{"aten::linspace", + {{12, + "linspace_0_11", + "aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"}}}, + {{8, + "linspace_0_7", + "aten::linspace(Scalar start, Scalar end, int? steps=None, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"}}}, + ``` 5. After [rebuilding PyTorch](https://github.com/pytorch/pytorch#from-source), run the following command to auto update the file [`torch/csrc/jit/mobile/upgrader_mobile.cpp`](https://github.com/pytorch/pytorch/blob/8757e21c6a4fc00e83539aa7f9c28eb11eff53c1/torch/csrc/jit/mobile/upgrader_mobile.cpp). After rebuild PyTorch from source (`python setup.py`), run - ``` - python pytorch/tools/codegen/operator_versions/gen_mobile_upgraders.py - ``` + ``` + python pytorch/tools/codegen/operator_versions/gen_mobile_upgraders.py + ``` 6. Add a test. With the model generated from step 1, you will need to add tests in `test/test_save_load_for_op_versions.py`. Following is an example to write a test - ``` + + ``` @settings(max_examples=10, deadline=200000) # A total of 10 examples will be generated @given( sample_input=st.tuples(st.integers(min_value=5, max_value=199), st.floats(min_value=5.0, max_value=199.0)) @@ -178,8 +203,7 @@ The steps to write upgrader: # has the same result as the module loaded from the new model _helper(current_mobile_module_float, torch.div) _helper(current_server_module_float, torch.div) - - ``` + ``` 7. Commit all changes made in step 2 in a single pull request and submit it. 
@@ -190,7 +214,7 @@ You can look at following PRs to get the rough idea of what needs to be done: --- **NOTE** -Adding arguments with a default value to an operator is not BC breaking, and thus does not require an upgrader. For example, the following change to operator `foo` is backwards compatible: +1. Adding arguments with a default value to an operator is not BC breaking, and thus does not require an upgrader. For example, the following change to operator `foo` is backwards compatible: ``` # before def foo(x, y): @@ -202,4 +226,27 @@ def foo(x, y, z=100): return x, y, z ``` +2. To help understanding the BC/FC breakage changes, here are some FC breaking changes examples. The solution to resolve it is not there yet. If it's desired, please report it in either [PyTorch Forum](https://discuss.pytorch.org/) or [PyTorch Github](https://github.com/pytorch/pytorch). We will prioritize it accordingly. + + - Adding new default argument: + - Adding a new default argument not RIGHT BEFORE the out arguments which can be 0 or more. + - Old: `foo(Tensor self, int a, int b=1, Tensor(a!) out) -> (Tensor(a!))` + - New: `foo(Tensor self, int a, int c=1, int b=1, Tensor(a!) out) -> (Tensor(a!))` + + - Adding out argument NOT at the end of the schema. + - Old: `foo(Tensor self, int a, int b=1, Tensor(a!) out) -> (Tensor(a!))` + - New: `foo(Tensor self, int a, Tensor(d!), int b=1, Tensor(a!) out) -> (Tensor(a!), Tensor(d!))` + + - Adding default arguments with container types such as ListType or DictType (list or dict). + - Old: `foo(Tensor self, int a, int b=1, Tensor(a!) out) -> (Tensor(a!))` + - New: `foo(Tensor self, int a, int b=1, int[2] c=1, Tensor(a!) out) -> (Tensor(a!))` + - Changing default argument’s name + - This will only work when the default argument always uses the default value (so that serialization will ignore it). In all other cases, it will fail. + - Old: `foo(Tensor self, int a, int b=1, Tensor(a!) out) -> (Tensor(a!))` + - New: `foo(Tensor self, int a, int c=1, Tensor(a!) out) -> (Tensor(a!))` + - Changing default argument’s default value. This will break when this argument is saved with the default value in newer runtime. Older runtime will use its old default value which will lead to wrong output. + - Old: `foo(Tensor self, int a, int b=1, Tensor(a!) out) -> (Tensor(a!))` + - New: `foo(Tensor self, int a, int b=4, Tensor(a!) out) -> (Tensor(a!))` + - Adding new operator + --- From 2cd06679285317a1d3aecbb10d04063a27729132 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Thu, 17 Feb 2022 10:50:58 -0800 Subject: [PATCH 137/199] [ci] delete generate-test-matrix Today, we have two pieces that conspire to determine what workflows we run: - `generate_ci_workflows.py`, which takes a declarative description of what we want the workflow to do and uses jinja to generate a workflow yaml file - `generate-test-matrix`, which runs at CI time to dynamically generate test jobs. This is bad: - Having one layer of code generation is unfortunate, having two is confusing. - You cannot tell from a workflow yaml file what test jobs will be run. - We have to do this careful dance of plumbing the args to `generate-test-matrix` through setting env vars and other such ugliness. - In cases where the build job fails and prevents `generate-test-matrix` from running, a ghost `test` job that doesn't actually exist noises up the HUD and our stats. - A bunch of useless `generate-test-matrix` jobs (8 on PRs) noise up our signal. 
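To illustrate the direction of the change, here is a minimal sketch, assuming shard counts and runner types are fully known at workflow-generation time: rather than emitting one test job that consumes a matrix computed at CI time by `generate-test-matrix`, the generator can expand every (config, shard) pair into an explicit job up front. The function and field names below are simplified stand-ins for the real `_gen_test_jobs` logic added to `generate_ci_workflows.py` in the diff further down.

```python
from typing import Dict, List
from typing_extensions import TypedDict

class Config(TypedDict):
    num_shards: int
    runner: str

def expand_test_jobs(configs: Dict[str, Config]) -> List[Dict[str, object]]:
    """Statically enumerate one CI job per (config, shard) pair."""
    jobs = []
    for name, cfg in configs.items():
        for shard in range(1, cfg["num_shards"] + 1):
            jobs.append({
                "id": f"test_{name}_{shard}_{cfg['num_shards']}",
                "config": name,
                "shard": shard,
                "num_shards": cfg["num_shards"],
                "runner": cfg["runner"],
            })
    return jobs

# For example, two default shards plus one distributed shard:
print(expand_test_jobs({
    "default": {"num_shards": 2, "runner": "linux.2xlarge"},
    "distributed": {"num_shards": 1, "runner": "linux.8xlarge.nvidia.gpu"},
}))
```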
As far as I can tell, this complexity is unnecessary--we have all the information we need to generate the build matrix statically. There does not appear to be an advantage in retaining generate-build-matrix, so I am removing `generate-test-matrix` to simplify the CI. The *only* place where we were actually doing something dynamic is in our windows gpu workflow, where we would check at runtime whether the workflow was triggered from a PR or master and behave accordingly. This is more simply done by just having two separate workflows with different trigger conditions, which avoids the madness of needing to parse labels and forking the behavior dynamically, which has been a source of confusion in the past. Pull Request resolved: https://github.com/pytorch/pytorch/pull/73001 --- .github/actionlint.yaml | 1 + .github/generated-ciflow-ruleset.json | 14 +- .github/scripts/generate_ci_workflows.py | 185 +- .../scripts/generate_pytorch_test_matrix.py | 122 - .github/templates/common.yml.j2 | 9 +- .github/templates/linux_ci_workflow.yml.j2 | 64 +- .github/templates/macos_ci_workflow.yml.j2 | 42 +- .github/templates/windows_ci_workflow.yml.j2 | 56 +- ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 2061 ++++++++++++++++- .../generated-linux-bionic-py3.7-clang9.yml | 801 ++++++- .../generated-linux-bionic-rocm4.5-py3.7.yml | 725 +++++- ...rated-linux-vulkan-bionic-py3.7-clang9.yml | 305 ++- ...rated-linux-xenial-cuda11.3-py3.7-gcc7.yml | 809 ++++++- ...nerated-linux-xenial-py3.7-clang7-asan.yml | 801 ++++++- ...nerated-linux-xenial-py3.7-clang7-onnx.yml | 553 ++++- .../generated-linux-xenial-py3.7-gcc5.4.yml | 1545 +++++++++++- .../generated-linux-xenial-py3.7-gcc7.yml | 801 ++++++- .../generated-macos-11-py3-x86-64.yml | 268 ++- ...rallelnative-linux-xenial-py3.7-gcc5.4.yml | 553 ++++- ...iodic-linux-bionic-cuda11.5-py3.7-gcc7.yml | 809 ++++++- ...enial-cuda10.2-py3-gcc7-slow-gradcheck.yml | 557 ++++- ...linux-xenial-cuda11.1-py3.7-gcc7-debug.yml | 809 ++++++- ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 522 ++++- ...rated-periodic-win-vs2019-cuda11.5-py3.yml | 688 +++++- ...-pytorch-xla-linux-bionic-py3.7-clang8.yml | 306 ++- .../generated-win-vs2019-cpu-py3.yml | 496 +++- ...enerated-win-vs2019-cuda11.3-py3-smoke.yml | 598 +++++ .../generated-win-vs2019-cuda11.3-py3.yml | 689 +++++- 28 files changed, 14018 insertions(+), 1171 deletions(-) delete mode 100755 .github/scripts/generate_pytorch_test_matrix.py create mode 100644 .github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 18329c52625..01fb48f5f85 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -9,3 +9,4 @@ self-hosted-runner: - windows.4xlarge - windows.8xlarge.nvidia.gpu - bm-runner + - linux.rocm.gpu diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index 81abc2237bc..27ccb7d4f06 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -44,7 +44,8 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "pytorch-xla-linux-bionic-py3.7-clang8", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" + "win-vs2019-cuda11.3-py3", + "win-vs2019-cuda11.3-py3-smoke" ], "ciflow/android": [ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", @@ -120,7 +121,8 @@ "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", "periodic-win-vs2019-cuda11.1-py3", "periodic-win-vs2019-cuda11.5-py3", - "win-vs2019-cuda11.3-py3" + "win-vs2019-cuda11.3-py3", 
+ "win-vs2019-cuda11.3-py3-smoke" ], "ciflow/default": [ "linux-binary-conda", @@ -149,7 +151,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3", + "win-vs2019-cuda11.3-py3-smoke", "windows-binary-libtorch-cxx11-abi", "windows-binary-libtorch-pre-cxx11", "windows-binary-wheel" @@ -281,7 +283,8 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "pytorch-xla-linux-bionic-py3.7-clang8", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" + "win-vs2019-cuda11.3-py3", + "win-vs2019-cuda11.3-py3-smoke" ], "ciflow/vulkan": [ "linux-vulkan-bionic-py3.7-clang9" @@ -290,7 +293,8 @@ "periodic-win-vs2019-cuda11.1-py3", "periodic-win-vs2019-cuda11.5-py3", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3" + "win-vs2019-cuda11.3-py3", + "win-vs2019-cuda11.3-py3-smoke" ], "ciflow/xla": [ "pytorch-xla-linux-bionic-py3.7-clang8" diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 0d5a86a11d3..da84f89b710 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -2,17 +2,16 @@ from dataclasses import asdict, dataclass, field from pathlib import Path -from typing import Dict, Set, List, Iterable +from typing import Dict, Set, List, Iterable, Any import jinja2 import json import os import sys -from typing_extensions import Literal +from typing_extensions import Literal, TypedDict import generate_binary_build_matrix # type: ignore[import] -YamlShellBool = Literal["''", 1] Arch = Literal["windows", "linux", "macos"] DOCKER_REGISTRY = "308535385114.dkr.ecr.us-east-1.amazonaws.com" @@ -142,6 +141,11 @@ class CIFlowRuleset: outfile.write('\n') +class Config(TypedDict): + num_shards: int + runner: str + + @dataclass class CIWorkflow: # Required fields @@ -162,50 +166,38 @@ class CIWorkflow: is_scheduled: str = '' is_default: bool = False num_test_shards: int = 1 - only_run_smoke_tests_on_pull_request: bool = False - num_test_shards_on_pull_request: int = -1 - distributed_test: bool = True timeout_after: int = 240 xcode_version: str = '' only_on_pr: bool = False ios_arch: str = '' ios_platform: str = '' + test_jobs: Any = field(default_factory=list) - # The following variables will be set as environment variables, - # so it's easier for both shell and Python scripts to consume it if false is represented as the empty string. 
- enable_jit_legacy_test: YamlShellBool = "''" - enable_distributed_test: YamlShellBool = "''" - enable_multigpu_test: YamlShellBool = "''" - enable_nogpu_no_avx_test: YamlShellBool = "''" - enable_nogpu_no_avx2_test: YamlShellBool = "''" - enable_slow_test: YamlShellBool = "''" - enable_docs_test: YamlShellBool = "''" - enable_backwards_compat_test: YamlShellBool = "''" - enable_xla_test: YamlShellBool = "''" - enable_noarch_test: YamlShellBool = "''" - enable_force_on_cpu_test: YamlShellBool = "''" + enable_default_test: bool = True + enable_smoke_test: bool = True + enable_jit_legacy_test: bool = False + enable_distributed_test: bool = True + enable_multigpu_test: bool = False + enable_nogpu_no_avx_test: bool = False + enable_nogpu_no_avx2_test: bool = False + enable_slow_test: bool = False + enable_docs_test: bool = False + enable_backwards_compat_test: bool = False + enable_xla_test: bool = False + enable_noarch_test: bool = False + enable_force_on_cpu_test: bool = False def __post_init__(self) -> None: if not self.build_generates_artifacts: self.exclude_test = True - if self.distributed_test: - self.enable_distributed_test = 1 - self.multigpu_runner_type = LINUX_MULTIGPU_RUNNERS.get(self.test_runner_type, "linux.16xlarge.nvidia.gpu") self.distributed_gpu_runner_type = LINUX_DISTRIBUTED_GPU_RUNNERS.get(self.test_runner_type, "linux.8xlarge.nvidia.gpu") if LABEL_CIFLOW_DEFAULT in self.ciflow_config.labels: self.is_default = True - # If num_test_shards_on_pull_request is not user-defined, default to num_test_shards unless we are - # only running smoke tests on the pull request. - if self.num_test_shards_on_pull_request == -1: - # Don't run the default if we are only running smoke tests - if self.only_run_smoke_tests_on_pull_request: - self.num_test_shards_on_pull_request = 0 - else: - self.num_test_shards_on_pull_request = self.num_test_shards + self.test_jobs = self._gen_test_jobs() self.assert_valid() def assert_valid(self) -> None: @@ -254,6 +246,83 @@ class CIWorkflow: output_file.write("\n") print(output_file_path) + def normalized_build_environment(self, suffix: str) -> str: + return self.build_environment.replace(".", "_") + suffix + + def _gen_test_jobs(self) -> Any: + if self.arch == "linux": + MULTIGPU_RUNNER_TYPE = "linux.16xlarge.nvidia.gpu" + DISTRIBUTED_GPU_RUNNER_TYPE = "linux.8xlarge.nvidia.gpu" + NOGPU_RUNNER_TYPE = "linux.2xlarge" + elif self.arch == "windows": + DISTRIBUTED_GPU_RUNNER_TYPE = self.test_runner_type + NOGPU_RUNNER_TYPE = "windows.4xlarge" + + test_jobs = [] + + configs: Dict[str, Config] = {} + if self.enable_jit_legacy_test: + configs["jit_legacy"] = {"num_shards": 1, "runner": self.test_runner_type} + if self.enable_multigpu_test: + configs["multigpu"] = {"num_shards": 1, "runner": MULTIGPU_RUNNER_TYPE} + + if self.enable_nogpu_no_avx_test: + configs["nogpu_NO_AVX"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE} + if self.enable_nogpu_no_avx2_test: + configs["nogpu_NO_AVX2"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE} + if self.enable_force_on_cpu_test: + configs["force_on_cpu"] = {"num_shards": 1, "runner": NOGPU_RUNNER_TYPE} + if self.enable_distributed_test: + configs["distributed"] = { + "num_shards": 1, + "runner": DISTRIBUTED_GPU_RUNNER_TYPE + if "cuda" in str(self.build_environment) + else self.test_runner_type, + } + if self.enable_slow_test: + configs["slow"] = {"num_shards": 1, "runner": self.test_runner_type} + if self.enable_docs_test: + configs["docs_test"] = {"num_shards": 1, "runner": self.test_runner_type} + if 
self.enable_backwards_compat_test: + configs["backwards_compat"] = { + "num_shards": 1, + "runner": self.test_runner_type, + } + if self.enable_xla_test: + configs["xla"] = {"num_shards": 1, "runner": self.test_runner_type} + if self.enable_noarch_test: + configs["noarch"] = {"num_shards": 1, "runner": self.test_runner_type} + + if self.enable_smoke_test: + configs["smoke_tests"] = {"num_shards": 1, "runner": self.test_runner_type} + + for name, config in configs.items(): + for shard in range(1, config["num_shards"] + 1): + test_jobs.append( + { + "id": f"test_{name}_{shard}_{config['num_shards']}", + "name": f"test ({name}, {shard}, {config['num_shards']}, {config['runner']})", + "config": name, + "shard": shard, + "num_shards": config["num_shards"], + "runner": config["runner"], + } + ) + + if self.enable_default_test: + for shard in range(1, self.num_test_shards + 1): + test_jobs.append( + { + "id": f"test_default_{shard}_{config['num_shards']}", + "name": f"test (default, {shard}, {self.num_test_shards}, {self.test_runner_type})", + "config": "default", + "shard": shard, + "num_shards": self.num_test_shards, + "runner": self.test_runner_type, + } + ) + return test_jobs + @dataclass class DockerWorkflow: build_environment: str @@ -327,17 +396,30 @@ WINDOWS_WORKFLOWS = [ labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CPU, LABEL_CIFLOW_WIN} ), ), + CIWorkflow( + arch="windows", + build_environment="win-vs2019-cuda11.3-py3-smoke", + cuda_version="11.3", + test_runner_type=WINDOWS_CUDA_TEST_RUNNER, + enable_default_test=False, + enable_smoke_test=True, + enable_force_on_cpu_test=True, + only_on_pr=True, + ciflow_config=CIFlowConfig( + run_on_canary=True, + labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} + ), + ), CIWorkflow( arch="windows", build_environment="win-vs2019-cuda11.3-py3", cuda_version="11.3", test_runner_type=WINDOWS_CUDA_TEST_RUNNER, num_test_shards=2, - only_run_smoke_tests_on_pull_request=True, - enable_force_on_cpu_test=1, + enable_force_on_cpu_test=True, ciflow_config=CIFlowConfig( run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} + labels={LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} ), ), CIWorkflow( @@ -346,7 +428,7 @@ WINDOWS_WORKFLOWS = [ cuda_version="11.5", test_runner_type=WINDOWS_CUDA_TEST_RUNNER, num_test_shards=2, - enable_force_on_cpu_test=1, + enable_force_on_cpu_test=True, is_scheduled="45 4,10,16,22 * * *", ciflow_config=CIFlowConfig( run_on_canary=True, @@ -372,9 +454,9 @@ LINUX_WORKFLOWS = [ build_environment="linux-xenial-py3.7-gcc5.4", docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3.7-gcc5.4", test_runner_type=LINUX_CPU_TEST_RUNNER, - enable_jit_legacy_test=1, - enable_backwards_compat_test=1, - enable_docs_test=1, + enable_jit_legacy_test=True, + enable_backwards_compat_test=True, + enable_docs_test=True, num_test_shards=2, ciflow_config=CIFlowConfig( run_on_canary=True, @@ -475,7 +557,7 @@ LINUX_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-asan", test_runner_type=LINUX_CPU_TEST_RUNNER, num_test_shards=3, - distributed_test=False, + enable_distributed_test=False, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_SANITIZERS, LABEL_CIFLOW_CPU}, ), @@ -486,7 +568,7 @@ LINUX_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-py3-clang7-onnx", test_runner_type=LINUX_CPU_TEST_RUNNER, num_test_shards=2, - distributed_test=False, + enable_distributed_test=False, 
ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_ONNX, LABEL_CIFLOW_CPU}, ), @@ -496,11 +578,11 @@ LINUX_WORKFLOWS = [ build_environment="linux-bionic-cuda10.2-py3.9-gcc7", docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-cuda10.2-cudnn7-py3.9-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, - enable_jit_legacy_test=1, - enable_multigpu_test=1, - enable_nogpu_no_avx_test=1, - enable_nogpu_no_avx2_test=1, - enable_slow_test=1, + enable_jit_legacy_test=True, + enable_multigpu_test=True, + enable_nogpu_no_avx_test=True, + enable_nogpu_no_avx2_test=True, + enable_slow_test=True, num_test_shards=2, ciflow_config=CIFlowConfig( run_on_canary=True, @@ -623,8 +705,8 @@ LINUX_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9", test_runner_type=LINUX_CPU_TEST_RUNNER, num_test_shards=2, - distributed_test=False, - enable_noarch_test=1, + enable_distributed_test=False, + enable_noarch_test=True, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_NOARCH}, ), @@ -635,7 +717,7 @@ LINUX_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-bionic-py3.7-clang9", test_runner_type=LINUX_CPU_TEST_RUNNER, num_test_shards=1, - distributed_test=False, + enable_distributed_test=False, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_VULKAN}, ), @@ -646,7 +728,7 @@ LINUX_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7", test_runner_type=LINUX_CUDA_TEST_RUNNER, num_test_shards=2, - distributed_test=False, + enable_distributed_test=False, timeout_after=360, # Only run this on master 4 times per day since it does take a while is_scheduled="0 */4 * * *", @@ -663,8 +745,9 @@ XLA_WORKFLOWS = [ docker_image_base=f"{DOCKER_REGISTRY}/pytorch/xla_base", test_runner_type=LINUX_CPU_TEST_RUNNER, num_test_shards=2, - distributed_test=False, - enable_xla_test=1, + enable_distributed_test=False, + enable_xla_test=True, + enable_default_test=False, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_LINUX, LABEL_CIFLOW_CPU, LABEL_CIFLOW_XLA}, ), @@ -801,7 +884,7 @@ MACOS_WORKFLOWS = [ xcode_version="12.4", test_runner_type=MACOS_TEST_RUNNER_11, num_test_shards=2, - distributed_test=False, + enable_distributed_test=False, ciflow_config=CIFlowConfig( labels={LABEL_CIFLOW_MACOS}, ), diff --git a/.github/scripts/generate_pytorch_test_matrix.py b/.github/scripts/generate_pytorch_test_matrix.py deleted file mode 100755 index 4cbc20691a1..00000000000 --- a/.github/scripts/generate_pytorch_test_matrix.py +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/env python3 - -"""Generates a matrix to be utilized through github actions - -Will output a matrix to represent our testing configurations, which is currently -dictated by just sharding. - -""" - -import json -import os -from typing import Dict - -from typing_extensions import TypedDict - - -BUILD_ENVIRONMENT = os.getenv('BUILD_ENVIRONMENT') -assert BUILD_ENVIRONMENT is not None - -class Config(TypedDict): - num_shards: int - runner: str - - -# When the user specifies labels that are NOT ciflow/default, the expectation is -# that the workflows should be triggered as if they are on trunk. For example, when -# ciflow/all is specified, we should run the full test suite for Windows CUDA -# and NOT only the smoke tests. 
-def run_as_if_on_trunk() -> bool: - ON_PULL_REQUEST = os.getenv('GITHUB_HEAD_REF') - if not ON_PULL_REQUEST: - return True - - from pathlib import Path - GITHUB_DIR = Path(__file__).resolve().parent.parent - - with open(f'{GITHUB_DIR}/generated-ciflow-ruleset.json') as f: - labels_to_workflows = json.load(f)['label_rules'] - - pr_labels = json.loads(os.getenv('PR_LABELS', '[]')) - current_workflow_triggered_by_label = False - for label in pr_labels: - if label != 'ciflow/default' and label in labels_to_workflows: - workflows_triggered_by_label = labels_to_workflows[label] - if any([BUILD_ENVIRONMENT in workflow for workflow in workflows_triggered_by_label]): - current_workflow_triggered_by_label = True - break - - return current_workflow_triggered_by_label - -def main() -> None: - INCLUDE_DEFAULT_TEST = True - TEST_RUNNER_TYPE = os.getenv('TEST_RUNNER_TYPE') - assert TEST_RUNNER_TYPE is not None - RUN_SMOKE_TESTS_ONLY_ON_PR = os.getenv('RUN_SMOKE_TESTS_ONLY_ON_PR') - RUN_SMOKE_TESTS = RUN_SMOKE_TESTS_ONLY_ON_PR == "true" and not run_as_if_on_trunk() - NUM_TEST_SHARDS_ON_PULL_REQUEST = os.getenv('NUM_TEST_SHARDS_ON_PULL_REQUEST') - NUM_TEST_SHARDS = int(os.getenv('NUM_TEST_SHARDS', '0')) - if not run_as_if_on_trunk() and NUM_TEST_SHARDS_ON_PULL_REQUEST: - NUM_TEST_SHARDS = int(NUM_TEST_SHARDS_ON_PULL_REQUEST) - MULTIGPU_RUNNER_TYPE = os.getenv('MULTIGPU_RUNNER_TYPE') - DISTRIBUTED_GPU_RUNNER_TYPE = os.getenv('DISTRIBUTED_GPU_RUNNER_TYPE', TEST_RUNNER_TYPE) - NOGPU_RUNNER_TYPE = os.getenv('NOGPU_RUNNER_TYPE') - configs: Dict[str, Config] = {} - if os.getenv('ENABLE_JIT_LEGACY_TEST'): - configs['jit_legacy'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if MULTIGPU_RUNNER_TYPE is not None and os.getenv('ENABLE_MULTIGPU_TEST'): - configs['multigpu'] = {'num_shards': 1, 'runner': MULTIGPU_RUNNER_TYPE} - if NOGPU_RUNNER_TYPE is not None: - if os.getenv('ENABLE_NOGPU_NO_AVX_TEST'): - configs['nogpu_NO_AVX'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_NOGPU_NO_AVX2_TEST'): - configs['nogpu_NO_AVX2'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_FORCE_ON_CPU_TEST'): - configs['force_on_cpu'] = {'num_shards': 1, 'runner': NOGPU_RUNNER_TYPE} - if os.getenv('ENABLE_DISTRIBUTED_TEST'): - configs['distributed'] = { - 'num_shards': 1, - 'runner': DISTRIBUTED_GPU_RUNNER_TYPE if "cuda" in str(BUILD_ENVIRONMENT) else TEST_RUNNER_TYPE - } - if os.getenv('ENABLE_SLOW_TEST'): - configs['slow'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_DOCS_TEST'): - configs['docs_test'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_BACKWARDS_COMPAT_TEST'): - configs['backwards_compat'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if os.getenv('ENABLE_XLA_TEST'): - configs['xla'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - INCLUDE_DEFAULT_TEST = False - if os.getenv('ENABLE_NOARCH_TEST'): - configs['noarch'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - if RUN_SMOKE_TESTS: - configs['smoke_tests'] = {'num_shards': 1, 'runner': TEST_RUNNER_TYPE} - matrix = { - 'include': [ - { - 'config': 'default', - 'shard': shard, - 'num_shards': NUM_TEST_SHARDS, - 'runner': TEST_RUNNER_TYPE, - } - for shard in range(1, NUM_TEST_SHARDS + 1) - if INCLUDE_DEFAULT_TEST - ] + [ - { - 'config': name, - 'shard': shard, - 'num_shards': config['num_shards'], - 'runner': config['runner'], - } - for name, config in configs.items() - for shard in range(1, config['num_shards'] + 1) - ] - } - render_matrix = {'config': 
list(dict.fromkeys(x['config'] for x in matrix['include']))} - print(json.dumps({'matrix': matrix, 'render-matrix': render_matrix}, indent=2)) - print(f'::set-output name=matrix::{json.dumps(matrix)}') - print(f'::set-output name=render-matrix::{json.dumps(render_matrix)}') - - -if __name__ == "__main__": - main() diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 123d498363f..855917e0742 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -219,13 +219,12 @@ concurrency: {%- endif %} {%- endmacro -%} -{%- macro upload_downloaded_files(name, artifact_name="", use_s3=True, when="always()") -%} +{%- macro upload_downloaded_files(name, config=None, shard=None, num_shards=None, runner=None, artifact_name="", use_s3=True, when="always()") -%} - name: Zip JSONs for upload if: !{{ when }} env: {%- if name == 'linux' or name == 'windows' or name == 'macos' %} - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' -{%- else %} + FILE_SUFFIX: '${{ github.job }}-!{{ config }}-!{{ shard }}-!{{ num_shards }}-!{{ runner }}'{%- else %} FILE_SUFFIX: '!{{ name }}-${{ github.job }}' {%- endif %} {%- if name == 'windows' %} @@ -257,12 +256,12 @@ concurrency: test-jsons-*.zip {%- endmacro -%} -{%- macro upload_test_reports(name, artifact_name="", use_s3=True) -%} +{%- macro upload_test_reports(name, config=None, shard=None, num_shards=None, runner=None, artifact_name="", use_s3=True) -%} - name: Zip test reports for upload if: always() env: {%- if name == 'linux' or name == 'windows' or name == 'macos' %} - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-!{{ config }}-!{{ shard }}-!{{ num_shards }}-!{{ runner }}' {%- else %} FILE_SUFFIX: '!{{ name }}-${{ github.job }}' {%- endif %} diff --git a/.github/templates/linux_ci_workflow.yml.j2 b/.github/templates/linux_ci_workflow.yml.j2 index 02a1a8fb026..775394b1789 100644 --- a/.github/templates/linux_ci_workflow.yml.j2 +++ b/.github/templates/linux_ci_workflow.yml.j2 @@ -176,53 +176,18 @@ jobs: {%- endblock %} {%- if not exclude_test %} {% block test +%} - generate-test-matrix: + {%- for test_job in test_jobs %} + !{{ test_job.id }}: + name: !{{ test_job.name }} needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} - ENABLE_JIT_LEGACY_TEST: !{{ enable_jit_legacy_test }} - ENABLE_MULTIGPU_TEST: !{{ enable_multigpu_test }} - ENABLE_NOGPU_NO_AVX_TEST: !{{ enable_nogpu_no_avx_test }} - ENABLE_NOGPU_NO_AVX2_TEST: !{{ enable_nogpu_no_avx2_test }} - ENABLE_SLOW_TEST: !{{ enable_slow_test }} - ENABLE_DOCS_TEST: !{{ enable_docs_test }} - ENABLE_BACKWARDS_COMPAT_TEST: !{{ enable_backwards_compat_test }} - ENABLE_XLA_TEST: !{{ enable_xla_test }} - ENABLE_NOARCH_TEST: !{{ enable_noarch_test }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - MULTIGPU_RUNNER_TYPE: !{{ multigpu_runner_type }} - DISTRIBUTED_GPU_RUNNER_TYPE: !{{ distributed_gpu_runner_type }} - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: !{{ test_job.runner }} timeout-minutes: !{{ common.timeout_minutes }} env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: !{{ build_environment }}-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: !{{ test_job.config }} + SHARD_NUMBER: !{{ test_job.shard }} + NUM_TEST_SHARDS: !{{ test_job.num_shards }} PR_BODY: ${{ github.event.pull_request.body }} steps: {%- if 'rocm' in test_runner_type %} @@ -235,14 +200,12 @@ jobs: run: | !{{ common.add_retry_to_env() }} retry docker pull "${DOCKER_IMAGE}" -{%- if 'rocm' in test_runner_type %} +{%- if 'rocm' in test_runner_type and "nogpu" not in test_job.config %} - name: ROCm set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }} run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" -{%- else %} +{%- elif "cuda" in build_environment and "nogpu" not in test_job.config %} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -365,11 +328,11 @@ jobs: {%- endif %} !{{ common.render_test_results() }} {%- if 'rocm' in test_runner_type %} - !{{ common.upload_downloaded_files(name='linux', use_s3=False) }} - !{{ common.upload_test_reports(name='linux', artifact_name="test-reports", use_s3=False) }} + !{{ common.upload_downloaded_files(name='linux', use_s3=False, config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} + !{{ common.upload_test_reports(name='linux', artifact_name="test-reports", use_s3=False, config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} {%- else %} - !{{ common.upload_downloaded_files(name='linux') }} - !{{ common.upload_test_reports(name='linux') }} + !{{ common.upload_downloaded_files(name='linux', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} + !{{ common.upload_test_reports(name='linux', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} {%- endif %} !{{ common.upload_test_statistics(build_environment) }} {%- if 'rocm' in test_runner_type %} @@ -377,6 +340,7 @@ jobs: {%- else %} !{{ common.teardown_ec2_linux() }} {%- endif %} +{%- endfor %} {% endblock %} {%- endif -%} {%- if enable_doc_jobs %} diff --git a/.github/templates/macos_ci_workflow.yml.j2 b/.github/templates/macos_ci_workflow.yml.j2 index 413df391183..ea7aa370cb5 100644 --- a/.github/templates/macos_ci_workflow.yml.j2 +++ b/.github/templates/macos_ci_workflow.yml.j2 @@ -87,40 +87,17 @@ jobs: {% endblock +%} {%- if not exclude_test %} {% block test +%} - generate-test-matrix: + {%- for test_job in test_jobs %} + !{{ test_job.id }}: + name: !{{ test_job.name }} needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ 
test_runner_type }} - ENABLE_DISTRIBUTED_TEST: !{{ enable_distributed_test }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: !{{ test_job.runner }} timeout-minutes: !{{ common.timeout_minutes }} env: JOB_BASE_NAME: !{{ build_environment }}-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: !{{ test_job.config }} + SHARD_NUMBER: !{{ test_job.shard }} + NUM_TEST_SHARDS: !{{ test_job.num_shards }} PR_BODY: ${{ github.event.pull_request.body }} steps: !{{ common.checkout(submodules="false") }} @@ -143,9 +120,10 @@ jobs: python3 -mpip install dist/*.whl .jenkins/pytorch/macos-test.sh !{{ common.render_test_results() }} - !{{ common.upload_downloaded_files(name='macos', artifact_name="test-jsons", use_s3=False) }} - !{{ common.upload_test_reports("macos", artifact_name="test-reports", use_s3=False) }} + !{{ common.upload_downloaded_files(name='macos', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner, artifact_name="test-jsons", use_s3=False) }} + !{{ common.upload_test_reports("macos", config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner, artifact_name="test-reports", use_s3=False) }} !{{ common.upload_test_statistics(build_environment, needs_credentials=True) }} +{%- endfor %} {% endblock +%} {%- endif %} diff --git a/.github/templates/windows_ci_workflow.yml.j2 b/.github/templates/windows_ci_workflow.yml.j2 index 53f50aa37de..db392d9cbe7 100644 --- a/.github/templates/windows_ci_workflow.yml.j2 +++ b/.github/templates/windows_ci_workflow.yml.j2 @@ -31,11 +31,12 @@ on: - '!{{ label }}/*' {%- endif %} {%- endfor %} -{%- if not is_scheduled %} +{%- if not is_scheduled and not only_on_pr %} branches: - master - release/* -{%- else %} +{%- endif %} +{%- if is_scheduled and not only_on_pr %} schedule: - cron: !{{ is_scheduled }} {%- endif %} @@ -130,46 +131,20 @@ jobs: rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: !{{ common.timeout_minutes }} - env: - TEST_RUNNER_TYPE: !{{ test_runner_type }} - NUM_TEST_SHARDS: !{{ num_test_shards }} - NUM_TEST_SHARDS_ON_PULL_REQUEST: !{{ num_test_shards_on_pull_request }} - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: !{{ enable_force_on_cpu_test }} - RUN_SMOKE_TESTS_ONLY_ON_PR: !{{ only_run_smoke_tests_on_pull_request }} - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - 
- test: + {%- for test_job in test_jobs %} + !{{ test_job.id }}: + name: !{{ test_job.name }} timeout-minutes: !{{ common.timeout_minutes }} env: JOB_BASE_NAME: !{{ build_environment }}-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: !{{ test_job.shard }} + NUM_TEST_SHARDS: !{{ test_job.num_shards }} + TEST_CONFIG: !{{ test_job.config }} http_proxy: "!{{ common.squid_proxy }}" https_proxy: "!{{ common.squid_proxy }}" PR_BODY: ${{ github.event.pull_request.body }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + needs: build + runs-on: !{{ test_job.runner }} steps: !{{ common.display_ec2_information() }} - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" @@ -181,14 +156,12 @@ jobs: shell: powershell run: | .\.circleci\scripts\vs_install.ps1 -{%- if cuda_version != "cpu" %} +{%- if cuda_version != "cpu" and not test_job.config == 'force_on_cpu' %} - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} shell: bash run: | .circleci/scripts/windows_cuda_install.sh - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} shell: bash run: | .circleci/scripts/windows_cudnn_install.sh @@ -215,8 +188,8 @@ jobs: timeout-minutes: 210 run: | .jenkins/pytorch/win-test.sh - !{{ common.upload_downloaded_files(name='windows') }} - !{{ common.upload_test_reports(name='windows') }} + !{{ common.upload_downloaded_files(name='windows', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} + !{{ common.upload_test_reports(name='windows', config=test_job.config, shard=test_job.shard, num_shards=test_job.num_shards, runner=test_job.runner) }} !{{ common.render_test_results() }} !{{ common.wait_and_kill_ssh_windows() }} !{{ common.parse_ref() }} @@ -227,3 +200,4 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* + {%- endfor %} diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index c78dc63a446..e2631900a36 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_jit_legacy_1_1: + name: test (jit_legacy, 1, 1, linux.4xlarge.nvidia.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: 1 - ENABLE_MULTIGPU_TEST: 1 - ENABLE_NOGPU_NO_AVX_TEST: 1 - ENABLE_NOGPU_NO_AVX2_TEST: 1 - ENABLE_SLOW_TEST: 1 - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: 
.github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.4xlarge.nvidia.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: jit_legacy + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,7 +323,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -475,7 +438,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-jit_legacy-1-1-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -491,7 +454,2015 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-jit_legacy-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_multigpu_1_1: + name: test (multigpu, 1, 1, linux.16xlarge.nvidia.gpu) + needs: build + runs-on: linux.16xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: multigpu + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-multigpu-1-1-linux.16xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-multigpu-1-1-linux.16xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_nogpu_NO_AVX_1_1: + name: test (nogpu_NO_AVX, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: nogpu_NO_AVX + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-nogpu_NO_AVX-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-nogpu_NO_AVX-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_nogpu_NO_AVX2_1_1: + name: test (nogpu_NO_AVX2, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: nogpu_NO_AVX2 + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-nogpu_NO_AVX2-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-nogpu_NO_AVX2-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_distributed_1_1: + name: test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) + needs: build + runs-on: linux.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_slow_1_1: + name: test (slow, 1, 1, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: slow + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-slow-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-slow-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml index aa28d07c1b2..65880d1b982 100644 --- a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml @@ -250,53 +250,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_noarch_1_1: + name: test (noarch, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: 1 - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: 
linux-bionic-py3.7-clang9-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: noarch + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,11 +323,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -476,7 +435,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-noarch-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -492,7 +451,751 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-noarch-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-py3.7-clang9-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-py3.7-clang9-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-py3.7-clang9-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-py3.7-clang9-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-py3.7-clang9-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-py3.7-clang9-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml index 5dd6543e1f5..0922d5a9e62 100644 --- a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml +++ b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.rocm.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.rocm.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.rocm.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.rocm.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.rocm.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + 
NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Clean workspace @@ -352,7 +316,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: ROCm set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'rocm') && !contains(matrix.config, 'nogpu') }} run: | echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - name: Determine shm-size @@ -455,7 +418,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.rocm.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -471,7 +434,679 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.rocm.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store Test Reports on Github + if: always() + with: + name: test-reports + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.rocm.gpu) + needs: build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on 
failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home + docker exec -t "${container_name}" sh -c "cd .. 
&& cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" + # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct + docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.rocm.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: actions/upload-artifact@v2 + name: Store Test Downloaded JSONs on Github + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.rocm.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store Test Reports on Github + if: always() + with: + name: test-reports + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.rocm.gpu) + needs: build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: 
always() + run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home + docker exec -t "${container_name}" sh -c "cd .. 
&& cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" + # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct + docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.rocm.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: actions/upload-artifact@v2 + name: Store Test Downloaded JSONs on Github + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.rocm.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store Test Reports on Github + if: always() + with: + name: test-reports + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.rocm.gpu) + needs: build + runs-on: linux.rocm.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: Set DOCKER_HOST + run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Runner health check system info + if: always() + run: | + cat /etc/os-release || true + cat /etc/apt/sources.list.d/rocm.list || true + cat /opt/rocm/.info/version || true + whoami + - name: Runner health check rocm-smi + if: always() + run: | + rocm-smi + - name: Runner health check rocminfo + if: always() + run: | + rocminfo + - name: Runner health check GPU count + if: always() + 
run: | + ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') + if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then + echo "Failed to detect GPUs on the runner" + exit 1 + fi + - name: Runner health check disconnect on failure + if: ${{ failure() }} + run: | + killall runsvc.sh + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: ROCm set GPU_FLAG + run: | + echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home + docker exec -t "${container_name}" sh -c "cd .. 
&& cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" + # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct + docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.rocm.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: actions/upload-artifact@v2 + name: Store Test Downloaded JSONs on Github + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.rocm.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml index e6ea8bde928..e836ddf691b 100644 --- a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml @@ -250,53 +250,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 1 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,11 +323,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull 
"${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -476,7 +435,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -492,7 +451,255 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
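The inline retry helper that the ECR login, docker pull, and chown steps above define runs a command once and, on failure, tries it again after one second and then after two seconds. A minimal standalone bash sketch of the same idea with the attempt count made explicit (the function name retry_n and the linear back-off are illustrative assumptions, not taken from the workflow):

retry_n () {
  # Try the given command up to $1 times, waiting a little longer after each failure.
  local attempts=$1
  shift
  local i
  for ((i = 1; i <= attempts; i++)); do
    "$@" && return 0
    (( i < attempts )) && sleep "$i"
  done
  return 1
}

# Example usage (image name is a placeholder):
retry_n 3 docker pull alpine:3.15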
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-1-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml index eaa71fb310b..8b26d013935 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.8xlarge.nvidia.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ 
matrix.num_shards }} + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,7 +323,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -475,7 +438,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -491,7 +454,763 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
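The Test steps above all follow the same shape: start a long-lived detached container with the checked-out workspace bind-mounted into it, run the actual test command with docker exec so its output streams back to the runner, and rely on a later cleanup step to stop the container. A minimal bash sketch of that pattern with the workflow-specific environment plumbing stripped out (the image, mount path, and echoed command are placeholders; the workflow captures the id into container_name):

# Start a detached container; --tty keeps its default shell alive.
container_id=$(docker run \
  --tty \
  --detach \
  -v "$(pwd):/workspace" \
  -w /workspace \
  alpine:3.15)

# Run the test command inside the already-running container.
docker exec -t "${container_id}" sh -c "echo 'tests would run here'"

# Cleanup, analogous to the 'Kill containers, clean up images' step.
docker stop "${container_id}"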
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
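Each test job above also derives the Docker shared-memory size from the build environment string before launching the container: 2g for CUDA builds, 8g for ROCm builds, and 1g otherwise, exported through GITHUB_ENV as SHM_SIZE. A standalone bash sketch of that mapping (the BUILD_ENVIRONMENT value is a placeholder):

BUILD_ENVIRONMENT="linux-xenial-cuda11.3-example"

shm_size="1g"
case "${BUILD_ENVIRONMENT}" in
  *cuda*) shm_size="2g" ;;
  *rocm*) shm_size="8g" ;;
esac

# The workflow appends this line to "${GITHUB_ENV}" so later steps can pass --shm-size="${SHM_SIZE}".
echo "SHM_SIZE=${shm_size}"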
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
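The "Preserve github env variables for use in docker" steps above snapshot every GITHUB_* variable into a per-run file that the later docker run consumes through --env-file, so the container sees the same GitHub Actions context as the runner. A short bash sketch of that round trip (the temporary path and image are placeholders):

# On the runner: capture the GitHub-provided environment.
env | grep '^GITHUB' > /tmp/github_env_example

# In the container: --env-file re-injects those KEY=VALUE lines.
docker run --rm --env-file /tmp/github_env_example alpine:3.15 sh -c 'env | grep "^GITHUB"'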
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml index 6a002fda318..0b6fea00e1f 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml @@ -250,53 +250,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 3 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ 
needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,11 +323,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -476,7 +435,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -492,7 +451,751 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
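With the generated test matrix removed, every job above now hard-codes its artifact suffix in the form <job>-<config>-<shard>-<num_shards>-<runner> instead of deriving it from matrix variables. A minimal bash sketch of the zip-and-name convention the "Zip test reports" steps use (the suffix value is illustrative):

FILE_SUFFIX="test-smoke_tests-1-1-linux.2xlarge"

# Remove stale archives, then zip only the XML reports under test/.
rm -f test-reports-*.zip
zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml'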
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 3, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 3 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-3-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-3-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 3, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 3 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-3-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-3-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_3_1: + name: test (default, 3, 3, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test + TEST_CONFIG: default + SHARD_NUMBER: 3 + NUM_TEST_SHARDS: 3 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-3-3-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-3-3-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml index 6ebf5dbdd0b..7d49630c027 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml @@ -250,53 +250,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: 
smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,11 +323,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -476,7 +435,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -492,7 +451,503 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml index b1c63e596df..189b085f8ae 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_jit_legacy_1_1: + name: test (jit_legacy, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: 1 - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: 1 - ENABLE_BACKWARDS_COMPAT_TEST: 1 - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: jit_legacy + SHARD_NUMBER: 1 + 
NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -358,11 +322,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -475,7 +434,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-jit_legacy-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -491,7 +450,1495 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-jit_legacy-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_distributed_1_1: + name: test (distributed, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_docs_test_1_1: + name: test (docs_test, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: docs_test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-docs_test-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-docs_test-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
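+    # Teardown: stop any containers still running and prune all Docker images
+    # so the shared runner starts the next job from a clean state.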
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_backwards_compat_1_1: + name: test (backwards_compat, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: backwards_compat + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-backwards_compat-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-backwards_compat-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml index fd394cc99fa..42507986059 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + 
NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -358,11 +322,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -475,7 +434,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -491,7 +450,751 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml index de79f0b3df9..da7c8c0d9ff 100644 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ b/.github/workflows/generated-macos-11-py3-x86-64.yml @@ -85,40 +85,16 @@ jobs: artifacts.zip - generate-test-matrix: + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, macos-11) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: macos-11 - ENABLE_DISTRIBUTED_TEST: '' - NUM_TEST_SHARDS: 2 - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: macos-11 timeout-minutes: 240 env: JOB_BASE_NAME: macos-11-py3-x86-64-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Checkout PyTorch @@ -173,7 +149,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-macos-11' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -190,7 +166,235 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ 
github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-macos-11' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store Test Reports on Github + if: always() + with: + name: test-reports + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: macos-11-py3-x86-64-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_SECRET_ACCESS_KEY }} + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + test_default_1_1: + name: test (default, 1, 2, macos-11) + needs: build + runs-on: macos-11 + timeout-minutes: 240 + env: + JOB_BASE_NAME: macos-11-py3-x86-64-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - uses: actions/download-artifact@v2 + name: Download PyTorch Build Artifacts from GHA + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: . 
+ - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + - name: Install macOS homebrew dependencies + run: | + # Install dependencies + brew install libomp + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + run: | + python3 -mpip install dist/*.whl + .jenkins/pytorch/macos-test.sh + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-macos-11' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: actions/upload-artifact@v2 + name: Store Test Downloaded JSONs on Github + if: always() + with: + name: test-jsons + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-macos-11' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: actions/upload-artifact@v2 + name: Store Test Reports on Github + if: always() + with: + name: test-reports + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: macos-11-py3-x86-64-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_SECRET_ACCESS_KEY }} + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + test_default_2_1: + name: test (default, 2, 2, macos-11) + needs: build + runs-on: macos-11 + timeout-minutes: 240 + env: + JOB_BASE_NAME: macos-11-py3-x86-64-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: false + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - uses: actions/download-artifact@v2 + name: Download PyTorch Build Artifacts from GHA + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: . 
+ - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + python-version: 3.8 + activate-environment: build + - name: Install macOS homebrew dependencies + run: | + # Install dependencies + brew install libomp + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + run: | + python3 -mpip install dist/*.whl + .jenkins/pytorch/macos-test.sh + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-macos-11' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: actions/upload-artifact@v2 + name: Store Test Downloaded JSONs on Github + if: always() + with: + name: test-jsons + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-macos-11' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml index 5c6698e2e59..46e142b53b8 100644 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml @@ -248,53 +248,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 1 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: 
distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -357,11 +321,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -474,7 +433,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -490,7 +449,503 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-1-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml index 2debf29f0cc..a118b22f61a 100644 --- a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml @@ -247,53 +247,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.8xlarge.nvidia.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test - TEST_CONFIG: ${{ matrix.config }} - 
SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -357,7 +321,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -473,7 +436,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -489,7 +452,763 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml index 2f03e0f409e..c3bee6a3aa2 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml @@ -249,53 +249,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - 
runs-on: ${{ matrix.runner }} + runs-on: linux.4xlarge.nvidia.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -359,7 +323,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -475,7 +438,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -491,7 +454,511 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 360 minutes + timeout-minutes: 360 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 360 minutes + timeout-minutes: 360 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml index 13a1cf744e3..6fe981b2ff1 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml @@ -248,53 +248,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_distributed_1_1: + name: test (distributed, 1, 1, linux.8xlarge.nvidia.gpu) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.4xlarge.nvidia.gpu - ENABLE_DISTRIBUTED_TEST: 1 - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: '' - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} 
+ runs-on: linux.8xlarge.nvidia.gpu timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test - TEST_CONFIG: ${{ matrix.config }} - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: distributed + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -358,7 +322,6 @@ jobs: } retry docker pull "${DOCKER_IMAGE}" - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} run: | bash .github/scripts/install_nvidia_utils_linux.sh echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" @@ -474,7 +437,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -490,7 +453,763 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-linux.8xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_1_1: + name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + TEST_CONFIG: default + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_default_2_1: + name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) + needs: build + runs-on: linux.4xlarge.nvidia.gpu + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test + TEST_CONFIG: default + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + run: | + bash .github/scripts/install_nvidia_utils_linux.sh + echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + 
${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-linux.4xlarge.nvidia.gpu' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 22da8807cfd..a24e8f22446 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -131,47 +131,19 @@ jobs: run: | rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: + test_distributed_1_1: + name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 env: JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: distributed http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" PR_BODY: ${{ 
github.event.pull_request.body }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu steps: - name: Display EC2 information shell: bash @@ -206,12 +178,10 @@ jobs: run: | .\.circleci\scripts\vs_install.ps1 - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} shell: bash run: | .circleci/scripts/windows_cuda_install.sh - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} shell: bash run: | .circleci/scripts/windows_cudnn_install.sh @@ -240,7 +210,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' shell: powershell run: | # -ir => recursive include all files in pattern @@ -256,7 +226,481 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: smoke_tests + http_proxy: 
"http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to 
default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_1_1: + name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: 
powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_2_1: + name: test (default, 2, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: 
Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' shell: powershell run: | # -ir => recursive include all files in pattern diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml index 2cb44ccf28f..f11536461b6 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml @@ -131,47 +131,19 @@ jobs: run: | rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: 1 - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: 
Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: + test_force_on_cpu_1_1: + name: test (force_on_cpu, 1, 1, windows.4xlarge) timeout-minutes: 240 env: JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: force_on_cpu http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" PR_BODY: ${{ github.event.pull_request.body }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + needs: build + runs-on: windows.4xlarge steps: - name: Display EC2 information shell: bash @@ -205,16 +177,6 @@ jobs: shell: powershell run: | .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b name: Download PyTorch Build Artifacts with: @@ -240,7 +202,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' shell: powershell run: | # -ir => recursive include all files in pattern @@ -256,7 +218,639 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively 
support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_distributed_1_1: + name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: distributed + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - 
uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: smoke_tests + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: 
Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ 
steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_1_1: + name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + 
shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_2_1: + name: test (default, 2, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # 
Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' shell: powershell run: | # -ir => recursive include all files in pattern diff --git a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml index 2186e05ecbe..ecde072b15d 100644 --- a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml +++ b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml @@ -215,53 +215,17 @@ jobs: # Prune all of the docker images docker system prune -af - generate-test-matrix: + test_xla_1_1: + name: test (xla, 1, 1, linux.2xlarge) needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: linux.2xlarge - ENABLE_DISTRIBUTED_TEST: '' - ENABLE_JIT_LEGACY_TEST: '' - ENABLE_MULTIGPU_TEST: '' - ENABLE_NOGPU_NO_AVX_TEST: '' - ENABLE_NOGPU_NO_AVX2_TEST: '' - ENABLE_SLOW_TEST: '' - ENABLE_DOCS_TEST: '' - ENABLE_BACKWARDS_COMPAT_TEST: '' - ENABLE_XLA_TEST: 1 - ENABLE_NOARCH_TEST: '' - NUM_TEST_SHARDS: 2 - MULTIGPU_RUNNER_TYPE: linux.16xlarge.nvidia.gpu - DISTRIBUTED_GPU_RUNNER_TYPE: linux.8xlarge.nvidia.gpu - NOGPU_RUNNER_TYPE: linux.2xlarge - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + runs-on: linux.2xlarge timeout-minutes: 240 env: DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test - TEST_CONFIG: ${{ matrix.config }} 
- SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} + TEST_CONFIG: xla + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 PR_BODY: ${{ github.event.pull_request.body }} steps: - name: Display EC2 information @@ -324,11 +288,6 @@ jobs: "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") } retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - if: ${{ contains(env.BUILD_ENVIRONMENT, 'cuda') && !contains(matrix.config, 'nogpu') }} - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - name: Determine shm-size run: | shm_size="1g" @@ -442,7 +401,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-xla-1-1-linux.2xlarge' run: | # Remove any previous test jsons if they exist rm -f test-jsons-*.zip @@ -458,7 +417,256 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-xla-1-1-linux.2xlarge' + run: | + # Remove any previous test reports if they exist + rm -f test-reports-*.zip + zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Hold runner for 2 hours or until ssh sessions have drained + # Always hold for active ssh sessions + if: always() + run: .github/scripts/wait_for_ssh_to_drain.sh + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Kill containers, clean up images + if: always() + run: | + # ignore expansion of "docker ps -q" since it could be empty + # shellcheck disable=SC2046 + docker stop $(docker ps -q) || true + # Prune all of the docker images + docker system prune -af + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, linux.2xlarge) + needs: build + runs-on: linux.2xlarge + timeout-minutes: 240 + env: + DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} + JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test + TEST_CONFIG: smoke_tests + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + PR_BODY: ${{ github.event.pull_request.body }} + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Log in to ECR + env: + AWS_RETRY_MODE: standard + AWS_MAX_ATTEMPTS: 5 + run: | + AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ + --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" + - name: Chown workspace + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${ALPINE_IMAGE}" + # Ensure the working directory gets chowned back to the current user + docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
+ - name: Clean workspace + run: | + rm -rf "${GITHUB_WORKSPACE}" + mkdir "${GITHUB_WORKSPACE}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Preserve github env variables for use in docker + run: | + env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Pull Docker image + run: | + retry () { + "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") + } + retry docker pull "${DOCKER_IMAGE}" + - name: Determine shm-size + run: | + shm_size="1g" + case "${BUILD_ENVIRONMENT}" in + *cuda*) + shm_size="2g" + ;; + *rocm*) + shm_size="8g" + ;; + esac + echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + - name: Unzip artifacts + run: | + unzip -o artifacts.zip + - name: Output disk space left + run: | + sudo df -H + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Test + env: + PR_NUMBER: ${{ github.event.pull_request.number }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # Time out the test phase after 240 minutes + timeout-minutes: 240 + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.jenkins/caffe2/test.sh + else + TEST_COMMAND=.jenkins/pytorch/test.sh + fi + PROXY_ENV= + # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now + # We should investigate whether or not there's a list of hostnames we can add to no_proxy to + # make it so that we shouldn't have to fully disable squid for XLA tests + if [[ $TEST_CONFIG != 'xla' ]]; then + # shellcheck disable=SC2089 + PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" + fi + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ + -e GITHUB_ACTIONS \ + -e IN_CI \ + -e IS_GHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e JOB_BASE_NAME \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e PR_BODY \ + -e PYTORCH_RETRY_TEST_CASES \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + ${PROXY_ENV} \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --ulimit stack=10485760:83886080 \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + 
--ipc=host \ + --shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" + - name: Chown workspace + if: always() + run: | + # Ensure the working directory gets chowned back to the current user + docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' + run: | + # Remove any previous test jsons if they exist + rm -f test-jsons-*.zip + zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' run: | # Remove any previous test reports if they exist rm -f test-reports-*.zip diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 2774ac4b66e..fe37c106670 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -124,47 +124,19 @@ jobs: run: | rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.4xlarge - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 2 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: '' - RUN_SMOKE_TESTS_ONLY_ON_PR: False - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: + test_distributed_1_1: + name: test (distributed, 1, 1, windows.4xlarge) timeout-minutes: 240 env: JOB_BASE_NAME: win-vs2019-cpu-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: distributed http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" PR_BODY: ${{ github.event.pull_request.body }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + needs: build + runs-on: windows.4xlarge steps: - name: Display EC2 information 
shell: bash @@ -223,7 +195,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.4xlarge' shell: powershell run: | # -ir => recursive include all files in pattern @@ -239,7 +211,457 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cpu-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, windows.4xlarge) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cpu-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: smoke_tests + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.4xlarge + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: 
$(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cpu-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ 
github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_1_1: + name: test (default, 1, 2, windows.4xlarge) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cpu-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.4xlarge + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: 
seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cpu-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_2_1: + name: test (default, 2, 2, windows.4xlarge) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cpu-py3-test + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.4xlarge + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - uses: 
seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.4xlarge' shell: powershell run: | # -ir => recursive include all files in pattern diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml new file mode 100644 index 00000000000..64322fc55e2 --- /dev/null +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml @@ -0,0 +1,598 @@ +# @generated DO NOT EDIT MANUALLY +# Template is at: .github/templates/windows_ci_workflow.yml.j2 +# Generation script: .github/scripts/generate_ci_workflows.py +name: win-vs2019-cuda11.3-py3-smoke + +on: + pull_request: + push: + tags: + - 'ciflow/all/*' + - 'ciflow/cuda/*' + - 'ciflow/trunk/*' + - 'ciflow/win/*' + workflow_dispatch: + +env: + BUILD_ENVIRONMENT: win-vs2019-cuda11.3-py3-smoke + BUILD_WHEEL: 1 + MAX_JOBS: 8 + CUDA_VERSION: "11.3" + IN_CI: 1 + IS_GHA: 1 + INSTALL_WINDOWS_SDK: 1 + PYTHON_VERSION: "3.8" + PYTORCH_RETRY_TEST_CASES: 1 + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + SCCACHE_BUCKET: "ossci-compiler-cache" + VC_PRODUCT: "BuildTools" + VC_VERSION: "" + VS_VERSION: "16.8.6" + VC_YEAR: "2019" + ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" + no_proxy: localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TORCH_CUDA_ARCH_LIST: "7.0" + USE_CUDA: 1 + +concurrency: + group: win-vs2019-cuda11.3-py3-smoke-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + build: + runs-on: "windows.4xlarge" + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-build + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + steps: + - name: print labels + run: echo "${PR_LABELS}" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Build + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + BRANCH: ${{ steps.parse-ref.outputs.branch }} + run: | + .jenkins/pytorch/win-build.sh + # Upload to github so that people can click and download artifacts + - name: Upload artifacts to s3 + uses: seemethere/upload-artifact-s3@v3 + with: + retention-days: 14 + if-no-files-found: error + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Cleanup build-results and workspaces + if: always() + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" + rm -rf ./* + test_force_on_cpu_1_1: + name: test (force_on_cpu, 1, 1, windows.4xlarge) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: force_on_cpu + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.4xlarge + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats 
--upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_distributed_1_1: + name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: distributed + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test 
Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: smoke_tests + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: 
Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml 
b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 294d56f7409..98fef6ac396 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -4,7 +4,6 @@ name: win-vs2019-cuda11.3-py3 on: - pull_request: push: tags: - 'ciflow/all/*' @@ -133,47 +132,19 @@ jobs: run: | rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - - generate-test-matrix: - needs: build - runs-on: ubuntu-18.04 - timeout-minutes: 240 - env: - TEST_RUNNER_TYPE: windows.8xlarge.nvidia.gpu - NUM_TEST_SHARDS: 2 - NUM_TEST_SHARDS_ON_PULL_REQUEST: 0 - NOGPU_RUNNER_TYPE: windows.4xlarge - ENABLE_FORCE_ON_CPU_TEST: 1 - RUN_SMOKE_TESTS_ONLY_ON_PR: True - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - render-matrix: ${{ steps.set-matrix.outputs.render-matrix }} - container: - image: python:3.9 - steps: - - name: Install dependencies - run: pip install typing-extensions==3.10 - - name: Clone pytorch/pytorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - - name: Generating test matrix - id: set-matrix - run: .github/scripts/generate_pytorch_test_matrix.py - - test: + test_force_on_cpu_1_1: + name: test (force_on_cpu, 1, 1, windows.4xlarge) timeout-minutes: 240 env: JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test - SHARD_NUMBER: ${{ matrix.shard }} - NUM_TEST_SHARDS: ${{ matrix.num_shards }} - TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: force_on_cpu http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" PR_BODY: ${{ github.event.pull_request.body }} - needs: [build, generate-test-matrix] - strategy: - matrix: ${{ fromJson(needs.generate-test-matrix.outputs.matrix) }} - fail-fast: false - runs-on: ${{ matrix.runner }} + needs: build + runs-on: windows.4xlarge steps: - name: Display EC2 information shell: bash @@ -207,16 +178,6 @@ jobs: shell: powershell run: | .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - if: ${{ matrix.config != 'force_on_cpu' }} - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b name: Download PyTorch Build Artifacts with: @@ -242,7 +203,7 @@ jobs: - name: Zip JSONs for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' shell: powershell run: | # -ir => recursive include all files in pattern @@ -258,7 +219,639 @@ jobs: - name: Zip test reports for upload if: always() env: - FILE_SUFFIX: '${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}' + FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 
rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_distributed_1_1: + name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: distributed + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: 
Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_smoke_tests_1_1: + name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 1 + TEST_CONFIG: smoke_tests + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: 
"http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - 
name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_1_1: + name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test + SHARD_NUMBER: 1 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: 
actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-1-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Reports on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: error + path: + test-reports-*.zip + - name: Install render_test_results dependencies + if: always() + shell: bash + run: | + python3 -m pip install junitparser==2.1.1 rich==10.9.0 + - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" + if: always() + shell: bash + # Encoding is weird on windows, just try to default to utf-8 if possible + env: + PYTHONIOENCODING: "utf-8" + run: | + python3 tools/render_junit.py test/ + - name: Wait until all sessions have drained + shell: powershell + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + - name: Parse ref + id: parse-ref + run: .github/scripts/parse_ref.py + - name: Display and upload test statistics (Click Me) + if: always() + # temporary hack: set CIRCLE_* vars, until we update + # tools/stats/print_test_stats.py to natively support GitHub Actions + env: + AWS_DEFAULT_REGION: us-east-1 + BRANCH: ${{ steps.parse-ref.outputs.branch }} + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + TAG: ${{ steps.parse-ref.outputs.tag }} + WORKFLOW_ID: '${{ github.run_id }}' + shell: bash + run: | + python3 -m pip install -r requirements.txt + python3 -m pip install boto3==1.19.12 + python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test + - name: Cleanup workspace + if: always() + shell: bash + # Should remove the entirety of pytorch-${{ github.run_id }} + run: | + rm -rf ./* + test_default_2_1: + name: test (default, 2, 2, windows.8xlarge.nvidia.gpu) + timeout-minutes: 240 + env: + JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test + SHARD_NUMBER: 2 + NUM_TEST_SHARDS: 2 + TEST_CONFIG: default + http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" + PR_BODY: ${{ github.event.pull_request.body }} + needs: build + runs-on: windows.8xlarge.nvidia.gpu + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata 
endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: seemethere/add-github-ssh-key@v1 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + # deep clone, to allow use of git merge-base + fetch-depth: 0 + submodules: recursive + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + - name: Install Visual Studio 2019 toolchain + shell: powershell + run: | + .\.circleci\scripts\vs_install.ps1 + - name: Install Cuda + shell: bash + run: | + .circleci/scripts/windows_cuda_install.sh + - name: Install Cudnn + shell: bash + run: | + .circleci/scripts/windows_cudnn_install.sh + - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b + name: Download PyTorch Build Artifacts + with: + name: ${{ env.BUILD_ENVIRONMENT }} + path: C:\${{ github.run_id }}\build-results + - name: Check build-results folder + shell: powershell + run: | + tree /F C:\$Env:GITHUB_RUN_ID\build-results + # Needed for coverage in win-test.sh + - uses: actions/setup-python@v2 + name: Setup Python3 + with: + python-version: '3.x' + - name: Test + shell: bash + env: + PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ + # Time out the test phase after 3.5 hours + timeout-minutes: 210 + run: | + .jenkins/pytorch/win-test.sh + - name: Zip JSONs for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' + shell: powershell + run: | + # -ir => recursive include all files in pattern + 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' + - uses: seemethere/upload-artifact-s3@v3 + name: Store Test Downloaded JSONs on S3 + if: always() + with: + retention-days: 14 + if-no-files-found: warn + path: + test-jsons-*.zip + - name: Zip test reports for upload + if: always() + env: + FILE_SUFFIX: '${{ github.job }}-default-2-2-windows.8xlarge.nvidia.gpu' shell: powershell run: | # -ir => recursive include all files in pattern From 209a9488966af9b2bac41f88fffab68a4c424179 Mon Sep 17 00:00:00 2001 From: Rohan Varma Date: Thu, 17 Feb 2022 13:28:44 -0800 Subject: [PATCH 138/199] [Reland][FSDP] Implement apply() (#72925) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72925 Reland with fix to add the owner string in test file ghstack-source-id: 149280348 Test Plan: CI Reviewed By: zhaojuanmao Differential Revision: D34273858 fbshipit-source-id: 2174c1d71fcc5148282d94e375071a50b92114f2 (cherry picked from commit 158762bbb36f9652d93b3f23beca51c319435cc7) --- test/distributed/fsdp/test_fsdp_apply.py | 104 ++++++++++++++++++ .../fsdp/fully_sharded_data_parallel.py | 39 +++++++ 2 files changed, 143 insertions(+) create mode 100644 test/distributed/fsdp/test_fsdp_apply.py diff --git a/test/distributed/fsdp/test_fsdp_apply.py b/test/distributed/fsdp/test_fsdp_apply.py new file mode 100644 index 00000000000..e685c333d93 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_apply.py @@ -0,0 +1,104 @@ +# Owner(s): 
["oncall: distributed"] + +import sys + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.testing._internal.common_distributed import ( + skip_if_lt_x_gpu, +) +from torch.testing._internal.common_fsdp import ( + FSDPTest, + NestedWrappedModule, +) +from torch.testing._internal.common_utils import ( + TEST_WITH_DEV_DBG_ASAN, + run_tests, +) + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + + +class TestApply(FSDPTest): + @property + def world_size(self): + return 2 + + @torch.no_grad() + def _init_linear_weights(self, m): + if type(m) == nn.Linear: + m.weight.fill_(1.0) + m.bias.fill_(1.0) + + @property + def process_group(self): + return dist.distributed_c10d._get_default_group() + + def check_weights(self, fsdp, expected_tensor_fn, check): + with fsdp._summon_full_params(recurse=True): + linear_modules = [ + module for module in fsdp.modules() if type(module) == nn.Linear + ] + for module in linear_modules: + for param in module.parameters(): + expected = expected_tensor_fn(param) + check(param, expected, f"Got {param} but expected {expected}") + + def _check_apply(self, fsdp): + # Assert linear weights are not all 1.0 + self.check_weights( + fsdp, lambda param: torch.empty_like(param).fill_(1.0), self.assertNotEqual + ) + + fsdp.apply(self._init_linear_weights) + + # Ensure all weights are 1.0 + self.check_weights( + fsdp, lambda param: torch.empty_like(param).fill_(1.0), self.assertEqual + ) + + @skip_if_lt_x_gpu(2) + def test_nested_module_apply(self): + """ + Checks apply() modifies weights appropriately on a nested FSDP instance. + """ + nested_module = NestedWrappedModule( + self.process_group, wrap_fsdp=True, wrap_everything=True + ) + fsdp_module = FSDP(nested_module, self.process_group).cuda(self.rank) + self._check_apply(fsdp_module) + + @skip_if_lt_x_gpu(2) + def test_transformer_module_apply(self): + """ + Checks apply() modifies weights appropriately on a wrapped Transformer + module. + """ + transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) + self._check_apply(transformer) + + @skip_if_lt_x_gpu(2) + def test_apply_in_summon_raises_error(self): + """ + Ensures that if user calls apply() on FSDP instance within full param + summon context, appropriate error is raised. + """ + transformer = self._get_wrapped_model(group=self.process_group).cuda(self.rank) + with transformer._summon_full_params(recurse=True): + with self.assertRaisesRegex(ValueError, "expected to be in states"): + transformer.apply(self._init_linear_weights) + + +if __name__ == "__main__": + run_tests() diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index d270230eba1..fe61684b69d 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -290,6 +290,45 @@ class FullyShardedDataParallel(nn.Module): assert isinstance(self._fsdp_wrapped_module, FlattenParamsWrapper) return self._fsdp_wrapped_module + def fsdp_modules(self) -> List["FullyShardedDataParallel"]: + """ + Helper function to return all nested FSDP instances, including self. 
+ """ + fsdp_modules = [] + for module in self.modules(): + if isinstance(module, FullyShardedDataParallel): + fsdp_modules.append(module) + + return fsdp_modules + + def apply(self, fn: Callable[[nn.Module], None]) -> "FullyShardedDataParallel": + r"""Applies ``fn`` recursively to every submodule (as returned by ``.children()``) + as well as self. Typical use includes initializing the parameters of a model + (see also :ref:`nn-init-doc`). + + Compared to ``torch.nn.Module.apply``, this version additionally gathers + the full parameters before applying ``fn``. It should not be called from + within another ``summon_full_params`` context. + + Args: + fn (:class:`Module` -> None): function to be applied to each submodule + + Returns: + Module: self + """ + uninitialized = self._is_root is None + self._assert_state(TrainingState_.IDLE) + with self._summon_full_params(recurse=False): + ret = super().apply(fn) + + # Reset lazy init that might be called by summon_full_params, since + # it could have set is_root incorrectly for non-root FSDP instances. + if uninitialized and self._is_root: + for module in self.fsdp_modules(): + module._reset_lazy_init() + + return ret + # setting two factors 'self.gradient_predivide_factor' # and 'self.gradient_postdivide_factor' to avoid underflow and overflow def _get_gradient_predivide_factor(self, world_size: int) -> float: From 84680423b54b214c6c1ba2437bb0cf7b50817b73 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 17 Feb 2022 13:44:39 -0800 Subject: [PATCH 139/199] Move implementation of CUDA error handling to Exceptions.cpp (#72958) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72958 Please note, that it must not depend on any of the symbols from CUDA libraries Test Plan: Imported from OSS Reviewed By: gchanan Differential Revision: D34296428 Pulled By: malfet fbshipit-source-id: ec46f0b847db39977187a8439e941085fd1dc8f5 (cherry picked from commit 3918339e04aa9ff8ee8d18fed07633f943799978) --- aten/src/ATen/cuda/CUDABlas.cpp | 36 ---------- aten/src/ATen/cuda/Exceptions.cpp | 68 +++++++++++++++++++ .../ATen/native/cuda/linalg/CUDASolver.cpp | 14 ---- 3 files changed, 68 insertions(+), 50 deletions(-) create mode 100644 aten/src/ATen/cuda/Exceptions.cpp diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 34b0214a561..598a05712c6 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -97,42 +97,6 @@ namespace at { namespace cuda { namespace blas { -C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error) { - if (error == CUBLAS_STATUS_SUCCESS) { - return "CUBLAS_STATUS_SUCCESS"; - } - if (error == CUBLAS_STATUS_NOT_INITIALIZED) { - return "CUBLAS_STATUS_NOT_INITIALIZED"; - } - if (error == CUBLAS_STATUS_ALLOC_FAILED) { - return "CUBLAS_STATUS_ALLOC_FAILED"; - } - if (error == CUBLAS_STATUS_INVALID_VALUE) { - return "CUBLAS_STATUS_INVALID_VALUE"; - } - if (error == CUBLAS_STATUS_ARCH_MISMATCH) { - return "CUBLAS_STATUS_ARCH_MISMATCH"; - } - if (error == CUBLAS_STATUS_MAPPING_ERROR) { - return "CUBLAS_STATUS_MAPPING_ERROR"; - } - if (error == CUBLAS_STATUS_EXECUTION_FAILED) { - return "CUBLAS_STATUS_EXECUTION_FAILED"; - } - if (error == CUBLAS_STATUS_INTERNAL_ERROR) { - return "CUBLAS_STATUS_INTERNAL_ERROR"; - } - if (error == CUBLAS_STATUS_NOT_SUPPORTED) { - return "CUBLAS_STATUS_NOT_SUPPORTED"; - } -#ifdef CUBLAS_STATUS_LICENSE_ERROR - if (error == CUBLAS_STATUS_LICENSE_ERROR) { - return "CUBLAS_STATUS_LICENSE_ERROR"; - } -#endif - return ""; -} - /* LEVEL 
3 BLAS FUNCTIONS */ #ifndef USE_ROCM diff --git a/aten/src/ATen/cuda/Exceptions.cpp b/aten/src/ATen/cuda/Exceptions.cpp new file mode 100644 index 00000000000..2821f94d2b7 --- /dev/null +++ b/aten/src/ATen/cuda/Exceptions.cpp @@ -0,0 +1,68 @@ +//NS: CUDACachingAllocator must be included before to get CUDART_VERSION definedi +#include + +#include + +namespace at { +namespace cuda { +namespace blas { + +C10_EXPORT const char* _cublasGetErrorEnum(cublasStatus_t error) { + if (error == CUBLAS_STATUS_SUCCESS) { + return "CUBLAS_STATUS_SUCCESS"; + } + if (error == CUBLAS_STATUS_NOT_INITIALIZED) { + return "CUBLAS_STATUS_NOT_INITIALIZED"; + } + if (error == CUBLAS_STATUS_ALLOC_FAILED) { + return "CUBLAS_STATUS_ALLOC_FAILED"; + } + if (error == CUBLAS_STATUS_INVALID_VALUE) { + return "CUBLAS_STATUS_INVALID_VALUE"; + } + if (error == CUBLAS_STATUS_ARCH_MISMATCH) { + return "CUBLAS_STATUS_ARCH_MISMATCH"; + } + if (error == CUBLAS_STATUS_MAPPING_ERROR) { + return "CUBLAS_STATUS_MAPPING_ERROR"; + } + if (error == CUBLAS_STATUS_EXECUTION_FAILED) { + return "CUBLAS_STATUS_EXECUTION_FAILED"; + } + if (error == CUBLAS_STATUS_INTERNAL_ERROR) { + return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + if (error == CUBLAS_STATUS_NOT_SUPPORTED) { + return "CUBLAS_STATUS_NOT_SUPPORTED"; + } +#ifdef CUBLAS_STATUS_LICENSE_ERROR + if (error == CUBLAS_STATUS_LICENSE_ERROR) { + return "CUBLAS_STATUS_LICENSE_ERROR"; + } +#endif + return ""; +} + +} // namespace blas + +#ifdef CUDART_VERSION +namespace solver { + +C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status) { + switch (status) { + case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCES"; + case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: return "Unknown cusolver error number"; + } +} + +} // namespace solver +#endif + +}} // namespace at::cuda diff --git a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp index 036cdd329e3..6cba66af01a 100644 --- a/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp +++ b/aten/src/ATen/native/cuda/linalg/CUDASolver.cpp @@ -10,20 +10,6 @@ namespace at { namespace cuda { namespace solver { -C10_EXPORT const char* cusolverGetErrorMessage(cusolverStatus_t status) { - switch (status) { - case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCES"; - case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; - case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED"; - case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE"; - case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH"; - case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED"; - case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR"; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - default: return "Unknown cusolver error number"; - } -} - template <> void getrf( cusolverDnHandle_t handle, int m, int n, 
double* dA, int ldda, int* ipiv, int* info) { From 1522912602bc4cc5f7adbce66cad00ebb436f195 Mon Sep 17 00:00:00 2001 From: Ansley Ussery Date: Thu, 17 Feb 2022 13:45:54 -0800 Subject: [PATCH 140/199] Port `mse_loss` to structured (#72294) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72294 Reviewed By: cpuhrsch Differential Revision: D34306555 Pulled By: ansley fbshipit-source-id: 1c626cec2c1dfc09f462dffe0a3de74b97989f24 (cherry picked from commit d338edb477b0c035a53f845f73092bd1136f96d4) --- aten/src/ATen/native/BinaryOps.h | 2 +- aten/src/ATen/native/Loss.cpp | 51 ++++++++++--------- aten/src/ATen/native/cpu/BinaryOpsKernel.cpp | 2 +- .../ATen/native/cuda/BinaryMiscOpsKernels.cu | 2 +- aten/src/ATen/native/native_functions.yaml | 5 +- 5 files changed, 33 insertions(+), 29 deletions(-) diff --git a/aten/src/ATen/native/BinaryOps.h b/aten/src/ATen/native/BinaryOps.h index aea2a125bb0..4bdf587f0bd 100644 --- a/aten/src/ATen/native/BinaryOps.h +++ b/aten/src/ATen/native/BinaryOps.h @@ -84,7 +84,7 @@ DECLARE_DISPATCH(binary_fn_double, huber_stub); DECLARE_DISPATCH(structured_binary_fn, sigmoid_backward_stub); DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub); DECLARE_DISPATCH(structured_binary_fn, tanh_backward_stub); -DECLARE_DISPATCH(binary_fn, mse_stub); +DECLARE_DISPATCH(structured_binary_fn, mse_stub); DECLARE_DISPATCH(structured_binary_fn, fmod_stub); DECLARE_DISPATCH(structured_binary_fn, logaddexp_stub); DECLARE_DISPATCH(structured_binary_fn, logaddexp2_stub); diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 414a2bcb9fd..07b675ea99e 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -39,6 +39,17 @@ TORCH_META_FUNC(smooth_l1_loss) maybe_get_output().resize_({}); } +TORCH_META_FUNC(mse_loss) +(const Tensor& input, const Tensor& target, const int64_t reduction) { + build_borrowing_binary_op(maybe_get_output(), input, target); + if (reduction == Reduction::None) { + return; + } + + TORCH_INTERNAL_ASSERT(reduction == Reduction::Mean || reduction == Reduction::Sum); + maybe_get_output().resize_({}); +} + } // namespace meta namespace native { @@ -70,6 +81,22 @@ TORCH_IMPL_FUNC(smooth_l1_loss_out) } } +TORCH_IMPL_FUNC(mse_loss_out) +(const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& result) { + if (reduction != Reduction::None) { + Tensor loss; + auto iter = TensorIterator::borrowing_binary_op(loss, input, target); + mse_stub(iter.device_type(), iter); + if (reduction == Reduction::Mean) { + at::mean_out(const_cast(result), iter.output(), IntArrayRef{}); + } else { + at::sum_out(const_cast(result), iter.output(), IntArrayRef{}); + } + } else { + mse_stub(device_type(), *this); + } +} + Tensor cosine_embedding_loss(const Tensor& input1, const Tensor& input2, const Tensor& target, double margin, int64_t reduction) { auto targ_dim = target.dim(); TORCH_CHECK( @@ -454,30 +481,6 @@ Tensor& huber_loss_backward_out(const Tensor& grad_output, const Tensor& input, return grad_input; } -Tensor mse_loss(const Tensor& input, const Tensor& target, int64_t reduction) { - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - mse_stub(iter.device_type(), iter); - return apply_loss_reduction(iter.output(), reduction); -} - -Tensor& mse_loss_out(const Tensor& input, const Tensor& target, int64_t reduction, Tensor&result) { - if (reduction != Reduction::None) { - Tensor loss; - auto iter = TensorIterator::borrowing_binary_op(loss, input, target); - 
mse_stub(iter.device_type(), iter); - if (reduction == Reduction::Mean) { - at::mean_out(result, iter.output(), 0); - } else { - at::sum_out(result, iter.output(), 0); - } - } else { - auto iter = TensorIterator::borrowing_binary_op(result, input, target); - mse_stub(iter.device_type(), iter); - } - return result; -} - Tensor mse_loss_backward(const Tensor& grad_output, const Tensor& input, const Tensor& target, int64_t reduction) { Tensor grad_input = at::zeros_like(input, LEGACY_CONTIGUOUS_MEMORY_FORMAT); return at::mse_loss_backward_out(grad_input, grad_output, input, target, reduction); diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index f2410947de1..d383849e290 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -836,7 +836,7 @@ void tanh_backward_kernel(TensorIteratorBase& iter) { } } -void mse_kernel(TensorIterator& iter) { +void mse_kernel(TensorIteratorBase& iter) { if (iter.dtype() == ScalarType::Half) { TORCH_WARN_ONCE("Applying the CPU mse kernel on half-type tensors. " "This may be slower than using float or double-type tensors."); diff --git a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu index f72ddfa4bfe..703436a1d49 100644 --- a/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu +++ b/aten/src/ATen/native/cuda/BinaryMiscOpsKernels.cu @@ -32,7 +32,7 @@ void huber_kernel_cuda(TensorIterator& iter, double delta) { }); } -void mse_kernel_cuda(TensorIterator& iter) { +void mse_kernel_cuda(TensorIteratorBase& iter) { AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "mse_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { auto diff = a - b; diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 0598f1cedfa..8f06ed8ae95 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -8592,15 +8592,16 @@ - func: mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) device_check: NoCheck # TensorIterator + structured: True + structured_inherits: TensorIteratorBase python_module: nn dispatch: CPU, CUDA: mse_loss_out - func: mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor device_check: NoCheck # TensorIterator + structured_delegate: mse_loss.out python_module: nn - dispatch: - CPU, CUDA: mse_loss - func: mse_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!) 
python_module: nn From af3ca50291110014580d7fc494ef1ebd75df5aba Mon Sep 17 00:00:00 2001 From: vfdev Date: Thu, 17 Feb 2022 14:35:42 -0800 Subject: [PATCH 141/199] Fixed docstring typo for nn.Module.get_submodule (#73018) Summary: Description: - Fixed docstring typo for nn.Module.get_submodule otherwise output is invisible: https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.get_submodule Pull Request resolved: https://github.com/pytorch/pytorch/pull/73018 Reviewed By: davidberard98 Differential Revision: D34310091 Pulled By: jbschlosser fbshipit-source-id: e35aef2b7479bdd81fb6b7ddd203bd71798769e1 (cherry picked from commit e4944e1f8e5779667ed98f1278150c5d46773835) --- torch/nn/modules/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index d712f251f32..20bf2e3f739 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -404,7 +404,7 @@ class Module: For example, let's say you have an ``nn.Module`` ``A`` that looks like this: - .. code-block::text + .. code-block:: text A( (net_b): Module( From f670179c0a0d7c62118c7691f045ab29183c104b Mon Sep 17 00:00:00 2001 From: Joel Schlosser Date: Thu, 17 Feb 2022 14:35:50 -0800 Subject: [PATCH 142/199] Fix doc regressions for various modules and functional forms (#73014) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73014 Fixes #72501 Fixes #72502 Fixes #72503 Fixes #72504 Fixes #72505 Fixes #72506 Fixes #72507 Fixes #72509 Fixes #72510 Test Plan: Imported from OSS Reviewed By: albanD Differential Revision: D34305640 Pulled By: jbschlosser fbshipit-source-id: 62f341633fdb0316eaa346cf7247865290eb830a (cherry picked from commit 8362d264e7b2c0c2bd5d688a87bf4f8f0bf60f0f) --- torch/nn/functional.py | 5 ++--- torch/nn/modules/conv.py | 4 ++-- torch/nn/modules/distance.py | 2 +- torch/nn/modules/fold.py | 2 +- torch/nn/modules/loss.py | 2 +- torch/nn/modules/pooling.py | 4 ++-- 6 files changed, 9 insertions(+), 10 deletions(-) diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 1c6dd93685b..a52bb6d5571 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -1980,7 +1980,7 @@ This operator supports :ref:`TensorFloat32`. Shape: - Input: :math:`(*, in\_features)` where `*` means any number of - additional dimensions, including none + additional dimensions, including none - Weight: :math:`(out\_features, in\_features)` or :math:`(in\_features)` - Bias: :math:`(out\_features)` or :math:`()` - Output: :math:`(*, out\_features)` or :math:`(*)`, based on the shape of the weight @@ -3471,8 +3471,7 @@ def multi_margin_loss( reduce: Optional[bool] = None, reduction: str = "mean", ) -> Tensor: - r"""multi_margin_loss(input, target, p=1, margin=1, weight=None, size_average=None, - reduce=None, reduction='mean') -> Tensor + r"""multi_margin_loss(input, target, p=1, margin=1, weight=None, size_average=None, reduce=None, reduction='mean') -> Tensor See :class:`~torch.nn.MultiMarginLoss` for details. """ diff --git a/torch/nn/modules/conv.py b/torch/nn/modules/conv.py index ee1abe2e6e3..023a45c462d 100644 --- a/torch/nn/modules/conv.py +++ b/torch/nn/modules/conv.py @@ -506,7 +506,7 @@ class Conv3d(_ConvNd): Shape: - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` or :math:`(C_{in}, D_{in}, H_{in}, W_{in})` - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` or :math:`(C_{out}, D_{out}, H_{out}, W_{out})`, - where + where .. 
math:: D_{out} = \left\lfloor\frac{D_{in} + 2 \times \text{padding}[0] - \text{dilation}[0] @@ -994,7 +994,7 @@ class ConvTranspose3d(_ConvTransposeNd): Shape: - Input: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})` or :math:`(C_{in}, D_{in}, H_{in}, W_{in})` - Output: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})` or - :math:`(C_{out}, D_{out}, H_{out}, W_{out})`, where + :math:`(C_{out}, D_{out}, H_{out}, W_{out})`, where .. math:: D_{out} = (D_{in} - 1) \times \text{stride}[0] - 2 \times \text{padding}[0] + \text{dilation}[0] diff --git a/torch/nn/modules/distance.py b/torch/nn/modules/distance.py index 00513ac2aa0..174659d3d30 100644 --- a/torch/nn/modules/distance.py +++ b/torch/nn/modules/distance.py @@ -21,7 +21,7 @@ class PairwiseDistance(Module): - Input1: :math:`(N, D)` or :math:`(D)` where `N = batch dimension` and `D = vector dimension` - Input2: :math:`(N, D)` or :math:`(D)`, same shape as the Input1 - Output: :math:`(N)` or :math:`()` based on input dimension. - If :attr:`keepdim` is ``True``, then :math:`(N, 1)` or :math:`(1)` based on input dimension. + If :attr:`keepdim` is ``True``, then :math:`(N, 1)` or :math:`(1)` based on input dimension. Examples:: >>> pdist = nn.PairwiseDistance(p=2) >>> input1 = torch.randn(100, 128) diff --git a/torch/nn/modules/fold.py b/torch/nn/modules/fold.py index 6498dea6f91..5c10bd21df2 100644 --- a/torch/nn/modules/fold.py +++ b/torch/nn/modules/fold.py @@ -103,7 +103,7 @@ class Fold(Module): Shape: - Input: :math:`(N, C \times \prod(\text{kernel\_size}), L)` or :math:`(C \times \prod(\text{kernel\_size}), L)` - Output: :math:`(N, C, \text{output\_size}[0], \text{output\_size}[1], \dots)` - or :math:`(C, \text{output\_size}[0], \text{output\_size}[1], \dots)` as described above + or :math:`(C, \text{output\_size}[0], \text{output\_size}[1], \dots)` as described above Examples:: diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py index b77bfb47b39..af1cf55e32d 100644 --- a/torch/nn/modules/loss.py +++ b/torch/nn/modules/loss.py @@ -1451,7 +1451,7 @@ class TripletMarginLoss(_Loss): Shape: - Input: :math:`(N, D)` or :math`(D)` where :math:`D` is the vector dimension. - Output: A Tensor of shape :math:`(N)` if :attr:`reduction` is ``'none'`` and - input shape is :math`(N, D)`; a scalar otherwise. + input shape is :math`(N, D)`; a scalar otherwise. Examples:: diff --git a/torch/nn/modules/pooling.py b/torch/nn/modules/pooling.py index 845029a5ee1..e3d2d494677 100644 --- a/torch/nn/modules/pooling.py +++ b/torch/nn/modules/pooling.py @@ -508,7 +508,7 @@ class AvgPool1d(_AvgPoolNd): count_include_pad: when True, will include the zero-padding in the averaging calculation Shape: - - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in}`. + - Input: :math:`(N, C, L_{in})` or :math:`(C, L_{in})`. - Output: :math:`(N, C, L_{out})` or :math:`(C, L_{out})`, where .. math:: @@ -1192,7 +1192,7 @@ class AdaptiveAvgPool3d(_AdaptiveAvgPoolNd): be the same as that of the input. Shape: - - Input: :math:`(N, C, H_{in}, W_{in})` or :math:`(C, H_{in}, W_{in})`. + - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` or :math:`(C, D_{in}, H_{in}, W_{in})`. - Output: :math:`(N, C, S_{0}, S_{1}, S_{2})` or :math:`(C, S_{0}, S_{1}, S_{2})`, where :math:`S=\text{output\_size}`. 
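
As a quick sanity check of the corrected AdaptiveAvgPool3d Shape entry above, a minimal sketch (assuming a recent PyTorch build; the tensor sizes are arbitrary example values, not taken from the patch):

    import torch
    import torch.nn as nn

    # Input is (N, C, D_in, H_in, W_in), as the fixed docstring now states.
    x = torch.randn(8, 16, 10, 12, 14)
    pool = nn.AdaptiveAvgPool3d(output_size=(2, 3, 4))  # S = (S_0, S_1, S_2)
    y = pool(x)
    print(y.shape)  # torch.Size([8, 16, 2, 3, 4]) -> (N, C, S_0, S_1, S_2)
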
From 6d33852685640899539aff5fe9666158e95c31af Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Thu, 17 Feb 2022 15:09:59 -0800 Subject: [PATCH 143/199] [NNC] TensorExprKernel state should not be modified on calls to run methods (#73028) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73028 A typical use case for `TensorExprKernel` is to create the kernel once and call it multiple times, possibly in parallel. For the parallel calls to work, we need to ensure that the run() method calls do not change any state in `TensorExprKernel`. Before this change, the `run()` method was modifying the sizes and strides vectors when dynamic shapes were present. This manifested as a data race when running a model with Static Runtime. ghstack-source-id: 149398820 Test Plan: ``` buck build mode/dev-asan //caffe2/test/cpp/tensorexpr:tensorexpr ./buck-out/dev/gen/caffe2/test/cpp/tensorexpr/tensorexpr --gtest_filter="DynamicShapes.MultiThreadedExecution" ``` Reviewed By: eellison Differential Revision: D34287960 fbshipit-source-id: d311f3c5a66c5d5de4e1deaeaa01816b53e9906e (cherry picked from commit 161568bfae9fc1497a36d6103f49deda001509a4) --- test/cpp/tensorexpr/test_dynamic_shapes.cpp | 60 +++++++++++++++++ torch/csrc/jit/tensorexpr/kernel.cpp | 74 ++++++++++++++------- torch/csrc/jit/tensorexpr/kernel.h | 6 +- 3 files changed, 114 insertions(+), 26 deletions(-) diff --git a/test/cpp/tensorexpr/test_dynamic_shapes.cpp b/test/cpp/tensorexpr/test_dynamic_shapes.cpp index 46b55272ddf..22255cf3826 100644 --- a/test/cpp/tensorexpr/test_dynamic_shapes.cpp +++ b/test/cpp/tensorexpr/test_dynamic_shapes.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -626,5 +627,64 @@ TEST(DynamicShapes, GraphFromModel) { #endif } +TEST(DynamicShapes, MultiThreadedExecution) { +#ifdef TORCH_ENABLE_LLVM + std::shared_ptr graph = std::make_shared(); + const auto graph_string = R"IR( + graph(%x : Float(SS(-2), SS(-3), requires_grad=0, device=cpu), + %y : Float(SS(-2), SS(-3), requires_grad=0, device=cpu), + %SS_2 : int, + %SS_3 : int): + %3 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu) = aten::tanh(%x) + %4 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu) = aten::erf(%3) + %5 : Float(SS(-2), SS(-3), requires_grad=0, device=cpu) = aten::mul(%4, %y) + return (%5))IR"; + torch::jit::parseIR(graph_string, graph.get()); + + std::vector symbolic_shape_inputs = {-2, -3}; + + std::vector input_desc = { + torch::jit::StrideInput::TENSOR_CONT}; + std::unordered_map< + const torch::jit::Value*, + std::vector> + symbolic_strides; + symbolic_strides[graph->inputs().at(0)] = input_desc; + symbolic_strides[graph->inputs().at(1)] = input_desc; + symbolic_strides[graph->outputs().at(0)] = input_desc; + + TensorExprKernel kernel( + graph, {}, symbolic_shape_inputs, false, symbolic_strides); + + auto run_kernel = [&](int dim1, int dim2) { + auto a = + at::rand({dim1, dim2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); + auto b = + at::rand({dim1, dim2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); + + auto ref = at::mul(at::erf(at::tanh(a)), b); + + std::vector stack = fmap(std::vector({a, b})); + stack.emplace_back(dim1); + stack.emplace_back(dim2); + kernel.run(stack); + + auto o = stack[0].toTensor(); + ASSERT_TRUE(at::allclose(o, ref)); + }; + + // Run the kernel in parallel to ensure that the run() method calls in + // TensorExprKernel are not changing any state. 
+ constexpr size_t kNumThreads = 4; + std::vector threads; + for (size_t id = 0; id < kNumThreads; ++id) { + threads.emplace_back(run_kernel, id + 5, id + 20); + } + for (auto& t : threads) { + t.join(); + } +#endif +} + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp index a48c9d07e29..8b9dafaae92 100644 --- a/torch/csrc/jit/tensorexpr/kernel.cpp +++ b/torch/csrc/jit/tensorexpr/kernel.cpp @@ -1651,37 +1651,44 @@ void TensorExprKernel::run(Stack& stack) { } } -void TensorExprKernel::updateOutputSizesAndStrides( - const at::ArrayRef& inputs) { +void TensorExprKernel::getStaticOutputSizesAndStrides( + const at::ArrayRef& inputs, + std::vector>* sizes, + std::vector>* strides) { TORCH_INTERNAL_ASSERT(has_symbolic_shapes_); // If there are symbolic shapes, then the output tensor size wouldn't have // been computed at compile time. That has to be done here by using the // symbolic shape input params passed in to this call. TORCH_INTERNAL_ASSERT( tensorOutputSymbolicSizes_.size() == bufOutputs_.size()); - TORCH_INTERNAL_ASSERT(tensorOutputSizes_.size() == bufOutputs_.size()); - TORCH_INTERNAL_ASSERT(tensorOutputStrides_.size() == bufOutputs_.size()); + + TORCH_INTERNAL_ASSERT(sizes); + TORCH_INTERNAL_ASSERT(strides); + *sizes = tensorOutputSizes_; + *strides = tensorOutputStrides_; + auto& static_sizes = *sizes; + auto& static_strides = *strides; for (size_t i = 0, e = bufOutputs_.size(); i < e; ++i) { - tensorOutputSizes_[i].clear(); + static_sizes[i].clear(); for (auto t : tensorOutputSymbolicSizes_[i]) { if (t.AsNode()) { - tensorOutputSizes_[i].emplace_back(immediateAs(t.node())); + static_sizes[i].emplace_back(immediateAs(t.node())); } else { auto input_pos = shapeSymbolInputPos_.at(t.node()); TORCH_INTERNAL_ASSERT(input_pos < inputs.size()); TORCH_INTERNAL_ASSERT(inputs[input_pos].isInt()); - tensorOutputSizes_[i].emplace_back(inputs[input_pos].toInt()); + static_sizes[i].emplace_back(inputs[input_pos].toInt()); } } if (tensorOutputStrideDesc_[i] == torch::jit::StrideInput::TENSOR_CONT) { - tensorOutputStrides_[i] = - TensorType::contiguousStridesOf(tensorOutputSizes_[i]); + static_strides[i] = TensorType::contiguousStridesOf(static_sizes[i]); + } else if ( tensorOutputStrideDesc_[i] == torch::jit::StrideInput::TENSOR_CONT_CHANNELS_LAST) { - tensorOutputStrides_[i] = - at::get_channels_last_strides_2d(tensorOutputSizes_[i]); + static_strides[i] = at::get_channels_last_strides_2d(static_sizes[i]); + } else { std::string output_desc = toString(tensorOutputStrideDesc_[i]); TORCH_INTERNAL_ASSERT( @@ -1712,7 +1719,9 @@ std::vector TensorExprKernel::prepareRunArgs( } if (has_symbolic_shapes_) { - updateOutputSizesAndStrides(inputs); + std::vector> static_sizes; + std::vector> static_strides; + getStaticOutputSizesAndStrides(inputs, &static_sizes, &static_strides); // add stride args for (const auto& input_stride_arg : input_stride_args_) { @@ -1720,18 +1729,30 @@ std::vector TensorExprKernel::prepareRunArgs( inputs[input_stride_arg.first].toTensor().strides().at( input_stride_arg.second)); } - } - for (size_t i = 0, e = bufOutputs_.size(); i < e; ++i) { - auto const& opts = tensorOutputTensorOptions_[i]; - outputs.emplace_back(codegen_->empty_strided( - tensorOutputSizes_[i], - tensorOutputStrides_[i], - opts.dtype, - opts.layout, - opts.device, - opts.pinned_memory)); - runArgs.emplace_back(outputs.back().data_ptr()); + for (size_t i = 0, e = bufOutputs_.size(); i < e; ++i) { + auto const& opts = 
tensorOutputTensorOptions_[i]; + outputs.emplace_back(codegen_->empty_strided( + static_sizes[i], + static_strides[i], + opts.dtype, + opts.layout, + opts.device, + opts.pinned_memory)); + runArgs.emplace_back(outputs.back().data_ptr()); + } + } else { + for (size_t i = 0, e = bufOutputs_.size(); i < e; ++i) { + auto const& opts = tensorOutputTensorOptions_[i]; + outputs.emplace_back(codegen_->empty_strided( + tensorOutputSizes_[i], + tensorOutputStrides_[i], + opts.dtype, + opts.layout, + opts.device, + opts.pinned_memory)); + runArgs.emplace_back(outputs.back().data_ptr()); + } } for (auto c : constants_) { @@ -1814,7 +1835,10 @@ void TensorExprKernel::runWithAllocatedOutputs(Stack& stack) { std::vector stride_values(input_stride_args_.size()); if (has_symbolic_shapes_) { - updateOutputSizesAndStrides(stack_inputs); + std::vector> static_sizes; + std::vector> static_strides; + getStaticOutputSizesAndStrides( + stack_inputs, &static_sizes, &static_strides); // add stride args for (auto idx : c10::irange(input_stride_args_.size())) { @@ -1830,7 +1854,7 @@ void TensorExprKernel::runWithAllocatedOutputs(Stack& stack) { auto& out = stack_outputs[i].toTensor(); // This has only been tested on CPUs. // TODO: Test on GPUs. - out.resize_(tensorOutputSizes_[i]); + out.resize_(static_sizes[i]); args.emplace_back(out.data_ptr()); } } else { diff --git a/torch/csrc/jit/tensorexpr/kernel.h b/torch/csrc/jit/tensorexpr/kernel.h index c18681fddf7..1b205ca9624 100644 --- a/torch/csrc/jit/tensorexpr/kernel.h +++ b/torch/csrc/jit/tensorexpr/kernel.h @@ -218,7 +218,11 @@ class TORCH_API TensorExprKernel { std::string getCodeGenName(BackendType backendType); - void updateOutputSizesAndStrides(const at::ArrayRef& inputs); + void getStaticOutputSizesAndStrides( + const at::ArrayRef& inputs, + std::vector>* static_sizes, + std::vector>* static_strides); + std::vector prepareRunArgs( const at::ArrayRef& inputs, std::vector& outputs); From 4d642d0dd423f1695cfc8d97fbd2e526ae7c07ae Mon Sep 17 00:00:00 2001 From: Linbin Yu Date: Thu, 17 Feb 2022 23:24:28 +0000 Subject: [PATCH 144/199] add android and ios folder to merge rule Opt-in android and ios folder for the new GHF workflow Pull Request resolved: https://github.com/pytorch/pytorch/pull/73041 --- .github/merge_rules.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/merge_rules.json b/.github/merge_rules.json index 75ee92711ab..eee24234f5e 100644 --- a/.github/merge_rules.json +++ b/.github/merge_rules.json @@ -31,5 +31,17 @@ "patterns": ["docs/**", "torch/*docs.py"], "approved_by": ["mruberry", "ngimel", "albanD", "janeyx99"], "mandatory_app_id": 12274 + }, + { + "name": "Android", + "patterns": ["android/**"], + "approved_by": ["linbinyu", "kit1980", "IvanKobzarev", "malfet"], + "mandatory_app_id": 12274 + }, + { + "name": "iOS", + "patterns": ["ios/**"], + "approved_by": ["linbinyu", "kit1980", "xta0", "malfet", "hanton"], + "mandatory_app_id": 12274 } ] From bac7feb76e7a918a1fd83afc2fde43639d518e9a Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 17 Feb 2022 23:28:13 +0000 Subject: [PATCH 145/199] Remove smoke test functionality to simplify infra Follow up to https://github.com/pytorch/pytorch/issues/73030 Pull Request resolved: https://github.com/pytorch/pytorch/pull/73042 --- .github/generated-ciflow-ruleset.json | 14 +- .github/scripts/generate_ci_workflows.py | 22 +- ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 256 +------- .../generated-linux-bionic-py3.7-clang9.yml | 252 +------- .../generated-linux-bionic-rocm4.5-py3.7.yml 
| 228 +------ ...rated-linux-vulkan-bionic-py3.7-clang9.yml | 248 -------- ...rated-linux-xenial-cuda11.3-py3.7-gcc7.yml | 256 +------- ...nerated-linux-xenial-py3.7-clang7-asan.yml | 254 +------- ...nerated-linux-xenial-py3.7-clang7-onnx.yml | 252 +------- .../generated-linux-xenial-py3.7-gcc5.4.yml | 252 +------- .../generated-linux-xenial-py3.7-gcc7.yml | 252 +------- .../generated-macos-11-py3-x86-64.yml | 118 +--- ...rallelnative-linux-xenial-py3.7-gcc5.4.yml | 248 -------- ...iodic-linux-bionic-cuda11.5-py3.7-gcc7.yml | 256 +------- ...enial-cuda10.2-py3-gcc7-slow-gradcheck.yml | 256 +------- ...linux-xenial-cuda11.1-py3.7-gcc7-debug.yml | 256 +------- ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 162 +---- ...rated-periodic-win-vs2019-cuda11.5-py3.yml | 162 +---- ...-pytorch-xla-linux-bionic-py3.7-clang8.yml | 249 -------- .../generated-win-vs2019-cpu-py3.yml | 154 +---- ...enerated-win-vs2019-cuda11.3-py3-smoke.yml | 598 ------------------ .../generated-win-vs2019-cuda11.3-py3.yml | 163 +---- .../pytorch/win-test-helpers/test_python.bat | 10 +- .../test_python_first_shard.bat | 11 +- .../test_python_second_shard.bat | 11 +- .jenkins/pytorch/win-test.sh | 28 +- 26 files changed, 52 insertions(+), 4916 deletions(-) delete mode 100644 .github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml diff --git a/.github/generated-ciflow-ruleset.json b/.github/generated-ciflow-ruleset.json index 27ccb7d4f06..81abc2237bc 100644 --- a/.github/generated-ciflow-ruleset.json +++ b/.github/generated-ciflow-ruleset.json @@ -44,8 +44,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "pytorch-xla-linux-bionic-py3.7-clang8", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3", - "win-vs2019-cuda11.3-py3-smoke" + "win-vs2019-cuda11.3-py3" ], "ciflow/android": [ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-build", @@ -121,8 +120,7 @@ "periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug", "periodic-win-vs2019-cuda11.1-py3", "periodic-win-vs2019-cuda11.5-py3", - "win-vs2019-cuda11.3-py3", - "win-vs2019-cuda11.3-py3-smoke" + "win-vs2019-cuda11.3-py3" ], "ciflow/default": [ "linux-binary-conda", @@ -151,7 +149,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single", "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3-smoke", + "win-vs2019-cuda11.3-py3", "windows-binary-libtorch-cxx11-abi", "windows-binary-libtorch-pre-cxx11", "windows-binary-wheel" @@ -283,8 +281,7 @@ "pytorch-linux-xenial-py3-clang5-android-ndk-r19c-gradle-custom-build-single-full-jit", "pytorch-xla-linux-bionic-py3.7-clang8", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3", - "win-vs2019-cuda11.3-py3-smoke" + "win-vs2019-cuda11.3-py3" ], "ciflow/vulkan": [ "linux-vulkan-bionic-py3.7-clang9" @@ -293,8 +290,7 @@ "periodic-win-vs2019-cuda11.1-py3", "periodic-win-vs2019-cuda11.5-py3", "win-vs2019-cpu-py3", - "win-vs2019-cuda11.3-py3", - "win-vs2019-cuda11.3-py3-smoke" + "win-vs2019-cuda11.3-py3" ], "ciflow/xla": [ "pytorch-xla-linux-bionic-py3.7-clang8" diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index da84f89b710..5507da89953 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -174,7 +174,6 @@ class CIWorkflow: test_jobs: Any = field(default_factory=list) enable_default_test: bool = True - enable_smoke_test: bool = True enable_jit_legacy_test: bool = False enable_distributed_test: 
bool = True enable_multigpu_test: bool = False @@ -293,9 +292,6 @@ class CIWorkflow: if self.enable_noarch_test: configs["noarch"] = {"num_shards": 1, "runner": self.test_runner_type} - if self.enable_smoke_test: - configs["smoke_tests"] = {"num_shards": 1, "runner": self.test_runner_type} - for name, config in configs.items(): for shard in range(1, config["num_shards"] + 1): test_jobs.append( @@ -313,7 +309,7 @@ class CIWorkflow: for shard in range(1, self.num_test_shards + 1): test_jobs.append( { - "id": f"test_default_{shard}_{config['num_shards']}", + "id": f"test_default_{shard}_{self.num_test_shards}", "name": f"test (default, {shard}, {self.num_test_shards}, {self.test_runner_type})", "config": "default", "shard": shard, @@ -396,20 +392,6 @@ WINDOWS_WORKFLOWS = [ labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CPU, LABEL_CIFLOW_WIN} ), ), - CIWorkflow( - arch="windows", - build_environment="win-vs2019-cuda11.3-py3-smoke", - cuda_version="11.3", - test_runner_type=WINDOWS_CUDA_TEST_RUNNER, - enable_default_test=False, - enable_smoke_test=True, - enable_force_on_cpu_test=True, - only_on_pr=True, - ciflow_config=CIFlowConfig( - run_on_canary=True, - labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} - ), - ), CIWorkflow( arch="windows", build_environment="win-vs2019-cuda11.3-py3", @@ -419,7 +401,7 @@ WINDOWS_WORKFLOWS = [ enable_force_on_cpu_test=True, ciflow_config=CIFlowConfig( run_on_canary=True, - labels={LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} + labels={LABEL_CIFLOW_DEFAULT, LABEL_CIFLOW_CUDA, LABEL_CIFLOW_WIN} ), ), CIWorkflow( diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index e2631900a36..d8071737bba 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -1753,259 +1753,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) - needs: build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never 
--rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" 
\ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-bionic-cuda10.2-py3.9-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu @@ -2257,7 +2005,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu diff --git a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml index 65880d1b982..2cd74d39d45 100644 --- a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml @@ -498,255 +498,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.2xlarge) - needs: build - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-bionic-py3.7-clang9-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - 
--shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-bionic-py3.7-clang9-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.2xlarge) needs: build runs-on: linux.2xlarge @@ -994,7 +746,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.2xlarge) needs: build runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml index 0922d5a9e62..550593a7d3e 100644 --- a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml +++ b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml @@ -473,231 +473,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.rocm.gpu) - needs: build - runs-on: linux.rocm.gpu - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: Set DOCKER_HOST - run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" - - name: Runner health check system info - if: always() - run: | - cat /etc/os-release || true - cat /etc/apt/sources.list.d/rocm.list || true - cat /opt/rocm/.info/version || true - whoami - - name: Runner health check rocm-smi - if: always() - run: | - rocm-smi - - name: Runner health check rocminfo - if: always() - run: | - rocminfo - - name: Runner health check GPU count - if: always() - run: | - ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx') - if [[ "x$ngpu" != "x2" && "x$ngpu" != "x4" ]]; then - echo "Failed to detect GPUs on the runner" - exit 1 - fi - - name: Runner health check disconnect on failure - if: ${{ failure() }} - run: | - killall runsvc.sh - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: ROCm set GPU_FLAG - run: | - echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip 
- - name: Output disk space left - run: | - df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home - docker exec -t "${container_name}" sh -c "cd .. 
&& cp -R workspace pytorch && cd pytorch && pip install dist/*.whl && ${TEST_COMMAND}" - # copy test results back to the mounted workspace, needed sudo, resulting permissions were correct - docker exec -t "${container_name}" sh -c "cd ../pytorch && sudo cp -R test/test-reports ../workspace/test" - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.rocm.gpu' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: actions/upload-artifact@v2 - name: Store Test Downloaded JSONs on Github - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.rocm.gpu' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: actions/upload-artifact@v2 - name: Store Test Reports on Github - if: always() - with: - name: test-reports - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-bionic-rocm4.5-py3.7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.rocm.gpu) needs: build runs-on: linux.rocm.gpu @@ -921,7 +697,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.rocm.gpu) needs: build runs-on: linux.rocm.gpu diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml index e836ddf691b..da3055d8458 100644 --- a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml @@ -250,254 +250,6 @@ jobs: # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.2xlarge) - needs: build - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ 
needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can 
add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-vulkan-bionic-py3.7-clang9-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af test_default_1_1: name: test (default, 1, 1, linux.2xlarge) needs: build diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml index 8b26d013935..97778b2eec7 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml @@ -501,259 +501,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) - needs: build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - 
${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-cuda11.3-py3.7-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu @@ -1005,7 +753,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml index 0b6fea00e1f..8bbc701439b 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml @@ -250,255 +250,7 @@ jobs: # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.2xlarge) - needs: build - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - 
--shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-asan-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_3: name: test (default, 1, 3, linux.2xlarge) needs: build runs-on: linux.2xlarge @@ -746,7 +498,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_3: name: test (default, 2, 3, linux.2xlarge) needs: build runs-on: linux.2xlarge @@ -994,7 +746,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_3_1: + test_default_3_3: name: test (default, 3, 3, linux.2xlarge) needs: build runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml index 7d49630c027..8cadec7595b 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml @@ -250,255 +250,7 @@ jobs: # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.2xlarge) - needs: build - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - 
--shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-clang7-onnx-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.2xlarge) needs: build runs-on: linux.2xlarge @@ -746,7 +498,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.2xlarge) needs: build runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml index 189b085f8ae..0f50f2423d3 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml @@ -1241,255 +1241,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.2xlarge) - needs: build - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - 
--shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc5.4-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.2xlarge) needs: build runs-on: linux.2xlarge @@ -1737,7 +1489,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.2xlarge) needs: build runs-on: linux.2xlarge diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml index 42507986059..f96cfff958b 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml @@ -497,255 +497,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.2xlarge) - needs: build - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - 
--shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: linux-xenial-py3.7-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.2xlarge) needs: build runs-on: linux.2xlarge @@ -993,7 +745,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.2xlarge) needs: build runs-on: linux.2xlarge diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml index da7c8c0d9ff..97c79478b11 100644 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ b/.github/workflows/generated-macos-11-py3-x86-64.yml @@ -85,121 +85,7 @@ jobs: artifacts.zip - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, macos-11) - needs: build - runs-on: macos-11 - timeout-minutes: 240 - env: - JOB_BASE_NAME: macos-11-py3-x86-64-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: false - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - uses: actions/download-artifact@v2 - name: Download PyTorch Build Artifacts from GHA - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: . - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - python-version: 3.8 - activate-environment: build - - name: Install macOS homebrew dependencies - run: | - # Install dependencies - brew install libomp - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - run: | - python3 -mpip install dist/*.whl - .jenkins/pytorch/macos-test.sh - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-macos-11' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: actions/upload-artifact@v2 - name: Store Test Downloaded JSONs on Github - if: always() - with: - name: test-jsons - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-macos-11' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: actions/upload-artifact@v2 - name: Store Test Reports on Github - if: always() - with: - name: test-reports - retention-days: 14 - if-no-files-found: 
error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: macos-11-py3-x86-64-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_SECRET_ACCESS_KEY }} - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, macos-11) needs: build runs-on: macos-11 @@ -313,7 +199,7 @@ jobs: python3 -m pip install -r requirements.txt python3 -m pip install boto3==1.19.12 python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, macos-11) needs: build runs-on: macos-11 diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml index 46e142b53b8..d74143ce679 100644 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml @@ -496,254 +496,6 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.2xlarge) - needs: build - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - 
--shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: parallelnative-linux-xenial-py3.7-gcc5.4-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af test_default_1_1: name: test (default, 1, 1, linux.2xlarge) needs: build diff --git a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml index a118b22f61a..a4c067f0608 100644 --- a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml @@ -499,259 +499,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) - needs: build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - 
${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-bionic-cuda11.5-py3.7-gcc7-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu @@ -1003,7 +751,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml index c3bee6a3aa2..beed145369e 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml @@ -249,259 +249,7 @@ jobs: # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) - needs: build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 360 minutes - timeout-minutes: 360 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - 
${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu @@ -753,7 +501,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml index 6fe981b2ff1..f269f9ad679 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml @@ -500,259 +500,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.4xlarge.nvidia.gpu) - needs: build - runs-on: linux.4xlarge.nvidia.gpu - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG - run: | - bash .github/scripts/install_nvidia_utils_linux.sh - echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - 
${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.4xlarge.nvidia.gpu' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu @@ -1004,7 +752,7 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, linux.4xlarge.nvidia.gpu) needs: build runs-on: linux.4xlarge.nvidia.gpu diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index a24e8f22446..c95dfdf6ea1 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -289,165 +289,7 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu) - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: smoke_tests - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.8xlarge.nvidia.gpu - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id 
}}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 env: @@ -605,7 +447,7 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 env: diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml index f11536461b6..a0c54966471 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml @@ -439,165 +439,7 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 
1, windows.8xlarge.nvidia.gpu) - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: smoke_tests - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.8xlarge.nvidia.gpu - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install 
junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 env: @@ -755,7 +597,7 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 env: diff --git a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml index ecde072b15d..258bfe614d8 100644 --- a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml +++ b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml @@ -464,252 +464,3 @@ jobs: docker stop $(docker ps -q) || true # Prune all of the docker images docker system prune -af - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, linux.2xlarge) - needs: build - runs-on: linux.2xlarge - timeout-minutes: 240 - env: - DOCKER_IMAGE: ${{ needs.build.outputs.docker_image }} - JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test - TEST_CONFIG: smoke_tests - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - PR_BODY: ${{ github.event.pull_request.body }} - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Log in to ECR - env: - AWS_RETRY_MODE: standard - AWS_MAX_ATTEMPTS: 5 - run: | - AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry aws ecr get-login-password --region 
"$AWS_DEFAULT_REGION" | docker login --username AWS \ - --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" - - name: Chown workspace - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${ALPINE_IMAGE}" - # Ensure the working directory gets chowned back to the current user - docker run --pull=never --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Clean workspace - run: | - rm -rf "${GITHUB_WORKSPACE}" - mkdir "${GITHUB_WORKSPACE}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Preserve github env variables for use in docker - run: | - env | grep '^GITHUB' > "/tmp/github_env_${GITHUB_RUN_ID}" - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Pull Docker image - run: | - retry () { - "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") - } - retry docker pull "${DOCKER_IMAGE}" - - name: Determine shm-size - run: | - shm_size="1g" - case "${BUILD_ENVIRONMENT}" in - *cuda*) - shm_size="2g" - ;; - *rocm*) - shm_size="8g" - ;; - esac - echo "SHM_SIZE=${shm_size}" >> "${GITHUB_ENV}" - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - - name: Unzip artifacts - run: | - unzip -o artifacts.zip - - name: Output disk space left - run: | - sudo df -H - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Test - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - BRANCH: ${{ steps.parse-ref.outputs.branch }} - # Time out the test phase after 240 minutes - timeout-minutes: 240 - run: | - set -x - - if [[ $TEST_CONFIG == 'multigpu' ]]; then - TEST_COMMAND=.jenkins/pytorch/multigpu-test.sh - elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then - TEST_COMMAND=.jenkins/caffe2/test.sh - else - TEST_COMMAND=.jenkins/pytorch/test.sh - fi - PROXY_ENV= - # NOTE: XLA multiprocessing tests appear to have issues with squid proxy, going to disable for now - # We should investigate whether or not there's a list of hostnames we can add to no_proxy to - # make it so that we shouldn't have to fully disable squid for XLA tests - if [[ $TEST_CONFIG != 'xla' ]]; then - # shellcheck disable=SC2089 - PROXY_ENV="-e http_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e https_proxy=http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128 -e no_proxy=localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" - fi - # detached container should get cleaned up by teardown_ec2_linux - # TODO: Stop building test binaries as part of the build phase - # Used for GPU_FLAG since that doesn't play nice - # shellcheck disable=SC2086,SC2090 - container_name=$(docker run \ - ${GPU_FLAG:-} \ - -e BUILD_ENVIRONMENT \ - -e PR_NUMBER \ - -e CUSTOM_TEST_ARTIFACT_BUILD_DIR \ - -e GITHUB_ACTIONS \ - -e IN_CI \ - -e IS_GHA \ - -e BRANCH \ - -e SHA1 \ - -e AWS_DEFAULT_REGION \ - -e 
IN_WHEEL_TEST \ - -e SHARD_NUMBER \ - -e JOB_BASE_NAME \ - -e TEST_CONFIG \ - -e NUM_TEST_SHARDS \ - -e PR_BODY \ - -e PYTORCH_RETRY_TEST_CASES \ - -e PR_LABELS \ - -e MAX_JOBS="$(nproc --ignore=2)" \ - -e SCCACHE_BUCKET \ - -e XLA_CUDA \ - -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ - ${PROXY_ENV} \ - --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ - --security-opt seccomp=unconfined \ - --cap-add=SYS_PTRACE \ - --ipc=host \ - --shm-size="${SHM_SIZE}" \ - --tty \ - --detach \ - --name="${container_name}" \ - --user jenkins \ - -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ - -w /var/lib/jenkins/workspace \ - "${DOCKER_IMAGE}" - ) - docker exec -t "${container_name}" sh -c "sudo chown -R jenkins . && pip install dist/*.whl && ${TEST_COMMAND}" - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test jsons if they exist - rm -f test-jsons-*.zip - zip -r "test-jsons-${FILE_SUFFIX}.zip" test -i '*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-linux.2xlarge' - run: | - # Remove any previous test reports if they exist - rm -f test-reports-*.zip - zip -r "test-reports-${FILE_SUFFIX}.zip" test -i '*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: pytorch-xla-linux-bionic-py3.7-clang8-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Hold runner for 2 hours or until ssh sessions have drained - # Always hold for active ssh sessions - if: always() - run: .github/scripts/wait_for_ssh_to_drain.sh - - name: Chown workspace - if: always() - run: | - # Ensure the working directory gets chowned back to the current user - docker run --rm -v "$(pwd)":/v -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . 
- - name: Kill containers, clean up images - if: always() - run: | - # ignore expansion of "docker ps -q" since it could be empty - # shellcheck disable=SC2046 - docker stop $(docker ps -q) || true - # Prune all of the docker images - docker system prune -af diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index fe37c106670..da25d682bb4 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -274,157 +274,7 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, windows.4xlarge) - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cpu-py3-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: smoke_tests - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.4xlarge - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.4xlarge' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job 
}}-smoke_tests-1-1-windows.4xlarge' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cpu-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, windows.4xlarge) timeout-minutes: 240 env: @@ -574,7 +424,7 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, windows.4xlarge) timeout-minutes: 240 env: diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml deleted file mode 100644 index 64322fc55e2..00000000000 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3-smoke.yml +++ /dev/null @@ -1,598 +0,0 @@ -# @generated DO NOT EDIT MANUALLY -# Template is at: .github/templates/windows_ci_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: win-vs2019-cuda11.3-py3-smoke - -on: - pull_request: - push: - tags: - - 'ciflow/all/*' - - 'ciflow/cuda/*' - - 'ciflow/trunk/*' - - 'ciflow/win/*' - workflow_dispatch: - -env: - BUILD_ENVIRONMENT: win-vs2019-cuda11.3-py3-smoke - BUILD_WHEEL: 1 - MAX_JOBS: 8 - CUDA_VERSION: "11.3" - IN_CI: 1 - IS_GHA: 1 - INSTALL_WINDOWS_SDK: 1 - PYTHON_VERSION: "3.8" - PYTORCH_RETRY_TEST_CASES: 1 - PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} - SCCACHE_BUCKET: "ossci-compiler-cache" - VC_PRODUCT: "BuildTools" - VC_VERSION: "" - VS_VERSION: "16.8.6" - VC_YEAR: "2019" - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - no_proxy: 
localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock - AWS_DEFAULT_REGION: us-east-1 - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TORCH_CUDA_ARCH_LIST: "7.0" - USE_CUDA: 1 - -concurrency: - group: win-vs2019-cuda11.3-py3-smoke-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - build: - runs-on: "windows.4xlarge" - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-build - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - steps: - - name: print labels - run: echo "${PR_LABELS}" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Build - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - BRANCH: ${{ steps.parse-ref.outputs.branch }} - run: | - .jenkins/pytorch/win-build.sh - # Upload to github so that people can click and download artifacts - - name: Upload artifacts to s3 - uses: seemethere/upload-artifact-s3@v3 - with: - retention-days: 14 - if-no-files-found: error - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Cleanup build-results and workspaces - if: always() - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" - rm -rf ./* - test_force_on_cpu_1_1: - name: test (force_on_cpu, 1, 1, windows.4xlarge) - timeout-minutes: 240 - env: - JOB_BASE_NAME: 
win-vs2019-cuda11.3-py3-smoke-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: force_on_cpu - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.4xlarge - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-force_on_cpu-1-1-windows.4xlarge' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 
tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - test_distributed_1_1: - name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: distributed - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.8xlarge.nvidia.gpu - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed 
for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu) - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: smoke_tests - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.8xlarge.nvidia.gpu - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo 
pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: 
always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-smoke-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 98fef6ac396..3d62329c3b8 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -4,6 +4,7 @@ name: win-vs2019-cuda11.3-py3 on: + pull_request: push: tags: - 'ciflow/all/*' @@ -440,165 +441,7 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_smoke_tests_1_1: - name: test (smoke_tests, 1, 1, windows.8xlarge.nvidia.gpu) - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: smoke_tests - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.8xlarge.nvidia.gpu - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ 
env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-smoke_tests-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) - if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* - test_default_1_1: + test_default_1_2: name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 env: @@ -756,7 +599,7 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_default_2_1: + test_default_2_2: name: test (default, 2, 2, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 env: diff --git a/.jenkins/pytorch/win-test-helpers/test_python.bat 
b/.jenkins/pytorch/win-test-helpers/test_python.bat index 2de7ac4c3bc..ac7012f3b30 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python.bat @@ -7,14 +7,6 @@ if not errorlevel 0 ( ) pushd test -if "%RUN_SMOKE_TESTS_ONLY%"=="1" ( - :: Download specified test cases to run - curl --retry 3 -k https://raw.githubusercontent.com/pytorch/test-infra/main/stats/windows_smoke_tests.csv --output .pytorch_specified_test_cases.csv - if ERRORLEVEL 1 exit /b 1 - - python run_test.py --exclude-jit-executor --verbose --run-specified-test-cases -) else ( - python run_test.py --exclude-jit-executor --verbose -) +python run_test.py --exclude-jit-executor --verbose popd if ERRORLEVEL 1 exit /b 1 diff --git a/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat index 181259df7e3..7a454e9dcc8 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_first_shard.bat @@ -23,16 +23,7 @@ echo Copying over test times file copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%TEST_DIR_WIN%" echo Run nn tests - -if "%RUN_SMOKE_TESTS_ONLY%"=="1" ( - :: Download specified test cases to run - curl --retry 3 -k https://raw.githubusercontent.com/pytorch/test-infra/main/stats/windows_smoke_tests.csv --output .pytorch_specified_test_cases.csv - if ERRORLEVEL 1 goto fail - - python run_test.py --exclude-jit-executor --shard 1 2 --verbose --run-specified-test-cases -) else ( - python run_test.py --exclude-jit-executor --shard 1 2 --verbose -) +python run_test.py --exclude-jit-executor --shard 1 2 --verbose if ERRORLEVEL 1 goto fail popd diff --git a/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat b/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat index 56d115f64df..9edb6f0c069 100644 --- a/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat +++ b/.jenkins/pytorch/win-test-helpers/test_python_second_shard.bat @@ -10,16 +10,7 @@ echo Copying over test times file copy /Y "%PYTORCH_FINAL_PACKAGE_DIR_WIN%\.pytorch-test-times.json" "%TEST_DIR_WIN%" pushd test - -if "%RUN_SMOKE_TESTS_ONLY%"=="1" ( - :: Download specified test cases to run - curl --retry 3 -k https://raw.githubusercontent.com/pytorch/test-infra/main/stats/windows_smoke_tests.csv --output .pytorch_specified_test_cases.csv - if ERRORLEVEL 1 exit /b 1 - - python run_test.py --exclude-jit-executor --shard 2 2 --verbose --run-specified-test-cases -) else ( - python run_test.py --exclude-jit-executor --shard 2 2 --verbose -) +python run_test.py --exclude-jit-executor --shard 2 2 --verbose popd diff --git a/.jenkins/pytorch/win-test.sh b/.jenkins/pytorch/win-test.sh index 51c5700db0b..4b8b5db52a1 100755 --- a/.jenkins/pytorch/win-test.sh +++ b/.jenkins/pytorch/win-test.sh @@ -49,8 +49,6 @@ fi if [[ "$TEST_CONFIG" = "force_on_cpu" ]]; then # run the full test suite for force_on_cpu test export USE_CUDA=0 -elif [[ "$TEST_CONFIG" == "smoke_tests" ]]; then - export RUN_SMOKE_TESTS_ONLY=1 fi run_tests() { @@ -64,30 +62,20 @@ run_tests() { if [[ ( -z "${JOB_BASE_NAME}" || "${JOB_BASE_NAME}" == *-test ) && $NUM_TEST_SHARDS -eq 1 ]]; then "$SCRIPT_HELPERS_DIR"/test_python.bat - - if [[ -z ${RUN_SMOKE_TESTS_ONLY} ]]; then - "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat - "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat - "$SCRIPT_HELPERS_DIR"/test_libtorch.bat - fi + "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat + 
"$SCRIPT_HELPERS_DIR"/test_custom_backend.bat + "$SCRIPT_HELPERS_DIR"/test_libtorch.bat else if [[ "${JOB_BASE_NAME}" == *-test1 || ("${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1) ]]; then "$SCRIPT_HELPERS_DIR"/test_python_first_shard.bat - - if [[ -z ${RUN_SMOKE_TESTS_ONLY} ]]; then - "$SCRIPT_HELPERS_DIR"/test_libtorch.bat - if [[ "${USE_CUDA}" == "1" ]]; then - "$SCRIPT_HELPERS_DIR"/test_python_jit_legacy.bat - fi + "$SCRIPT_HELPERS_DIR"/test_libtorch.bat + if [[ "${USE_CUDA}" == "1" ]]; then + "$SCRIPT_HELPERS_DIR"/test_python_jit_legacy.bat fi - elif [[ "${JOB_BASE_NAME}" == *-test2 || ("${SHARD_NUMBER}" == 2 && $NUM_TEST_SHARDS -gt 1) ]]; then "$SCRIPT_HELPERS_DIR"/test_python_second_shard.bat - - if [[ -z ${RUN_SMOKE_TESTS_ONLY} ]]; then - "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat - "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat - fi + "$SCRIPT_HELPERS_DIR"/test_custom_backend.bat + "$SCRIPT_HELPERS_DIR"/test_custom_script_ops.bat fi fi } From b0c3b36943b05026ed16ece315e0e039bcfffb7c Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Thu, 17 Feb 2022 23:39:27 +0000 Subject: [PATCH 146/199] Don't print so much in Display and Upload Test Stats Following some discussion, it seems that the value of these logs do not outweigh the confusion it sometimes brings when people look at the logs to debug test failures. Pull Request resolved: https://github.com/pytorch/pytorch/pull/73038 --- .github/templates/common.yml.j2 | 4 +-- ...rated-linux-bionic-cuda10.2-py3.9-gcc7.yml | 32 +++++-------------- .../generated-linux-bionic-py3.7-clang9.yml | 12 ++----- .../generated-linux-bionic-rocm4.5-py3.7.yml | 12 ++----- ...rated-linux-vulkan-bionic-py3.7-clang9.yml | 4 +-- ...-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml | 4 +-- ...rated-linux-xenial-cuda11.3-py3.7-gcc7.yml | 12 ++----- ...nerated-linux-xenial-py3.7-clang7-asan.yml | 12 ++----- ...nerated-linux-xenial-py3.7-clang7-onnx.yml | 8 ++--- .../generated-linux-xenial-py3.7-gcc5.4.yml | 24 ++++---------- .../generated-linux-xenial-py3.7-gcc7.yml | 12 ++----- .../generated-macos-11-py3-x86-64.yml | 8 ++--- ...rallelnative-linux-xenial-py3.7-gcc5.4.yml | 8 ++--- ...iodic-linux-bionic-cuda11.5-py3.7-gcc7.yml | 12 ++----- ...enial-cuda10.2-py3-gcc7-slow-gradcheck.yml | 8 ++--- ...linux-xenial-cuda11.1-py3.7-gcc7-debug.yml | 12 ++----- ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 12 ++----- ...rated-periodic-win-vs2019-cuda11.5-py3.yml | 16 +++------- ...-pytorch-xla-linux-bionic-py3.7-clang8.yml | 4 +-- .../generated-win-vs2019-cpu-py3.yml | 12 ++----- .../generated-win-vs2019-cuda11.3-py3.yml | 16 +++------- tools/stats/print_test_stats.py | 31 ------------------ 22 files changed, 61 insertions(+), 214 deletions(-) diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 855917e0742..182a93de990 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -44,13 +44,11 @@ concurrency: {%- endmacro -%} {%- macro upload_test_statistics(build_environment, when="always()", pytorch_directory="", needs_credentials=False) -%} - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics {%- if pytorch_directory %} working-directory: !{{ pytorch_directory }} {%- endif %} if: !{{ when }} - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml 
b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml index d8071737bba..97a740f9931 100644 --- a/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml +++ b/.github/workflows/generated-linux-bionic-cuda10.2-py3.9-gcc7.yml @@ -467,10 +467,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -719,10 +717,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -967,10 +963,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -1215,10 +1209,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -1467,10 +1459,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -1719,10 +1709,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -1971,10 +1959,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -2223,10 +2209,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml index 2cd74d39d45..cf826fac4d7 100644 --- 
a/.github/workflows/generated-linux-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-bionic-py3.7-clang9.yml @@ -464,10 +464,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -712,10 +710,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -960,10 +956,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml index 550593a7d3e..08cf9d18056 100644 --- a/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml +++ b/.github/workflows/generated-linux-bionic-rocm4.5-py3.7.yml @@ -448,10 +448,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -672,10 +670,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -896,10 +892,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml index da3055d8458..b54cd493228 100644 --- a/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml +++ b/.github/workflows/generated-linux-vulkan-bionic-py3.7-clang9.yml @@ -464,10 +464,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml 
b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml index 214a69a8984..4bd83452f55 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7-bazel-test.yml @@ -301,10 +301,8 @@ jobs: if-no-files-found: warn path: test-jsons-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml index 97778b2eec7..6738daa49d8 100644 --- a/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-cuda11.3-py3.7-gcc7.yml @@ -467,10 +467,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -719,10 +717,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -971,10 +967,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml index 8bbc701439b..7a998e0c022 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-asan.yml @@ -464,10 +464,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -712,10 +710,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -960,10 +956,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: 
AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml index 8cadec7595b..acc73f8cdfb 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-clang7-onnx.yml @@ -464,10 +464,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -712,10 +710,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml index 0f50f2423d3..57687738dd7 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc5.4.yml @@ -463,10 +463,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -711,10 +709,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -959,10 +955,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -1207,10 +1201,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -1455,10 +1447,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -1703,10 +1693,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test 
statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml index f96cfff958b..e2b69cac083 100644 --- a/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml +++ b/.github/workflows/generated-linux-xenial-py3.7-gcc7.yml @@ -463,10 +463,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -711,10 +709,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -959,10 +955,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-macos-11-py3-x86-64.yml b/.github/workflows/generated-macos-11-py3-x86-64.yml index 97c79478b11..4a1456224b9 100644 --- a/.github/workflows/generated-macos-11-py3-x86-64.yml +++ b/.github/workflows/generated-macos-11-py3-x86-64.yml @@ -180,10 +180,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -294,10 +292,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml index d74143ce679..44ecd551db2 100644 --- a/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml +++ b/.github/workflows/generated-parallelnative-linux-xenial-py3.7-gcc5.4.yml @@ -462,10 +462,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -710,10 +708,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: 
Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml index a4c067f0608..f90c40c50eb 100644 --- a/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml +++ b/.github/workflows/generated-periodic-linux-bionic-cuda11.5-py3.7-gcc7.yml @@ -465,10 +465,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -717,10 +715,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -969,10 +965,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml index beed145369e..f7250714ea5 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck.yml @@ -467,10 +467,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -719,10 +717,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml index f269f9ad679..cd573485002 100644 --- a/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml +++ b/.github/workflows/generated-periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug.yml @@ -466,10 +466,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics 
if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -718,10 +716,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -970,10 +966,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index c95dfdf6ea1..37957f0b8a0 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -266,10 +266,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -424,10 +422,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -582,10 +578,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml index a0c54966471..f6171a5776e 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml @@ -258,10 +258,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -416,10 +414,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively 
support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -574,10 +570,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -732,10 +726,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml index 258bfe614d8..b87c18445c8 100644 --- a/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml +++ b/.github/workflows/generated-pytorch-xla-linux-bionic-py3.7-clang8.yml @@ -430,10 +430,8 @@ jobs: if-no-files-found: error path: test-reports-*.zip - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index da25d682bb4..820a77ddcc4 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -251,10 +251,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -401,10 +399,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -551,10 +547,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 3d62329c3b8..3dbbc4cf0a6 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -260,10 +260,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + 
- name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -418,10 +416,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -576,10 +572,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} @@ -734,10 +728,8 @@ jobs: - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py - - name: Display and upload test statistics (Click Me) + - name: Upload test statistics if: always() - # temporary hack: set CIRCLE_* vars, until we update - # tools/stats/print_test_stats.py to natively support GitHub Actions env: AWS_DEFAULT_REGION: us-east-1 BRANCH: ${{ steps.parse-ref.outputs.branch }} diff --git a/tools/stats/print_test_stats.py b/tools/stats/print_test_stats.py index 0555945e478..990e6fcab14 100755 --- a/tools/stats/print_test_stats.py +++ b/tools/stats/print_test_stats.py @@ -637,23 +637,6 @@ class TestSuite: self.test_cases[name].unexpected_success |= test_case.unexpected_success self.test_cases[name].expected_failure |= test_case.expected_failure - def print_report(self, num_longest: int = 3) -> None: - sorted_tests = sorted(self.test_cases.values(), key=lambda x: x.time) - test_count = len(sorted_tests) - print(f"class {self.name}:") - print( - f" tests: {test_count} failed: {self.failed_count} skipped: {self.skipped_count} " - f"errored: {self.errored_count} unexpected_success: {self.unexpected_success_count} " - f"expected_failure: {self.expected_failure_count}") - print(f" run_time: {self.total_time:.2f} seconds") - print(f" avg_time: {self.total_time/test_count:.2f} seconds") - if test_count >= 2: - print(f" median_time: {statistics.median(x.time for x in sorted_tests):.2f} seconds") - sorted_tests = sorted_tests[-num_longest:] - print(f" {len(sorted_tests)} longest tests:") - for test in reversed(sorted_tests): - print(f" {test.name} time: {test.time:.2f} seconds") - print("") DuplicatedDict = Dict[str, Dict[str, List[TestCase]]] @@ -1092,16 +1075,10 @@ if __name__ == '__main__': except Exception as e: print(f"ERROR ENCOUNTERED WHEN UPLOADING TO SCRIBE: {e}") - # longest_tests can contain duplicates as the same tests can be spawned from different files - longest_tests: List[TestCase] = [] total_time = 0.0 for filename, test_filename in reports_by_file.items(): for suite_name, test_suite in test_filename.test_suites.items(): total_time += test_suite.total_time - if test_suite.total_time >= args.class_print_threshold: - test_suite.print_report(args.longest_of_class) - longest_tests.extend(test_suite.test_cases.values()) - longest_tests = sorted(longest_tests, key=lambda x: x.time)[-args.longest_of_run:] obj = assemble_s3_object(reports_by_file, total_seconds=total_time) @@ -1111,14 +1088,6 @@ if __name__ == '__main__': except 
Exception as e: print(f"ERROR ENCOUNTERED WHEN UPLOADING TO S3: {e}") - print(f"Total runtime is {datetime.timedelta(seconds=total_time)}") - print( - f"{len(longest_tests)} longest tests of entire run" - f" (ignoring suites totaling less than {args.class_print_threshold} seconds):" - ) - for test_case in reversed(longest_tests): - print(f" {test_case.class_name}.{test_case.name} time: {test_case.time:.2f} seconds") - if args.compare_with_s3: head_json = obj if args.use_json: From 39fb7714231913a2f616891e0e7e6d82b83ac0c7 Mon Sep 17 00:00:00 2001 From: Don Jang Date: Thu, 17 Feb 2022 15:53:48 -0800 Subject: [PATCH 147/199] [Static Runtime] Report static op statistics from graph when input size is zero (#73032) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73032 Currently, ptvsc2_predictor_bench reports nothing when the input size is zero. However, Static Runtime's module creation has some useful information even after loading a model. This change reports static op statistics when the given input's size is zero. In addition to that, this enables it to report the out variant coverage percentage, which is crucial to establish the baseline performance of Static Runtime. Test Plan: - Ran `ptvsc2_predictor_bench` with this change as seen above. Reviewed By: mikeiovine Differential Revision: D34294803 fbshipit-source-id: 80c02199075dae9280657d6edecc7c679c1c27f4 (cherry picked from commit 83aec141a25a9ede5d22e5c17c0b6b07307faf39) --- torch/csrc/jit/runtime/static/impl.cpp | 68 +++++++++++++++++++------- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 595e428e535..9c06bb8c02c 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -1260,9 +1260,6 @@ void BlockRunner::benchmark( TORCH_CHECK( kwargs_list.size() == 0 || args_list.size() == kwargs_list.size()); std::cout << "Input size: " << args_list.size() << std::endl; - if (args_list.size() == 0) { - return; - } float time_per_iter = benchmark_model(args_list, kwargs_list, warmup_runs, main_runs); std::cout << "Static runtime ms per iter: " << time_per_iter @@ -1282,11 +1279,20 @@ void BlockRunner::benchmark( std::vector> time_per_node_type_vec{ results.time_per_node_type.begin(), results.time_per_node_type.end()}; - std::sort( - time_per_node_type_vec.begin(), - time_per_node_type_vec.end(), - [](auto& left, auto& right) { return left.second > right.second; }); - + if (args_list.size() == 0) { + std::sort( + time_per_node_type_vec.begin(), + time_per_node_type_vec.end(), + [&results](auto& left, auto& right) { + return results.instances_per_node_type[left.first] > + results.instances_per_node_type[right.first]; + }); + } else { + std::sort( + time_per_node_type_vec.begin(), + time_per_node_type_vec.end(), + [](auto& left, auto& right) { return left.second > right.second; }); + } std::cout << "Time per node type:" << std::endl; for (const auto& p : time_per_node_type_vec) { const std::string& kind = p.first; @@ -1341,13 +1347,14 @@ void BlockRunner::benchmark( std::cout << "Total number of reused tensors: " << planner_->total_reused_tensors() << std::endl; } - std::cout << "Total number of 'out' variant nodes/total number of nodes: " - << results.out_nodes_count << "/" << results.total_nodes_count - << " (" - << 100.0 * (results.out_nodes_count) / - static_cast(results.total_nodes_count) - << "%)" << std::endl; } + std::cout << "Total number of 'out' variant nodes/total number of 
nodes: " + << results.out_nodes_count << "/" << results.total_nodes_count + << " (" + << 100.0 * (results.out_nodes_count) / + static_cast(results.total_nodes_count) + << "%)" << std::endl; + check_for_memory_leak(); #ifndef NDEBUG @@ -1468,8 +1475,36 @@ BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops( TORCH_CHECK( kwargs_list.size() == 0 || args_list.size() == kwargs_list.size()); TORCH_CHECK(warmup_runs >= 1 && main_runs >= 1); + + IndividualMetrics results; + results.time_per_node.resize(nodes_.size(), 0); if (args_list.size() == 0) { - return {}; + // When the given input is empty, compute the op statistics from the given + // graph without executing it. + for (const auto i : c10::irange(nodes_.size())) { + const Node* node = nodes_[i].node(); + std::string kind(node->kind().toQualString()); + // TODO: Collect op statistics from sub-blocks here. + results.time_per_node[i] = 0; + results.time_per_node_type[kind] = 0; + results.instances_per_node_type[kind]++; + if (nodes_[i].has_out_variant()) { + results.out_nodes.insert(kind); + results.out_nodes_count++; + } else if (nodes_[i].has_native()) { + results.native_nodes.insert(kind); + } + results.total_time += results.time_per_node[i]; + } + results.total_nodes_count = nodes_.size(); + results.memory_alloc_time = 0; + results.memory_dealloc_time = 0; + results.output_dealloc_time = 0; + for (const auto& p : results.time_per_node_type) { + const std::string& kind = p.first; + results.percent_per_node_type[kind] = 0; + } + return results; } const bool is_kwargs_empty = kwargs_list.size() == 0; @@ -1479,9 +1514,6 @@ BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops( // explanation. c10::InferenceMode mode; - IndividualMetrics results; - results.time_per_node.resize(nodes_.size(), 0); - // setup time caffe2::Timer timer; From 056b6260f7af87903d807907b87edde28fa7afad Mon Sep 17 00:00:00 2001 From: BowenBao Date: Fri, 18 Feb 2022 00:19:12 +0000 Subject: [PATCH 148/199] [ONNX] Mergerule: add onnx pass registration file Forgot about this one in #72297. This folder contains files that registers ONNX bindings. Pull Request resolved: https://github.com/pytorch/pytorch/pull/72963 --- .github/merge_rules.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/merge_rules.json b/.github/merge_rules.json index eee24234f5e..81270b6e5ff 100644 --- a/.github/merge_rules.json +++ b/.github/merge_rules.json @@ -9,7 +9,8 @@ "docs/source/onnx.rst", "torch/csrc/jit/serialization/export.*", "torch/csrc/jit/serialization/onnx.*", - "torch/_C/__init__.pyi.in" + "torch/_C/__init__.pyi.in", + "torch/csrc/onnx/**" ], "approved_by": ["BowenBao", "garymm"], "mandatory_app_id": 12274 From 69389fb5423832272a36d3ef4bd2a39d10489507 Mon Sep 17 00:00:00 2001 From: Will Constable Date: Thu, 17 Feb 2022 19:44:33 -0800 Subject: [PATCH 149/199] Sync lazy_tensor_staging back to master (#72875) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72875 This diff contains changes from several PRs landed to lazy_tensor_staging branch. * generating 'fallback' overrides for each codegenned op, useful for debugging * supports operators which are missing aten:: symbols for op names, instead using their string counterpart * makes the IR class a base class instead of hardcoding the assumption of TS It also resolves lint issues and in particular cleans up the following: * {Type}s shouldn't be passed into isValueType, and using the catch-all base class of CType is nicer than specifying a list of types. 
Fixes #72852 Test Plan: test manually on lazy_tensor_staging branch Reviewed By: shunting314 Differential Revision: D34250357 fbshipit-source-id: aa7d589f605055d5d02bc77c77fa6f1182ff7497 (cherry picked from commit 2f8f5e49719027a309a9cddbc20a1bfb920276a8) --- test/cpp/lazy/test_cache.cpp | 2 +- test/cpp/lazy/test_ir.cpp | 2 +- test/cpp/lazy/test_ir_util.cpp | 2 +- tools/codegen/api/lazy.py | 53 +++++-- tools/codegen/dest/lazy_ir.py | 147 +++++++++++++------ tools/codegen/dest/lazy_ts_lowering.py | 8 +- tools/codegen/gen_lazy_tensor.py | 22 +-- torch/csrc/lazy/core/config.cpp | 5 + torch/csrc/lazy/core/config.h | 1 + torch/csrc/lazy/core/ir.cpp | 27 +++- torch/csrc/lazy/core/ir.h | 41 +++++- torch/csrc/lazy/core/lazy_graph_executor.cpp | 10 +- torch/csrc/lazy/core/shape.cpp | 8 +- torch/csrc/lazy/core/shape.h | 2 +- torch/csrc/lazy/ts_backend/ts_node.cpp | 18 ++- torch/csrc/lazy/ts_backend/ts_node.h | 2 +- 16 files changed, 248 insertions(+), 102 deletions(-) diff --git a/test/cpp/lazy/test_cache.cpp b/test/cpp/lazy/test_cache.cpp index 033b6c21b1e..a6da9bccbd2 100644 --- a/test/cpp/lazy/test_cache.cpp +++ b/test/cpp/lazy/test_cache.cpp @@ -11,7 +11,7 @@ namespace lazy { class CacheNode : public Node { public: explicit CacheNode(const std::string& str) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(str)), + : Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool /*bakeInSizes*/) -> hash_t { return Hash(str); }), str_(str) {} ~CacheNode() override = default; diff --git a/test/cpp/lazy/test_ir.cpp b/test/cpp/lazy/test_ir.cpp index 78b94618c7f..326f7a9092c 100644 --- a/test/cpp/lazy/test_ir.cpp +++ b/test/cpp/lazy/test_ir.cpp @@ -12,7 +12,7 @@ namespace lazy { class TestLeafNode : public Node { public: explicit TestLeafNode(size_t param) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(param)), + : Node(OpKind(), /* num_outputs */ 1, /* hash_func */[&](bool /*bakeInSizes*/) -> hash_t { return Hash(param); }), param_(param) {} ~TestLeafNode() override = default; diff --git a/test/cpp/lazy/test_ir_util.cpp b/test/cpp/lazy/test_ir_util.cpp index 5c216258f9a..bb29cff6f6b 100644 --- a/test/cpp/lazy/test_ir_util.cpp +++ b/test/cpp/lazy/test_ir_util.cpp @@ -12,7 +12,7 @@ namespace lazy { class IrUtilNode : public Node { public: explicit IrUtilNode() - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(0)) {} + : Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool /*bakeInSizes*/) -> hash_t { return Hash(0); }) {} ~IrUtilNode() override = default; void AddOperand(Value v) { diff --git a/tools/codegen/api/lazy.py b/tools/codegen/api/lazy.py index 3fe83936eef..ebbc72eb1fc 100644 --- a/tools/codegen/api/lazy.py +++ b/tools/codegen/api/lazy.py @@ -1,12 +1,11 @@ from typing import List, Union, Tuple from tools.codegen.model import (Type, BaseTy, BaseType, OptionalType, ListType, OperatorName, FunctionSchema, - Return) -from tools.codegen.api.types import (BaseCppType, BaseCType, OptionalCType, - ConstRefCType, NamedCType, - MutRefCType, + Return, TensorOptionsArguments) +from tools.codegen.api.types import (CType, BaseCppType, BaseCType, OptionalCType, + NamedCType, deviceT, layoutT, VectorCType, boolT, longT, doubleT, ListCType, stringT, - scalarT, scalarTypeT, ArrayRefCType, ArrayCType, TupleCType) + scalarT, scalarTypeT) valueT = BaseCppType('torch::lazy', 'Value') @@ -33,7 +32,9 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L if typ.name == BaseTy.Tensor: return BaseCType(valueT) elif typ.name == BaseTy.Scalar: - 
return BaseCType(scalarT) + # at::scalar has special handling, + # and is wrapped in an IR value just like at::tensor + return BaseCType(valueT) elif typ.name == BaseTy.ScalarType: return BaseCType(scalarTypeT) elif typ.name == BaseTy.int: @@ -44,6 +45,10 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L return BaseCType(doubleT) elif typ.name == BaseTy.str: return BaseCType(stringT) + elif typ.name == BaseTy.Device: + return BaseCType(deviceT) + elif typ.name == BaseTy.Layout: + return BaseCType(layoutT) else: raise AssertionError(f"TODO add support for type {repr(typ)}") elif isinstance(typ, OptionalType): @@ -58,19 +63,36 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L raise AssertionError(f"unrecognized type {repr(typ)}") -def isValueType(typ: Union[Type, BaseCType, OptionalCType, ConstRefCType, MutRefCType, - ListCType, ArrayRefCType, ArrayCType, VectorCType, TupleCType]) -> bool: +def isValueType(typ: CType) -> bool: """ Given a type, determine if it is a Value-like type. This is equivalent to being Tensor-like, but assumes the type has already been transformed. """ if isinstance(typ, BaseCType): - return typ.type == valueT + # I am regretting my naming conventions, but now we are wrapping at::scalar in + # lazy value, while preserving other 'scalar' types as scalars in the IR + return typ.type == valueT or typ.type == scalarT elif isinstance(typ, (OptionalCType, ListCType, VectorCType)): return isValueType(typ.elem) else: return False +def isWrappedScalarType(typ: Type) -> bool: + """ + Given a type, determine if it is a c10::scalar which we will wrap in a lazy Value. + Since we literally change the type from scalarT to valueT, information is lost. + This function helps build a list of wrapped scalars to save that information + """ + if isinstance(typ, BaseType): + # I am regretting my naming conventions, but now we are wrapping at::scalar in + # lazy value, while preserving other 'scalar' types as scalars in the IR + return typ.name == BaseTy.Scalar + elif isinstance(typ, (OptionalType, ListType)): + return isWrappedScalarType(typ.elem) + else: + return False + + # Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node. # Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML), # but carries type information from a native FunctionSchema modified for use with IR nodes, @@ -87,6 +109,8 @@ class LazyIrSchema: # TODO: Need to handle collisions with argument names at some point returns: Tuple['Return', ...] 
+ wrapped_scalar_names: List[str] + def __init__(self, func: FunctionSchema): positional_arg_types = [] @@ -108,14 +132,15 @@ class LazyIrSchema: "tensor_options", "post_tensor_options_kwarg_only", "out"]: - if getattr(func.arguments, arg_field) is not None: - keyword_arg_types.extend([ - NamedCType( - arg.name, - process_ir_type(arg.type)) for arg in getattr(func.arguments, arg_field)]) + curr_args = getattr(func.arguments, arg_field) + if curr_args is not None: + if isinstance(curr_args, TensorOptionsArguments): + curr_args = curr_args.all() + keyword_arg_types.extend([NamedCType(arg.name, process_ir_type(arg.type)) for arg in curr_args]) self.keyword_arg_types = tuple(keyword_arg_types) self.name = func.name self.returns = func.returns + self.wrapped_scalar_names = [arg.name for arg in func.schema_order_arguments() if isWrappedScalarType(arg.type)] @property def node_name(self) -> str: diff --git a/tools/codegen/dest/lazy_ir.py b/tools/codegen/dest/lazy_ir.py index d41b4edcd8a..58fc6862900 100644 --- a/tools/codegen/dest/lazy_ir.py +++ b/tools/codegen/dest/lazy_ir.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from typing import List, Union from dataclasses import dataclass from tools.codegen.context import method_with_native_function @@ -9,17 +10,23 @@ import tools.codegen.api.dispatcher as dispatcher from tools.codegen.api.lazy import LazyIrSchema, isValueType from tools.codegen.dest.lazy_ts_lowering import ts_lowering_body - -def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: +def node_ctor_arg_rvalue_string(arg: NamedCType, schema: LazyIrSchema) -> str: """ Given a NamedCType from a lazy IR schema, generate a c++ string for materializing an rvalue of that arg for passing into a lazy Node constructor. """ + if isValueType(arg.type): if isinstance(arg.type, BaseCType): + if arg.name in schema.wrapped_scalar_names: + return f"torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen({arg.name})" return f"lazy_{arg.name}.GetIrValue()" elif isinstance(arg.type, OptionalCType): + if arg.name in schema.wrapped_scalar_names: + return f"{arg.name} ? " \ + f"c10::make_optional(torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen(*{arg.name})) : " \ + "c10::nullopt" return f"lazy_{arg.name} ? " \ f"c10::make_optional(lazy_{arg.name}.GetIrValue()) : " \ "c10::nullopt" @@ -35,24 +42,55 @@ def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: else: return f"{arg.name}" -def node_ctor_inputs(func: LazyIrSchema) -> str: +def node_ctor_inputs(schema: LazyIrSchema) -> str: """ Produce a formatted string with the arguments as passed into the constructor of a node class. 
""" - node_ctor_values = [node_ctor_arg_rvalue_string(arg) for arg in func.filtered_types()] + node_ctor_values = [node_ctor_arg_rvalue_string(arg, schema) for arg in schema.filtered_types()] return ",\n ".join(node_ctor_values) +def gen_fallback_code(schema: LazyIrSchema, overload_name: str) -> str: + """ + Generate code that falls back to eager conditioned on a predicate + """ + fallback_args = ",\n ".join([str(arg.name) for arg in schema.filtered_types()]) + if len(overload_name): + aten_op_str = f"ATEN_OP2({schema.aten_name}, {overload_name})" + else: + aten_op_str = f"ATEN_OP({schema.aten_name})" + return f""" + if (force_eager_fallback({aten_symbol(schema)})) {{ + return at::native::call_fallback_fn<<c_eager_fallback, {aten_op_str}>::call( + {fallback_args} + ); + }} +""" + +def aten_symbol(schema: LazyIrSchema) -> str: + missing_interned_strings = { + 'sigmoid_backward', + } + if schema.aten_name in missing_interned_strings: + return f'c10::Symbol::fromQualString("aten::{schema.aten_name}")' + return f'at::aten::{schema.aten_name}' @dataclass(frozen=True) -class LazyIR: +class LazyIR(ABC): backend_index: BackendIndex node_base: str + lowering_function_type: str = "" + lowering_context_type: str = "" + lowering_return_type: str = "" @method_with_native_function def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func return self.gen(f) + @abstractmethod + def lowering_body(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> str: + pass + def gen(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: # for now, we just want one IR class decl and soon after also the method defs # and we use the functional version not out/inplace. @@ -63,9 +101,9 @@ class LazyIR: scalar_types = schema.filtered_types(values=False, scalars=True) node_ctor_args = ", ".join([f"const {i.cpp_type()}& {i.name}" for i in all_types]) - scalar_initializers = ",\n ".join([f"{t.name}_({t.name})" for t in scalar_types]) + scalar_initializers = ",\n ".join([f"{t.name}({t.name})" for t in scalar_types]) comma_if_scalar_initializers = ",\n" if len(scalar_initializers) else "" - scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name}_;" for t in scalar_types]) + scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name};" for t in scalar_types]) scalar_hashes = ", ".join([f"{f.name}" for f in scalar_types]) base_ctor_value_args_list = [] optional_values = [] @@ -83,21 +121,20 @@ class LazyIR: members_to_string = [] for t in scalar_types: if isinstance(t.type, OptionalCType): - members_to_string.append(f"""if ({t.name}_.has_value()) {{ - ss << ", {t.name}=" << {t.name}_.value(); + members_to_string.append(f"""if ({t.name}.has_value()) {{ + ss << ", {t.name}=" << {t.name}.value(); }} else {{ ss << ", {t.name}=null"; }}""") else: - members_to_string.append(f'ss << ", {t.name}=" << {t.name}_;') + members_to_string.append(f'ss << ", {t.name}=" << {t.name};') members_to_string_str = "\n ".join(members_to_string) return [f"""\ -// TODO(alanwaketan): Public members don't need to have _ suffix. 
class {schema.node_name} : public {self.node_base} {{ public: {schema.node_name}({node_ctor_args}, std::vector&& shapes) - : {self.node_base}(torch::lazy::OpKind(at::aten::{schema.aten_name}), + : {self.node_base}(torch::lazy::OpKind({aten_symbol(schema)}), {{{base_ctor_value_args}}}, std::move(shapes), /* num_outputs */ {len(func.returns)}, torch::lazy::MHash({scalar_hashes})){comma_if_scalar_initializers} @@ -109,14 +146,14 @@ class {schema.node_name} : public {self.node_base} {{ std::string ToString() const override {{ std::stringstream ss; - ss << TsNode::ToString(); + ss << {self.node_base}::ToString(); {members_to_string_str} return ss.str(); }} - torch::lazy::TSOpVector Lower(std::shared_ptr function, - torch::lazy::TSLoweringContext* loctx) const override {{ - {ts_lowering_body(f)} + {self.lowering_return_type} Lower({self.lowering_function_type} function, + {self.lowering_context_type} loctx) const override {{ + {self.lowering_body(f)} }} {scalar_decls} @@ -127,21 +164,34 @@ class {schema.node_name} : public {self.node_base} {{ """, ] -def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str) -> str: +@dataclass(frozen=True) +class TSLazyIR(LazyIR): + lowering_function_type: str = "std::shared_ptr" + lowering_context_type: str = "torch::lazy::TSLoweringContext*" + lowering_return_type: str = "torch::lazy::TSOpVector" + + def lowering_body(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> str: + return ts_lowering_body(f) + + +def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str, schema: LazyIrSchema) -> str: lazy_tensor_decls: List[str] = [] for t in value_types: + if t.name in schema.wrapped_scalar_names: + # no lazy tensor wrapper for scalars that are promoted to IR values + continue if isinstance(t.type, BaseCType): lazy_tensor_decls.append( f"{tensor_class} lazy_{t.name} = " - f"GetLtcTensorOrCreateForWrappedNumber({t.name}, *device);") + f"torch::lazy::GetLtcTensorOrCreateForWrappedNumber({t.name}, *common_device);") elif isinstance(t.type, OptionalCType): # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it # until we encounter a real world example. 
lazy_tensor_decls.append( - f" {tensor_class} lazy_{t.name} = TryGetLtcTensor({t.name}.value_or(at::Tensor()));") + f" {tensor_class} lazy_{t.name} = torch::lazy::TryGetLtcTensor({t.name}.value_or(at::Tensor()));") else: raise AssertionError("TODO not sure if there are other valid types to handle here") - return "\n ".join(lazy_tensor_decls) + return ("\n ").join(lazy_tensor_decls) @dataclass(frozen=True) class GenLazyNativeFuncDefinition: @@ -152,17 +202,22 @@ class GenLazyNativeFuncDefinition: @method_with_native_function def __call__(self, func: NativeFunction) -> List[str]: sig = kernel_signature(func, self.backend_index) - - # Lazy IR stuff + metadata = self.backend_index.get_kernel(func) + assert metadata is not None schema = LazyIrSchema(func.func) all_types = schema.filtered_types() value_types = schema.filtered_types(values=True, scalars=False) scalar_types = schema.filtered_types(values=False, scalars=True) returns_length = len(schema.returns) - value_types_names = ", ".join([f"{t.name}" for t in value_types]) - get_device_str = f"""auto device = bridge::GetBackendDevice({value_types_names});""" - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) + fallback_str = gen_fallback_code(schema, overload_name=func.func.name.overload_name) + value_types_names = [f"{t.name}" for t in value_types if t.name not in schema.wrapped_scalar_names] + assert len(value_types_names) > 0, "Code below assumes there is at least one tensor arg" + get_device_str = f"""auto common_device = torch::lazy::GetBackendDevice({', '.join(value_types_names)}); + TORCH_INTERNAL_ASSERT(common_device); + """ + + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) node_ctor_input_str = node_ctor_inputs(schema) # call the meta kernel if it exists, to compute output shape/dtype for our IR @@ -174,37 +229,40 @@ class GenLazyNativeFuncDefinition: shapes_str = ','.join([this_shape(i) for i in range(returns_length)]) meta_out = "std::vector shapes{" + shapes_str + "};" + # TODO: INTEGRATION POINT HERE: meta_str = f"""auto out_meta = at::meta::{schema.aten_name}({', '.join(str(t.name) for t in all_types)}); {meta_out}""" else: - shape_sig = ComputeShapeSignature(func) + shape_sig = ComputeShapeSignature(metadata.kernel, func) meta_str = f""" auto shapes = {shape_sig.shape_call};""" + meta_str += f""" TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});""" node_str = f"""auto node = torch::lazy::MakeNode({node_ctor_input_str}, std::move(shapes));""" + first_tensor_name = value_types_names[0] + bridge_str = """auto result = torch::lazy::CreateAtenFromLtcTensor( + torch::lazy::LazyTensor::Create(std::move(node), *common_device));""" - assert len(value_types) > 0, f"Only supporting tensor ops so far, none found in {sig}" - first_tensor = value_types[0] - bridge_str = f"""auto result = CreateAtenFromLtcTensor(lazy_{first_tensor.name}.CreateFrom(node));""" if returns_length > 1: bridge_str = f"""std::vector<{self.tensor_class}> lazy_tensors; for (int i = 0; i < {returns_length}; i++) {{ - lazy_tensors.push_back(lazy_{first_tensor.name}.CreateFrom(torch::lazy::Value(node, i))); + lazy_tensors.push_back(torch::lazy::LazyTensor::Create(torch::lazy::Value(node, i), *common_device)); }} - auto result = TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" - if schema.name.name.inplace: + auto result = torch::lazy::TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" + + if schema.name.name.inplace or func.func.is_out_fn(): assert returns_length == 1, "We assumed 
there was no such case where an op is an in-place variant " \ "and has tuple outputs." - bridge_str = f"""lazy_{first_tensor.name}.SetInPlaceIrValue(node); - auto& result = {first_tensor.name};""" + bridge_str = f"""lazy_{first_tensor_name}.SetInPlaceIrValue(node); + auto& result = {first_tensor_name};""" return [f"""\ - // TODO(alanwaketan): Quite a lot inefficient copy-by-value there. Let's optimize it. - {sig.decl(name=f"{self.class_method_name}::{schema.aten_name}")} {{ + {sig.decl(name=f"{self.class_method_name}::{metadata.kernel}")} {{ + {fallback_str} TORCH_LAZY_FN_COUNTER("lazy::"); {get_device_str} {lazy_tensor_decls_str} @@ -219,17 +277,17 @@ class ComputeShapeSignature: """ Here we use the base name as the suffix of the signature to avoid generating for in-place variants. """ - @method_with_native_function - def __init__(self, f: NativeFunction): + def __init__(self, kernel_name: str, f: NativeFunction): self.__schema = LazyIrSchema(f.func) self.__dispatch_args = ', '.join([a.decl() for a in dispatcher.arguments(f.func)]) self.__call_args = ", ".join([f"{t.name}" for t in self.__schema.filtered_types()]) + self.__kernel_name = kernel_name def __decl_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__dispatch_args})" + return f"{self.__kernel_name}({self.__dispatch_args})" def __call_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__call_args})" + return f"{self.__kernel_name}({self.__call_args})" @property def shape_decl(self) -> str: @@ -246,19 +304,20 @@ class GenLazyShapeInferenceDefinition: tensor_class: str @method_with_native_function + # def gen_lazy_shape_inference_decl(f: NativeFunction, backend_index: BackendIndex, tensor_class: str) -> List[str]: def __call__(self, f: NativeFunction) -> List[str]: sig = kernel_signature(f, self.backend_index) - - # Lazy IR stuff + metadata = self.backend_index.get_kernel(f) + assert metadata is not None schema = LazyIrSchema(f.func) value_types = schema.filtered_types(values=True, scalars=False) - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) node_ctor_input_str = node_ctor_inputs(schema) # Only generate shape/dtype fn for non-structured kernels, # since we just use the meta function for structured kernels if not f.structured and f.structured_delegate is None: - shape_sig = ComputeShapeSignature(f) + shape_sig = ComputeShapeSignature(metadata.kernel, f) return ["\n".join([f"{shape_sig.shape_decl};"])] else: return [] diff --git a/tools/codegen/dest/lazy_ts_lowering.py b/tools/codegen/dest/lazy_ts_lowering.py index 32d505cda7b..3f7701d5587 100644 --- a/tools/codegen/dest/lazy_ts_lowering.py +++ b/tools/codegen/dest/lazy_ts_lowering.py @@ -18,13 +18,12 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: continue emplace_arguments.append('loctx->GetOutputOp(operand(i++))') continue - emplace_arguments.append(f'"{value.name}", {value.name}_') + emplace_arguments.append(f'"{value.name}", {value.name}') emplace_arguments_str = "\n ".join( [f"arguments.emplace_back({a});" for a in emplace_arguments]) - emplace_kwarg_values = [f'loctx->GetOutputOp(operand({i}))' for i in range(len(schema.keyword_values))] - emplace_kwarg_scalars = [f'"{t.name}", {t.name}_' for t in schema.keyword_scalars] - assert len(schema.keyword_values) == 0, "TODO the logic for operand(i) is broken if there are kw values" + emplace_kwarg_values = [f'"{t.name}", loctx->GetOutputOp(operand(i++))' for t in 
schema.keyword_values] + emplace_kwarg_scalars = [f'"{t.name}", {t.name}' for t in schema.keyword_scalars] emplace_kwarguments = "\n ".join( [f"kwarguments.emplace_back({a});" for a in emplace_kwarg_values + emplace_kwarg_scalars]) return f"""\ @@ -38,6 +37,5 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments); CHECK_EQ({schema.aten_name}_out.size(), {len(func.returns)}); - // TODO: need to call GenerateClone sometimes? Or else return LowerBuiltIn() directly return {schema.aten_name}_out; """ diff --git a/tools/codegen/gen_lazy_tensor.py b/tools/codegen/gen_lazy_tensor.py index b2515d3d083..9705620fa2e 100644 --- a/tools/codegen/gen_lazy_tensor.py +++ b/tools/codegen/gen_lazy_tensor.py @@ -3,7 +3,8 @@ import argparse import os import yaml from collections import namedtuple -from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple +from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple, Type +from tools.codegen.dest.lazy_ir import LazyIR, TSLazyIR from tools.codegen.gen import get_grouped_native_functions, parse_native_yaml from tools.codegen.model import (FunctionSchema, NativeFunction, NativeFunctionsGroup, OperatorName) @@ -60,20 +61,20 @@ def main() -> None: parser.add_argument( '--node_base_hdr', type=str, default=None, help='Path to header file defining custom Lazy IR Node base class') parser.add_argument( - '--tensor_class', type=str, default="LazyTensor", help='Name of backend specific custom Lazy Tensor class') + '--tensor_class', type=str, default="torch::lazy::LazyTensor", help='Name of backend specific custom Lazy Tensor class') parser.add_argument( - '--tensor_class_hdr', type=str, default="lazy_tensor_core/csrc/tensor.h", + '--tensor_class_hdr', type=str, default="torch/csrc/lazy/core/tensor.h", help='Path to header file defining custom Lazy Tensor class') options = parser.parse_args() run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path, options.gen_ts_lowerings, options.node_base, options.node_base_hdr, - options.tensor_class, options.tensor_class_hdr) + options.tensor_class, options.tensor_class_hdr, TSLazyIR) def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[str], gen_ts_lowerings: bool, node_base: str, node_base_hdr: Optional[str], - tensor_class: str, tensor_class_hdr: str) -> None: + tensor_class: str, tensor_class_hdr: str, lazy_ir_cls: Type[LazyIR]) -> None: # Assumes that this file lives at PYTORCH_ROOT/tools/codegen/gen_backend_stubs.py pytorch_root = pathlib.Path(__file__).parent.parent.parent.absolute() @@ -160,11 +161,13 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st fm.write_with_template(f'{backend_key}NativeFunctions.cpp', 'DispatchKeyNativeFunctions.cpp', lambda: { 'includes': [f'#include <{path}>' for path in [ tensor_class_hdr, + "ATen/Functions.h", "ATen/MetaFunctions.h", + "ATen/Operators.h", + "torch/csrc/lazy/core/lazy_graph_executor.h", "torch/csrc/lazy/core/metrics.h", "torch/csrc/lazy/core/shape.h", - "lazy_tensor_core/csrc/aten_ltc_bridge.h", - "lazy_tensor_core/csrc/lazy_graph_executor.h", + "lazy_tensor_core/csrc/ts_backend/aten_eager_fallback.h", f"{output_dir}/{backend_key}NativeFunctions.h", f"{output_dir}/{backend_key}LazyIr.h", f"{output_dir}/{backend_key}ShapeInference.h", @@ -196,7 +199,8 @@ def run(source_yaml: str, output_dir: 
str, dry_run: bool, impl_path: Optional[st 'func_declarations': list(concat_map_codegen( dest.GenLazyShapeInferenceDefinition(backend_indices[backend_key], tensor_class), - grouped_native_functions + grouped_native_functions, + codegenInplaceVariant=True, )), }) # Generate IR node classes @@ -217,7 +221,7 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st 'DispatchKey': backend_key, 'dispatch_namespace': backend_key.lower(), 'ir_declarations': list(concat_map_codegen( - dest.LazyIR(backend_indices[backend_key], node_base), + lazy_ir_cls(backend_indices[backend_key], node_base), grouped_native_functions )), }) diff --git a/torch/csrc/lazy/core/config.cpp b/torch/csrc/lazy/core/config.cpp index af86dd926d6..b47054913e1 100644 --- a/torch/csrc/lazy/core/config.cpp +++ b/torch/csrc/lazy/core/config.cpp @@ -7,6 +7,11 @@ C10_DEFINE_bool( false, "Enable parameter aliasing support"); +C10_DEFINE_bool( + torch_lazy_use_thread_pool, + false, + "Use thread pool to schedule backend execution"); + C10_DEFINE_int( torch_lazy_compilation_cache_size, 1024, diff --git a/torch/csrc/lazy/core/config.h b/torch/csrc/lazy/core/config.h index beee5b4b214..fa6630123cd 100644 --- a/torch/csrc/lazy/core/config.h +++ b/torch/csrc/lazy/core/config.h @@ -3,6 +3,7 @@ C10_DECLARE_bool(torch_lazy_ir_debug); C10_DECLARE_bool(torch_lazy_param_aliasing); +C10_DECLARE_bool(torch_lazy_use_thread_pool); C10_DECLARE_int(torch_lazy_compilation_cache_size); C10_DECLARE_int(torch_lazy_device_data_cache_size); diff --git a/torch/csrc/lazy/core/ir.cpp b/torch/csrc/lazy/core/ir.cpp index 63e6ee8744c..a1726aacba6 100644 --- a/torch/csrc/lazy/core/ir.cpp +++ b/torch/csrc/lazy/core/ir.cpp @@ -1,6 +1,8 @@ #include #include +C10_DEFINE_bool(ltc_enable_dynamic_shapes, false, "Whether dynamic shape is enabled"); + namespace torch { namespace lazy { @@ -23,6 +25,14 @@ hash_t Value::hash() const { return HashCombine(node->hash(), Hash(index)); } +hash_t Value::hash_with_sizes() const { + return HashCombine(node->hash_with_sizes(), Hash(index)); +} + +hash_t Value::hash_without_sizes() const { + return HashCombine(node->hash_without_sizes(), Hash(index)); +} + OpKind OpKind::Get(const std::string& name) { return OpKind(c10::Symbol::fromQualString(name)); } @@ -31,18 +41,25 @@ hash_t OpKind::hash() const { return StringHash(op.toQualString()); } -Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash) +bool Node::enableDynamicShape() { + static bool enabled = std::getenv("LTC_ENABLE_DYNAMIC_SHAPES") != nullptr; + return enabled || FLAGS_ltc_enable_dynamic_shapes; +} + +Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn) : op_(op), num_outputs_(num_outputs), node_hash_(node_hash), - dag_hash_(dag_hash), + dag_hash_without_sizes_(dag_hash_fn(false)), + dag_hash_with_sizes_(dag_hash_fn(true)), metadata_(GetMetaDataIfDebugging()) {} -Node::Node(OpKind op, size_t num_outputs, hash_t node_hash) +Node::Node(OpKind op, size_t num_outputs, std::function node_hash_fn) : op_(op), num_outputs_(num_outputs), - node_hash_(node_hash), - dag_hash_(node_hash), + node_hash_(node_hash_fn(!enableDynamicShape())), + dag_hash_without_sizes_(node_hash_fn(false)), + dag_hash_with_sizes_(node_hash_fn(true)), metadata_(GetMetaDataIfDebugging()) {} Node::~Node() = default; diff --git a/torch/csrc/lazy/core/ir.h b/torch/csrc/lazy/core/ir.h index 6ca1df8d2fb..4132400bb65 100644 --- a/torch/csrc/lazy/core/ir.h +++ b/torch/csrc/lazy/core/ir.h @@ -15,6 +15,9 @@ #include #include #include 
+#include + +C10_DECLARE_bool(ltc_enable_dynamic_shapes); namespace torch { namespace lazy { @@ -65,9 +68,12 @@ using OutputMap = std::unordered_map; // Represents an input/operand for a Node object. struct TORCH_API Value { Value() = default; - /* implicit */ Value(NodePtr node, size_t index = 0) : node(std::move(node)), index(index) {} + /* implicit */ Value(NodePtr&& node, size_t index = 0) : node(std::move(node)), index(index) {} + /* implicit */ Value(const NodePtr& node, size_t index = 0) : node(node), index(index) {} hash_t hash() const; + hash_t hash_with_sizes() const; + hash_t hash_without_sizes() const; operator bool() const { return node != nullptr; @@ -121,7 +127,6 @@ inline std::ostream& operator<<(std::ostream& stream, const OpKind& op) { using OpList = c10::ArrayRef; - // A node in the graph. Nodes for operations which requires extra data to be // stored for lowering, should inherit from this class and add operation // specific member there. For example, a constant might create a new @@ -130,13 +135,18 @@ using OpList = c10::ArrayRef; // client data handle in it. class TORCH_API Node { public: + static bool enableDynamicShape(); + // Creates a new node with the given op name. The op is a unique identifier // for the operation. The num_outputs tells how many outputs a given operation // generates. - Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash); + // + // None leaf node's node_hash does not contains shape information always. + // So we pass in the hash value rather than a function. + Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn); // Contructor used to create leaf nodes. - Node(OpKind op, size_t num_outputs, hash_t node_hash); + Node(OpKind op, size_t num_outputs, std::function node_hash_fn); virtual ~Node(); @@ -157,7 +167,15 @@ class TORCH_API Node { } hash_t hash() const { - return dag_hash_; + return enableDynamicShape() ? dag_hash_without_sizes_ : dag_hash_with_sizes_; + } + + hash_t hash_without_sizes() const { + return dag_hash_without_sizes_; + } + + hash_t hash_with_sizes() const { + return dag_hash_with_sizes_; } const MetaData& metadata() const { @@ -183,8 +201,17 @@ class TORCH_API Node { // The hash value of this node. hash_t node_hash_; - // The hash value of the graph rooted at this node. - hash_t dag_hash_; + // dag_hash represents the hash value of the graph rooted at this node. There are 2 variants, one + // with sizes info and one without. We need 2 such hashes to support dynamic + // shape. Here are the logic to pick the hash in the 2 major scenarios that a hash is needed: + // - shape cache: in this case, we always use the dag hash with size info. This way, looking up the + // shape for one node does not get the shape for another node with the same rank but different sizes + // - lookup the compiled graph by a hash: in this case, we will use the dag hash + // WITHOUT size info if dynamic shape is enabled and use the dag hash WITH size info otherwise. + // The different requirement for the hash in these 2 scenarios forces us to maintain 2 + // different hashes. + hash_t dag_hash_without_sizes_; + hash_t dag_hash_with_sizes_; // The IR specific metadata attached to the IR node. 
MetaData metadata_; // The IR framework user can attach a user defined metadata object deriving diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index 9f504c935e9..3599abb7b8d 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -462,7 +462,7 @@ void LazyGraphExecutor::SyncTensorsGraph( config.sync_ltc_data = sync_ltc_data; auto async = SyncTensorsGraphInternal(tensors, devices, config); - if (wait && async != nullptr) { + if (FLAGS_torch_lazy_use_thread_pool && wait && async != nullptr) { async->mwait.Wait(); } } @@ -972,7 +972,11 @@ std::shared_ptr LazyGraphExecutor:: } }; - ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); + if (FLAGS_torch_lazy_use_thread_pool) { + ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); + } else { + syncfn(); + } return async; } @@ -995,7 +999,7 @@ std::vector LazyGraphExecutor::GetTensorsFused( SyncTensorsConfig config; config.force_ltc_data = false; auto async = SyncTensorsGraphInternal(tensors, {}, config); - if (async != nullptr) { + if (FLAGS_torch_lazy_use_thread_pool && async != nullptr) { async->mwait.Wait(); } std::vector tensors_data = GatherTensorsData( diff --git a/torch/csrc/lazy/core/shape.cpp b/torch/csrc/lazy/core/shape.cpp index 2b7fd2c74b8..bd5ea5b75c9 100644 --- a/torch/csrc/lazy/core/shape.cpp +++ b/torch/csrc/lazy/core/shape.cpp @@ -28,8 +28,12 @@ size_t Shape::numel() const { return elts; } -hash_t Shape::hash() const { - return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); +hash_t Shape::hash(bool bakeInSizes) const { + if (bakeInSizes) { + return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); + } else { + return HashCombine(Hash(scalar_type_), Hash(sizes_.size())); + } } } // namespace lazy diff --git a/torch/csrc/lazy/core/shape.h b/torch/csrc/lazy/core/shape.h index c67ff908833..9b34b90fec0 100644 --- a/torch/csrc/lazy/core/shape.h +++ b/torch/csrc/lazy/core/shape.h @@ -25,7 +25,7 @@ class TORCH_API Shape { int64_t size(int64_t dim) const { return sizes_.at(dim); } void set_size(int64_t dim, int64_t size) { sizes_.at(dim) = size; } size_t numel() const; - hash_t hash() const; + hash_t hash(bool bakeInSizes) const; bool operator==(const Shape& other) const; diff --git a/torch/csrc/lazy/ts_backend/ts_node.cpp b/torch/csrc/lazy/ts_backend/ts_node.cpp index d79dd999f81..a7948e5cbec 100644 --- a/torch/csrc/lazy/ts_backend/ts_node.cpp +++ b/torch/csrc/lazy/ts_backend/ts_node.cpp @@ -28,14 +28,15 @@ void TsNodeSetShapeDeferred( throw std::runtime_error("Expected TsNode but could not dynamic cast"); } -hash_t OperandHashes(const OpList& operands, const hash_t& seed) { +hash_t OperandHashes(const OpList& operands, const hash_t& seed, bool bakeInSizes) { hash_t hash = seed; for (auto& operand : operands) { if (!operand) { hash = HashCombine(hash, static_cast(kNullOpt)); continue; } - hash = HashCombine(hash, operand.hash()); + auto operand_hash = bakeInSizes ? operand.hash_with_sizes() : operand.hash_without_sizes(); + hash = HashCombine(hash, operand_hash); } return hash; } @@ -48,7 +49,7 @@ TsNode::TsNode(OpKind op, OpList operands, std::vector&& shapes, // initialization to a separate function? 
/* node_hash */ HashCombine(op.hash(), hash_seed), /* dag_hash */ - OperandHashes(operands, HashCombine(op.hash(), hash_seed))), + [&](bool bakeInSizes) { return OperandHashes(operands, HashCombine(op.hash(), hash_seed), bakeInSizes); }), shapes_(shapes) { for (auto& operand : operands) { // Ideally, optional operands should be filtered by the leaf node classes, @@ -80,7 +81,7 @@ void TsNode::SetShapeDeferred( } TsNode::TsNode(OpKind op, Shape shape, size_t num_outputs, hash_t hash_seed) - : Node(op, num_outputs, GetOpHash(op, shape, hash_seed)) + : Node(op, num_outputs, [&](bool bakeInSizes) -> hash_t { return GetOpHash(op, shape, hash_seed, bakeInSizes); }) { shapes_.push_back(std::move(shape)); } @@ -98,10 +99,11 @@ ShapeCache* GetShapeCache() { Shape TsNode::GetOpShape( const std::function& shape_fn) const { + auto hash = hash_with_sizes(); ShapeCache* shape_cache = GetShapeCache(); - auto shape = shape_cache->Get(hash()); + auto shape = shape_cache->Get(hash); if (shape == nullptr) { - shape = shape_cache->Add(hash(), + shape = shape_cache->Add(hash, std::make_shared(shape_fn())); } return *shape; @@ -120,8 +122,8 @@ std::string TsNode::ToString() const { return ss.str(); } -hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed) { - hash_t h = HashCombine(op.hash(), shape.hash()); +hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes) { + hash_t h = HashCombine(op.hash(), shape.hash(bakeInSizes)); return HashCombine(h, hash_seed); } diff --git a/torch/csrc/lazy/ts_backend/ts_node.h b/torch/csrc/lazy/ts_backend/ts_node.h index a6595a5337d..156444852d9 100644 --- a/torch/csrc/lazy/ts_backend/ts_node.h +++ b/torch/csrc/lazy/ts_backend/ts_node.h @@ -55,7 +55,7 @@ class TORCH_API TsNode : public lazy::Node { std::string ToString() const override; - static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed); + static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes); const std::vector& operands() const override { return operands_as_outputs_; From 51b04f27c7d7e57ee87a76c19849e8b5db9a1072 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Thu, 17 Feb 2022 21:16:33 -0800 Subject: [PATCH 150/199] [ci] do not run distributed jobs for windows (#73064) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73064 These accidentally got turned on by https://github.com/pytorch/pytorch/pull/73001. Turn them off. 
Test Plan: Imported from OSS Reviewed By: shannonzhu Differential Revision: D34332530 Pulled By: suo fbshipit-source-id: a6493b7d94465fa9141f1527648dbbec09c5706d (cherry picked from commit b18c95e4a68e7d96e617edfb83a3e55780b49f4c) --- .github/scripts/generate_ci_workflows.py | 4 + ...rated-periodic-win-vs2019-cuda11.1-py3.yml | 156 ------------------ ...rated-periodic-win-vs2019-cuda11.5-py3.yml | 156 ------------------ .../generated-win-vs2019-cpu-py3.yml | 148 ----------------- .../generated-win-vs2019-cuda11.3-py3.yml | 156 ------------------ 5 files changed, 4 insertions(+), 616 deletions(-) diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 5507da89953..82198b077ea 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -385,6 +385,7 @@ WINDOWS_WORKFLOWS = [ arch="windows", build_environment="win-vs2019-cpu-py3", cuda_version="cpu", + enable_distributed_test=False, test_runner_type=WINDOWS_CPU_TEST_RUNNER, num_test_shards=2, ciflow_config=CIFlowConfig( @@ -396,6 +397,7 @@ WINDOWS_WORKFLOWS = [ arch="windows", build_environment="win-vs2019-cuda11.3-py3", cuda_version="11.3", + enable_distributed_test=False, test_runner_type=WINDOWS_CUDA_TEST_RUNNER, num_test_shards=2, enable_force_on_cpu_test=True, @@ -408,6 +410,7 @@ WINDOWS_WORKFLOWS = [ arch="windows", build_environment="periodic-win-vs2019-cuda11.5-py3", cuda_version="11.5", + enable_distributed_test=False, test_runner_type=WINDOWS_CUDA_TEST_RUNNER, num_test_shards=2, enable_force_on_cpu_test=True, @@ -421,6 +424,7 @@ WINDOWS_WORKFLOWS = [ arch="windows", build_environment="periodic-win-vs2019-cuda11.1-py3", cuda_version="11.1", + enable_distributed_test=False, test_runner_type=WINDOWS_CUDA_TEST_RUNNER, num_test_shards=2, is_scheduled="45 0,4,8,12,16,20 * * *", diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml index 37957f0b8a0..bb23fb9699b 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.1-py3.yml @@ -131,162 +131,6 @@ jobs: run: | rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - test_distributed_1_1: - name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: distributed - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.8xlarge.nvidia.gpu - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Upload test statistics - if: always() - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-win-vs2019-cuda11.1-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install 
boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* test_default_1_2: name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 diff --git a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml index f6171a5776e..98bcb20dd6d 100644 --- a/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml +++ b/.github/workflows/generated-periodic-win-vs2019-cuda11.5-py3.yml @@ -279,162 +279,6 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_distributed_1_1: - name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) - timeout-minutes: 240 - env: - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: distributed - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.8xlarge.nvidia.gpu - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in 
pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Upload test statistics - if: always() - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: periodic-win-vs2019-cuda11.5-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* test_default_1_2: name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 diff --git a/.github/workflows/generated-win-vs2019-cpu-py3.yml b/.github/workflows/generated-win-vs2019-cpu-py3.yml index 820a77ddcc4..66c906c0888 100644 --- a/.github/workflows/generated-win-vs2019-cpu-py3.yml +++ b/.github/workflows/generated-win-vs2019-cpu-py3.yml @@ -124,154 +124,6 @@ jobs: run: | rm -rf "${PYTORCH_FINAL_PACKAGE_DIR}" rm -rf ./* - test_distributed_1_1: - name: test (distributed, 1, 1, windows.4xlarge) - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cpu-py3-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: distributed - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.4xlarge - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" 
- } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.4xlarge' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.4xlarge' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Upload test statistics - if: always() - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cpu-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - 
WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* test_default_1_2: name: test (default, 1, 2, windows.4xlarge) timeout-minutes: 240 diff --git a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml index 3dbbc4cf0a6..900b71aa51a 100644 --- a/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml +++ b/.github/workflows/generated-win-vs2019-cuda11.3-py3.yml @@ -281,162 +281,6 @@ jobs: # Should remove the entirety of pytorch-${{ github.run_id }} run: | rm -rf ./* - test_distributed_1_1: - name: test (distributed, 1, 1, windows.8xlarge.nvidia.gpu) - timeout-minutes: 240 - env: - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test - SHARD_NUMBER: 1 - NUM_TEST_SHARDS: 1 - TEST_CONFIG: distributed - http_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - https_proxy: "http://internal-tf-lb-20210727220640487900000002-835786077.us-east-1.elb.amazonaws.com:3128" - PR_BODY: ${{ github.event.pull_request.body }} - needs: build - runs-on: windows.8xlarge.nvidia.gpu - steps: - - name: Display EC2 information - shell: bash - run: | - set -euo pipefail - function get_ec2_metadata() { - # Pulled from instance metadata endpoint for EC2 - # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html - category=$1 - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - } - echo "ami-id: $(get_ec2_metadata ami-id)" - echo "instance-id: $(get_ec2_metadata instance-id)" - echo "instance-type: $(get_ec2_metadata instance-type)" - - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" - uses: seemethere/add-github-ssh-key@v1 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Checkout PyTorch - uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - # deep clone, to allow use of git merge-base - fetch-depth: 0 - submodules: recursive - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - - name: Install Visual Studio 2019 toolchain - shell: powershell - run: | - .\.circleci\scripts\vs_install.ps1 - - name: Install Cuda - shell: bash - run: | - .circleci/scripts/windows_cuda_install.sh - - name: Install Cudnn - shell: bash - run: | - .circleci/scripts/windows_cudnn_install.sh - - uses: seemethere/download-artifact-s3@0504774707cbc8603d7dca922e8026eb8bf3b47b - name: Download PyTorch Build Artifacts - with: - name: ${{ env.BUILD_ENVIRONMENT }} - path: C:\${{ github.run_id }}\build-results - - name: Check build-results folder - shell: powershell - run: | - tree /F C:\$Env:GITHUB_RUN_ID\build-results - # Needed for coverage in win-test.sh - - uses: actions/setup-python@v2 - name: Setup Python3 - with: - python-version: '3.x' - - name: Test - shell: bash - env: - PYTORCH_FINAL_PACKAGE_DIR: /c/${{ github.run_id }}/build-results/ - # Time out the test phase after 3.5 hours - timeout-minutes: 210 - run: | - .jenkins/pytorch/win-test.sh - - name: Zip JSONs for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' - 
shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-jsons-$Env:FILE_SUFFIX.zip" -ir'!test\*.json' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Downloaded JSONs on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: warn - path: - test-jsons-*.zip - - name: Zip test reports for upload - if: always() - env: - FILE_SUFFIX: '${{ github.job }}-distributed-1-1-windows.8xlarge.nvidia.gpu' - shell: powershell - run: | - # -ir => recursive include all files in pattern - 7z a "test-reports-$Env:FILE_SUFFIX.zip" -ir'!test\*.xml' - - uses: seemethere/upload-artifact-s3@v3 - name: Store Test Reports on S3 - if: always() - with: - retention-days: 14 - if-no-files-found: error - path: - test-reports-*.zip - - name: Install render_test_results dependencies - if: always() - shell: bash - run: | - python3 -m pip install junitparser==2.1.1 rich==10.9.0 - - name: "[[ Click me for rendered test results (useful for finding failing tests) ]]" - if: always() - shell: bash - # Encoding is weird on windows, just try to default to utf-8 if possible - env: - PYTHONIOENCODING: "utf-8" - run: | - python3 tools/render_junit.py test/ - - name: Wait until all sessions have drained - shell: powershell - if: always() - timeout-minutes: 120 - run: | - .github\scripts\wait_for_ssh_to_drain.ps1 - - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) - shell: powershell - if: always() - run: | - .github\scripts\kill_active_ssh_sessions.ps1 - - name: Parse ref - id: parse-ref - run: .github/scripts/parse_ref.py - - name: Upload test statistics - if: always() - env: - AWS_DEFAULT_REGION: us-east-1 - BRANCH: ${{ steps.parse-ref.outputs.branch }} - JOB_BASE_NAME: win-vs2019-cuda11.3-py3-test - PR_NUMBER: ${{ github.event.pull_request.number }} - SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - TAG: ${{ steps.parse-ref.outputs.tag }} - WORKFLOW_ID: '${{ github.run_id }}' - shell: bash - run: | - python3 -m pip install -r requirements.txt - python3 -m pip install boto3==1.19.12 - python3 -m tools.stats.print_test_stats --upload-to-s3 --compare-with-s3 test - - name: Cleanup workspace - if: always() - shell: bash - # Should remove the entirety of pytorch-${{ github.run_id }} - run: | - rm -rf ./* test_default_1_2: name: test (default, 1, 2, windows.8xlarge.nvidia.gpu) timeout-minutes: 240 From 710f12f58e2d24d583c3e3bab75ec8c169ebbf57 Mon Sep 17 00:00:00 2001 From: Terry Chen Date: Thu, 17 Feb 2022 22:24:57 -0800 Subject: [PATCH 151/199] [quant] Add ConvTranspose reference module (#73031) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73031 Add ConvTranspose reference module Test Plan: python3 test/test_quantization.py TestQuantizeEagerOps.test_conv_transpose_2d Imported from OSS Reviewed By: jerryzh168 Differential Revision: D34313425 fbshipit-source-id: 3eeec1b24a51c7951c4d4b0c7dca43a012468b85 (cherry picked from commit 0ee7c1cc39631855ac711d861a46ec0ea65a3cbc) --- .../eager/test_quantize_eager_ptq.py | 125 +++++++++++++ torch/ao/quantization/quantize.py | 12 +- .../quantized/_reference/modules/__init__.py | 5 +- torch/nn/quantized/_reference/modules/conv.py | 169 +++++++++++++++++- 4 files changed, 307 insertions(+), 4 deletions(-) diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py index 6587740bdf9..9665d8554bc 100644 --- a/test/quantization/eager/test_quantize_eager_ptq.py +++ 
b/test/quantization/eager/test_quantize_eager_ptq.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn import torch.nn.quantized as nnq +import torch.nn.quantized._reference as nnqr from torch.nn.utils.rnn import PackedSequence from torch.ao.quantization import ( quantize, @@ -74,6 +75,130 @@ import unittest import numpy as np class TestQuantizeEagerOps(QuantizationTestCase): + def _test_reference_module_impl(self, + float_module_class, + quantized_module_class, + extra_module_kwargs, + input_size): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = float_module_class(**extra_module_kwargs) + self.quant = QuantStub() + self.dequant = DeQuantStub() + + def forward(self, x): + x = self.quant(x) + x = self.conv(x) + x = self.dequant(x) + return x + + class RefM(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = float_module_class(**extra_module_kwargs) + self.quant1 = QuantStub() + self.dequant1 = DeQuantStub() + self.quant2 = QuantStub() + self.dequant2 = DeQuantStub() + + def forward(self, x): + x = self.quant1(x) + x = self.dequant1(x) + x = self.conv(x) + x = self.quant2(x) + x = self.dequant2(x) + return x + + qengine = 'fbgemm' + with override_quantized_engine(qengine): + data = torch.randn(*input_size, dtype=torch.float) + original_m = M() + original_ref_m = RefM() + + original_ref_m.conv.weight = torch.nn.Parameter(original_m.conv.weight.detach()) + original_ref_m.conv.bias = torch.nn.Parameter(original_m.conv.bias.detach()) + + original_m.qconfig = torch.quantization.default_qconfig + + m = prepare(original_m) + # calibration + m(data) + m = convert(m) + # check if the module is properly quantized + self.assertEqual(type(m.quant), nnq.Quantize) + self.assertEqual(type(m.conv), quantized_module_class) + self.assertEqual(type(m.dequant), nnq.DeQuantize) + res = m(data) + + # quantize the reference model + original_ref_m.eval() + original_ref_m.qconfig = torch.quantization.default_qconfig + + ref_m = prepare(original_ref_m) + ref_m(data) + reference_module_mapping = { + QuantStub: nnq.Quantize, + DeQuantStub: nnq.DeQuantize, + nn.Conv1d: nnqr.Conv1d, + nn.Conv2d: nnqr.Conv2d, + nn.Conv3d: nnqr.Conv3d, + nn.ConvTranspose1d: nnqr.ConvTranspose1d, + nn.ConvTranspose2d: nnqr.ConvTranspose2d, + nn.ConvTranspose3d: nnqr.ConvTranspose3d, + } + ref_m = convert(ref_m, mapping=reference_module_mapping) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) + + def test_conv_1d(self): + self._test_reference_module_impl( + nn.Conv1d, + nnq.Conv1d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 1) + ) + + def test_conv_2d(self): + self._test_reference_module_impl( + nn.Conv2d, + nnq.Conv2d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10) + ) + + def test_conv_3d(self): + self._test_reference_module_impl( + nn.Conv3d, + nnq.Conv3d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10, 10) + ) + + def test_conv_transpose_1d(self): + self._test_reference_module_impl( + nn.ConvTranspose1d, + nnq.ConvTranspose1d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 1) + ) + + def test_conv_transpose_2d(self): + self._test_reference_module_impl( + nn.ConvTranspose2d, + nnq.ConvTranspose2d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 10) + ) + + def test_conv_transpose_3d(self): + self._test_reference_module_impl( + nn.ConvTranspose3d, + nnq.ConvTranspose3d, + {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, + (16, 1, 10, 
10, 10) + ) + def _test_activation_op_impl( self, float_module_class, quantized_module_class, extra_module_kwargs): """ Implementation for testing common activation ops like leaky relu diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py index 5afff09b64b..fad2b8abe6e 100644 --- a/torch/ao/quantization/quantize.py +++ b/torch/ao/quantization/quantize.py @@ -16,7 +16,7 @@ from torch.ao.quantization.quantization_mappings import ( _has_special_act_post_process, _get_special_act_post_process, ) - +from .utils import get_qparam_dict from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper from torch.ao.quantization.qconfig import ( add_module_to_qconfig_obs_ctr, @@ -565,7 +565,15 @@ def swap_module(mod, mapping, custom_module_class_mapping): new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) swapped = True elif type(mod) in mapping: - new_mod = mapping[type(mod)].from_float(mod) + qmod = mapping[type(mod)] + if hasattr(qmod, '_IS_REFERENCE') and qmod._IS_REFERENCE: + assert mod.qconfig is not None + weight_post_process = mod.qconfig.weight() + weight_post_process(mod.weight) + weight_qparams = get_qparam_dict(weight_post_process) + new_mod = qmod.from_float(mod, weight_qparams) + else: + new_mod = qmod.from_float(mod) swapped = True if swapped: diff --git a/torch/nn/quantized/_reference/modules/__init__.py b/torch/nn/quantized/_reference/modules/__init__.py index 441852c38f9..efbefdbde60 100644 --- a/torch/nn/quantized/_reference/modules/__init__.py +++ b/torch/nn/quantized/_reference/modules/__init__.py @@ -1,9 +1,12 @@ from .linear import Linear -from .conv import Conv1d, Conv2d, Conv3d +from .conv import Conv1d, Conv2d, Conv3d, ConvTranspose1d, ConvTranspose2d, ConvTranspose3d __all__ = [ 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', + 'ConvTranspose1d', + 'ConvTranspose2d', + 'ConvTranspose3d', ] diff --git a/torch/nn/quantized/_reference/modules/conv.py b/torch/nn/quantized/_reference/modules/conv.py index ed151cb7f5e..60aed0a91ac 100644 --- a/torch/nn/quantized/_reference/modules/conv.py +++ b/torch/nn/quantized/_reference/modules/conv.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from typing import Optional, Dict, Any +from typing import Optional, Dict, Any, List from torch.nn.common_types import _size_1_t from .utils import _quantize_weight, _quantize_and_dequantize_weight from .utils import _save_weight_qparams @@ -14,6 +14,7 @@ class _ConvNd(torch.nn.modules.conv._ConvNd): this is useful when user want to use this module in other backends like Glow. """ __annotations__ = {"bias": Optional[torch.Tensor]} + _IS_REFERENCE = True def _save_to_state_dict(self, destination, prefix, keep_vars): super()._save_to_state_dict(destination, prefix, keep_vars) @@ -217,3 +218,169 @@ class Conv3d(_ConvNd, nn.Conv3d): @classmethod def from_float(cls, float_conv, weight_qparams): return _ConvNd.from_float(cls, float_conv, weight_qparams) + +class _ConvTransposeNd(_ConvNd, torch.nn.modules.conv._ConvTransposeNd): + """ A reference version of nn.quantized.ConvTranspose2d + we will not pack the parameters in this module, since weight packing is an + optimization for quantized backends supported in PyTorch (fbgemm/qnnpack), + this is useful when user want to use this module in other backends like Glow. 
+ """ + @staticmethod + def from_float(cls, float_conv, weight_qparams): + qref_conv = cls( + float_conv.in_channels, + float_conv.out_channels, + float_conv.kernel_size, # type: ignore[arg-type] + float_conv.stride, # type: ignore[arg-type] + float_conv.padding, # type: ignore[arg-type] + float_conv.output_padding, # type: ignore[arg-type] + float_conv.groups, + float_conv.bias is not None, # type: ignore[arg-type] + float_conv.dilation, # type: ignore[arg-type] + float_conv.padding_mode, + device=float_conv.weight.device, + dtype=float_conv.weight.dtype, + weight_qparams=weight_qparams) + qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach()) + if float_conv.bias is not None: + qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach()) + return qref_conv + + +class ConvTranspose1d(_ConvTransposeNd, nn.ConvTranspose1d): + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: _size_1_t, + stride: _size_1_t = 1, + padding: _size_1_t = 0, + output_padding: _size_1_t = 0, + groups: int = 1, + bias: bool = True, + dilation: _size_1_t = 1, + padding_mode: str = "zeros", + device=None, + dtype=None, + weight_qparams: Optional[Dict[str, Any]] = None): + nn.ConvTranspose1d.__init__( + self, in_channels, out_channels, kernel_size, stride, padding, output_padding, + groups, bias, dilation, padding_mode, device, dtype) + self._init_weight_qparams(weight_qparams, device) + + def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.convTranspose1d --- + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.convTranspose1d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv1d + """ + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
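+        # Resolve the effective output_padding: the module's own setting when
+        # output_size is None, otherwise the extra padding needed for the
+        # transposed convolution to produce the requested output_size.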
+ output_padding = self._output_padding( + input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] + + weight_dequant = self.get_weight() + result = F.conv_transpose1d( + x, weight_dequant, self.bias, self.stride, + self.padding, output_padding, self.groups, self.dilation) + return result + + def _get_name(self): + return "QuantizedConvTranspose1d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): + return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) + +class ConvTranspose2d(_ConvTransposeNd, nn.ConvTranspose2d): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, output_padding=0, + groups=1, bias=True, dilation=1, + padding_mode='zeros', + device=None, + dtype=None, + weight_qparams: Optional[Dict[str, Any]] = None): + + nn.ConvTranspose2d.__init__( + self, in_channels, out_channels, kernel_size, stride, padding, output_padding, + groups, bias, dilation, padding_mode, device, dtype) + self._init_weight_qparams(weight_qparams, device) + + def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.convTranspose2d --- + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.convTranspose2d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv2d + """ + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. + + output_padding = self._output_padding( + input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] + + weight_dequant = self.get_weight() + result = F.conv_transpose2d( + x, weight_dequant, self.bias, self.stride, + self.padding, output_padding, self.groups, self.dilation) + + return result + + def _get_name(self): + return "QuantizedConvTranspose2d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): + return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) + +class ConvTranspose3d(_ConvTransposeNd, nn.ConvTranspose3d): + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, output_padding=0, + groups=1, bias=True, dilation=1, + padding_mode="zeros", + device=None, + dtype=None, + weight_qparams: Optional[Dict[str, Any]] = None): + nn.ConvTranspose3d.__init__( + self, in_channels, out_channels, kernel_size, stride, padding, output_padding, + groups, bias, dilation, padding_mode, device, dtype) + self._init_weight_qparams(weight_qparams, device) + + def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: + """ + we have: + w(float) -- quant - dequant \ + x(float) ------------- F.convTranspose3d --- + In the full model, we will see + w(float) -- quant - *dequant \ + x -- quant --- *dequant -- *F.convTranspose3d --- *quant - dequant + and the backend should be able to fuse the ops with `*` into a quantized conv3d + """ + + assert isinstance(self.padding, tuple) + # One cannot replace List by Tuple or Sequence in "_output_padding" because + # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
+ output_padding = self._output_padding( + input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] + + weight_dequant = self.get_weight() + result = F.conv_transpose3d( + x, weight_dequant, self.bias, self.stride, + self.padding, output_padding, self.groups, self.dilation) + return result + + def _get_name(self): + return "QuantizedConvTranspose3d(Reference)" + + @classmethod + def from_float(cls, float_conv, weight_qparams): + return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) From e73eaffd3b1b7571abbd42d697bd6893dde5701e Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Fri, 18 Feb 2022 05:08:02 -0800 Subject: [PATCH 152/199] quant: add QAT fused Linear-Bn1d [1/x]: prepared module (#72431) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72431 Adds support for a fused QAT observed module for `Linear` followed by `BatchNorm1d`. In this PR, only the support for prepared module with fake_quants in the right places is added. A future PR will add support for `convert`, and tests for eager and FX graph mode workflows. Similar to conv-bn, we rescale the weight before applying the fake quant, and undo the rescaling after the linear operation. Test Plan: ``` python test/test_quantization.py TestQuantizeEagerQATNumerics.test_linear_bn ``` Imported from OSS Reviewed By: jerryzh168, raghuramank10000 Differential Revision: D34044427 fbshipit-source-id: 47a519173939ca4824d2c6e6ea7a599764a8ed10 (cherry picked from commit bfc75fe0785e12b0fcc45d58bb04b6da347c1767) --- .../eager/test_quantize_eager_qat.py | 23 +++ .../ao/quantization/fuser_method_mappings.py | 7 +- .../ao/quantization/quantization_mappings.py | 1 + torch/nn/intrinsic/modules/__init__.py | 2 + torch/nn/intrinsic/modules/fused.py | 9 + torch/nn/intrinsic/qat/modules/__init__.py | 2 + .../nn/intrinsic/qat/modules/linear_fused.py | 154 ++++++++++++++++++ 7 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 torch/nn/intrinsic/qat/modules/linear_fused.py diff --git a/test/quantization/eager/test_quantize_eager_qat.py b/test/quantization/eager/test_quantize_eager_qat.py index efb7882c2dc..478c424a03e 100644 --- a/test/quantization/eager/test_quantize_eager_qat.py +++ b/test/quantization/eager/test_quantize_eager_qat.py @@ -1,5 +1,6 @@ # Owner(s): ["oncall: quantization"] +import copy import math import torch import torch.nn as nn @@ -10,6 +11,7 @@ from torch.nn.modules.utils import _pair import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd import torch.nn.qat as nnqat +import torch.nn.intrinsic.qat as nniqat import torch.nn.qat.dynamic as nnqatd from torch.ao.quantization import ( prepare, @@ -984,6 +986,27 @@ class TestQuantizeEagerQATNumerics(QuantizationTestCase): qat_op_optim.step() qat_ref_op_optim.step() + @override_qengines + def test_linear_bn_numerics(self): + qengine = torch.backends.quantized.engine + m_ref = nn.Sequential( + nn.Linear(4, 4), + nn.BatchNorm1d(4), + ) + m_ref_copy = copy.deepcopy(m_ref) + m_ref_copy = torch.ao.quantization.fuse_modules_qat(m_ref_copy, [['0', '1']]) + qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine) + m_ref_copy[0].qconfig = qconfig + m = nniqat.LinearBn1d.from_float(m_ref_copy[0]) + + # without fake_quants, fused QAT module should match fp32 module + m.apply(torch.quantization.disable_fake_quant) + data = torch.randn(4, 4) + r1 = m_ref(data) + r2 = m(data) + self.assertTrue(torch.allclose(r1, r2)) + + if __name__ == '__main__': raise RuntimeError("This test 
file is not meant to be run directly, use:\n\n" "\tpython test/test_quantization.py TESTNAME\n\n" diff --git a/torch/ao/quantization/fuser_method_mappings.py b/torch/ao/quantization/fuser_method_mappings.py index 23e5a1f4c35..c2b37d28b6f 100644 --- a/torch/ao/quantization/fuser_method_mappings.py +++ b/torch/ao/quantization/fuser_method_mappings.py @@ -114,7 +114,12 @@ def fuse_linear_bn(is_qat, linear, bn): if is_qat: # TODO: remove the assert later assert linear.training, "qat is only supported when linear.training is True currently" - raise Exception("Fusing Linear+BatchNorm not yet supported in training.") + assert bn.num_features == linear.out_features,\ + "Output features of Linear must match num_features of BatchNorm1d" + assert bn.affine, "Only support fusing BatchNorm1d with affine set to True" + assert bn.track_running_stats,\ + "Only support fusing BatchNorm1d with tracking_running_stats set to True" + return nni.LinearBn1d(linear, bn) else: return nn.utils.fusion.fuse_linear_bn_eval(linear, bn) diff --git a/torch/ao/quantization/quantization_mappings.py b/torch/ao/quantization/quantization_mappings.py index 3f3ce8fff5d..c299f375720 100644 --- a/torch/ao/quantization/quantization_mappings.py +++ b/torch/ao/quantization/quantization_mappings.py @@ -99,6 +99,7 @@ DEFAULT_QAT_MODULE_MAPPINGS : Dict[Callable, Any] = { nni.ConvReLU2d: nniqat.ConvReLU2d, nni.ConvReLU3d: nniqat.ConvReLU3d, nni.LinearReLU: nniqat.LinearReLU, + nni.LinearBn1d: nniqat.LinearBn1d, } # Default map for swapping dynamic modules diff --git a/torch/nn/intrinsic/modules/__init__.py b/torch/nn/intrinsic/modules/__init__.py index 21536df2501..c998621f668 100644 --- a/torch/nn/intrinsic/modules/__init__.py +++ b/torch/nn/intrinsic/modules/__init__.py @@ -11,6 +11,7 @@ from .fused import ConvReLU3d from .fused import LinearReLU from .fused import BNReLU2d from .fused import BNReLU3d +from .fused import LinearBn1d __all__ = [ @@ -27,4 +28,5 @@ __all__ = [ 'LinearReLU', 'BNReLU2d', 'BNReLU3d', + 'LinearBn1d', ] diff --git a/torch/nn/intrinsic/modules/fused.py b/torch/nn/intrinsic/modules/fused.py index 17ab2c31eb3..1c09caff52a 100644 --- a/torch/nn/intrinsic/modules/fused.py +++ b/torch/nn/intrinsic/modules/fused.py @@ -113,3 +113,12 @@ class BNReLU3d(_FusedModule): 'Incorrect types for input modules{}{}'.format( type(batch_norm), type(relu)) super().__init__(batch_norm, relu) + + +class LinearBn1d(_FusedModule): + r"""This is a sequential container which calls the Linear and BatchNorm1d modules. 
+ During quantization this will be replaced with the corresponding fused module.""" + def __init__(self, linear, bn): + assert type(linear) == Linear and type(bn) == BatchNorm1d, \ + 'Incorrect types for input modules{}{}'.format(type(linear), type(bn)) + super().__init__(linear, bn) diff --git a/torch/nn/intrinsic/qat/modules/__init__.py b/torch/nn/intrinsic/qat/modules/__init__.py index 7b166334a09..355d072c739 100644 --- a/torch/nn/intrinsic/qat/modules/__init__.py +++ b/torch/nn/intrinsic/qat/modules/__init__.py @@ -1,4 +1,5 @@ from .linear_relu import LinearReLU +from .linear_fused import LinearBn1d from .conv_fused import ( ConvBn1d, ConvBn2d, @@ -14,6 +15,7 @@ from .conv_fused import ( __all__ = [ "LinearReLU", + "LinearBn1d", "ConvReLU2d", "ConvReLU3d", "ConvBn1d", diff --git a/torch/nn/intrinsic/qat/modules/linear_fused.py b/torch/nn/intrinsic/qat/modules/linear_fused.py new file mode 100644 index 00000000000..f79b466507b --- /dev/null +++ b/torch/nn/intrinsic/qat/modules/linear_fused.py @@ -0,0 +1,154 @@ +import torch +import torch.nn as nn +import torch.nn.intrinsic as nni +import torch.nn.functional as F +from torch.nn import init +from torch.nn.parameter import Parameter + + +class LinearBn1d(nn.modules.linear.Linear, nni._FusedModule): + r""" + A LinearBn1d module is a module fused from Linear and BatchNorm1d, attached + with FakeQuantize modules for weight, used in quantization aware training. + + We combined the interface of :class:`torch.nn.Linear` and + :class:torch.nn.BatchNorm1d`. + + Similar to :class:`torch.nn.Linear`, with FakeQuantize modules initialized + to default. + + Attributes: + freeze_bn: + weight_fake_quant: fake quant module for weight + + """ + def __init__(self, + # Linear args + in_features, out_features, bias=True, + # BatchNorm1d args + # num_features: out_features + eps=1e-05, momentum=0.1, + # affine: True + # track_running_stats: True + # Args for this module + freeze_bn=False, + qconfig=None): + nn.modules.linear.Linear.__init__(self, in_features, out_features, bias) + assert qconfig, 'qconfig must be provded for QAT module' + self.qconfig = qconfig + self.freeze_bn = freeze_bn if self.training else True + self.bn = nn.BatchNorm1d(out_features, eps, momentum, True, True) + self.weight_fake_quant = self.qconfig.weight() + if bias: + self.bias = Parameter(torch.empty(out_features)) + else: + self.register_parameter('bias', None) + self.reset_bn_parameters() + + # this needs to be called after reset_bn_parameters, + # as they modify the same state + if self.training: + if freeze_bn: + self.freeze_bn_stats() + else: + self.update_bn_stats() + else: + self.freeze_bn_stats() + + def reset_running_stats(self): + self.bn.reset_running_stats() + + def reset_bn_parameters(self): + self.bn.reset_running_stats() + init.uniform_(self.bn.weight) + init.zeros_(self.bn.bias) + + def reset_parameters(self): + super(LinearBn1d, self).reset_parameters() + + def update_bn_stats(self): + self.freeze_bn = False + self.bn.training = True + return self + + def freeze_bn_stats(self): + self.freeze_bn = True + self.bn.training = False + return self + + def forward(self, input): + assert self.bn.running_var is not None + + # Scale the linear weights by BN's running statistics to reduce + # weight jitter, see https://arxiv.org/pdf/1806.08342.pdf, page 18 + # for motivation. 
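+        # Concretely, scale_factor = bn.weight / sqrt(bn.running_var + bn.eps),
+        # and F.linear(x, w * scale_factor) / scale_factor == F.linear(x, w)
+        # per output channel (up to fake-quant rounding), so the fake-quant
+        # observes the BN-folded weight while the pre-BN output is unchanged.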
+ # + # Instead of + # + # x1 = F.linear(x0, fq(w), b) + # x2 = self.bn(x1) + # + # We have + # + # # scale the weight by previous batch's running statistics + # scale_factor = bn.w / bn.running_std_from_prev_batch + # # do the linear transformation without bias + # x1_scaled = F.linear(x0, fq(w * scale_factor), 0) + # # reverse the scaling and add original bias + # x1_orig = x1_scaled / scale_factor + b + # x2 = self.bn(x1_orig) + + running_std = torch.sqrt(self.bn.running_var + self.bn.eps) + scale_factor = self.bn.weight / running_std + weight_shape = [1] * len(self.weight.shape) + weight_shape[0] = -1 + bias_shape = [1] * len(self.weight.shape) + bias_shape[1] = -1 + scaled_weight = self.weight_fake_quant(self.weight * scale_factor.reshape(weight_shape)) + if self.bias is not None: + zero_bias = torch.zeros_like(self.bias) + else: + zero_bias = torch.zeros(self.out_features, device=scaled_weight.device) + linear_out = F.linear(input, scaled_weight, zero_bias) + linear_out_orig = linear_out / scale_factor.reshape(bias_shape) + if self.bias is not None: + linear_out_orig = linear_out_orig + self.bias.reshape(bias_shape) + bn_out = self.bn(linear_out_orig) + return bn_out + + def train(self, mode=True): + """ + Batchnorm's training behavior is using the self.training flag. Prevent + changing it if BN is frozen. This makes sure that calling `model.train()` + on a model with a frozen BN will behave properly. + """ + self.training = mode + if not self.freeze_bn: + for module in self.children(): + module.train(mode) + return self + + @classmethod + def from_float(cls, mod): + r"""Create a qat module from a float module or qparams_dict + + Args: `mod' a float module, either produced by torch.ao.quantization + utilities or directly from user + """ + assert type(mod) == nni.LinearBn1d, 'qat.' + cls.__name__ + \ + '.from_float only works for ' + nni.LinearBn1d.__name__ + assert hasattr(mod, 'qconfig'), 'Input float module must have qconfig defined' + assert mod.qconfig, 'Input float module must have a valid config' + qconfig = mod.qconfig + linear, bn = mod[0], mod[1] + qat_linearbn = cls(linear.in_features, linear.out_features, linear.bias is not None, + bn.eps, bn.momentum, + False, qconfig) + qat_linearbn.weight = linear.weight + qat_linearbn.bias = linear.bias + qat_linearbn.bn.weight = bn.weight + qat_linearbn.bn.bias = bn.bias + qat_linearbn.bn.running_mean = bn.running_mean + qat_linearbn.bn.running_var = bn.running_var + qat_linearbn.bn.num_batches_tracked = bn.num_batches_tracked + return qat_linearbn From 1c0df26597ccd8bc83d65b38e5528ce1cdcd0c6e Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Fri, 18 Feb 2022 05:08:02 -0800 Subject: [PATCH 153/199] eager quant: convert mapping for fused QAT Linear-Bn1d (#72796) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72796 Adds the eager mode convert mappint for fused QAT Linear-Bn1d module. 
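A short sketch of the end-to-end eager-mode flow this mapping completes. It mirrors the `test_linear_bn_workflow` test added in this diff and uses only APIs that appear in this patch series; `"fbgemm"` is just one possible engine choice.

```
import torch
import torch.nn as nn

# Float model: QuantStub -> Linear -> BatchNorm1d.
m = nn.Sequential(
    torch.ao.quantization.QuantStub(),
    nn.Linear(4, 4),
    nn.BatchNorm1d(4),
)
m.qconfig = torch.ao.quantization.get_default_qat_qconfig("fbgemm")

# Fuse Linear+BN into nni.LinearBn1d, then swap in the QAT module nniqat.LinearBn1d.
m = torch.ao.quantization.fuse_modules_qat(m, [["1", "2"]])
mp = torch.ao.quantization.prepare_qat(m)
mp(torch.randn(4, 4))  # run data through to update observers and BN statistics

# Convert folds BN into the linear weights and produces a plain quantized Linear.
mq = torch.ao.quantization.convert(mp)
assert type(mq[1]) == torch.nn.quantized.Linear
assert type(mq[2]) == nn.Identity
```

The key piece is the new `nniqat.LinearBn1d: nnq.Linear` entry in the static quant module mappings, which lets `convert` route the fused QAT module through `nnq.Linear.from_float`, where the batchnorm parameters are folded via `fuse_linear_bn_weights`.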
Test Plan: ``` python test/test_quantization.py TestQuantizeEagerQATNumerics.test_linear_bn_workflow ``` Imported from OSS Reviewed By: jerryzh168 Differential Revision: D34213150 fbshipit-source-id: c08b5eb843dea673fd07c6b7b93dcd3ba03eaec2 (cherry picked from commit 722edfe6763940340c3d0f1f282f1a41b85be29f) --- .../eager/test_quantize_eager_qat.py | 16 ++++++++++++++++ torch/ao/ns/fx/mappings.py | 2 ++ torch/ao/quantization/quantization_mappings.py | 1 + torch/nn/quantized/modules/linear.py | 7 ++++++- 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/test/quantization/eager/test_quantize_eager_qat.py b/test/quantization/eager/test_quantize_eager_qat.py index 478c424a03e..02a3175f4c8 100644 --- a/test/quantization/eager/test_quantize_eager_qat.py +++ b/test/quantization/eager/test_quantize_eager_qat.py @@ -1006,6 +1006,22 @@ class TestQuantizeEagerQATNumerics(QuantizationTestCase): r2 = m(data) self.assertTrue(torch.allclose(r1, r2)) + @override_qengines + def test_linear_bn_workflow(self): + qengine = torch.backends.quantized.engine + m = nn.Sequential( + QuantStub(), + nn.Linear(4, 4), + nn.BatchNorm1d(4), + ) + data = torch.randn(4, 4) + m.qconfig = torch.ao.quantization.get_default_qat_qconfig(qengine) + m = torch.ao.quantization.fuse_modules_qat(m, [['1', '2']]) + mp = prepare_qat(m) + mp(data) + mq = convert(mp) + self.assertTrue(type(mq[1]) == nnq.Linear) + self.assertTrue(type(mq[2]) == nn.Identity) if __name__ == '__main__': raise RuntimeError("This test file is not meant to be run directly, use:\n\n" diff --git a/torch/ao/ns/fx/mappings.py b/torch/ao/ns/fx/mappings.py index d27c5d165ad..c31261913ad 100644 --- a/torch/ao/ns/fx/mappings.py +++ b/torch/ao/ns/fx/mappings.py @@ -80,6 +80,7 @@ def get_base_name_to_sets_of_related_ops() -> Dict[str, Set[NSNodeTargetType]]: nnqatd.Linear, nnqd.Linear, nniqat.LinearReLU, + nniqat.LinearBn1d, nn.modules.linear.NonDynamicallyQuantizableLinear, ]), # linear functionals @@ -572,6 +573,7 @@ def get_node_type_to_io_type_map() -> Dict[str, Set[NSNodeTargetType]]: nniqat.ConvReLU2d, nniqat.ConvReLU3d, nniqat.LinearReLU, + nniqat.LinearBn1d, nniqd.LinearReLU, ]) diff --git a/torch/ao/quantization/quantization_mappings.py b/torch/ao/quantization/quantization_mappings.py index c299f375720..fcda0a4fa6c 100644 --- a/torch/ao/quantization/quantization_mappings.py +++ b/torch/ao/quantization/quantization_mappings.py @@ -77,6 +77,7 @@ DEFAULT_STATIC_QUANT_MODULE_MAPPINGS : Dict[Callable, Any] = { nniqat.ConvReLU2d: nniq.ConvReLU2d, nniqat.ConvReLU3d: nniq.ConvReLU3d, nniqat.LinearReLU: nniq.LinearReLU, + nniqat.LinearBn1d: nnq.Linear, # QAT modules: nnqat.Linear: nnq.Linear, nnqat.Conv2d: nnq.Conv2d, diff --git a/torch/nn/quantized/modules/linear.py b/torch/nn/quantized/modules/linear.py index d343ed1b00d..979f2ad9c97 100644 --- a/torch/nn/quantized/modules/linear.py +++ b/torch/nn/quantized/modules/linear.py @@ -3,7 +3,9 @@ import torch import torch.nn as nn import torch.nn.intrinsic as nni +import torch.nn.intrinsic.qat as nniqat from torch.nn.quantized.modules.utils import _quantize_weight, hide_packed_params_repr, ReferenceableQuantizedModule +from torch.nn.utils.fusion import fuse_linear_bn_weights from typing import Optional class LinearPackedParams(torch.nn.Module): @@ -239,7 +241,10 @@ class Linear(ReferenceableQuantizedModule): utilities or provided by the user """ if hasattr(mod, 'weight_fake_quant'): - # assert type(mod) == QATLinear, 'training mode nnq.Linear.from_float only works for nn.qat.Linear' + if type(mod) == 
nniqat.LinearBn1d: + mod.weight, mod.bias = fuse_linear_bn_weights( + mod.weight, mod.bias, mod.bn.running_mean, mod.bn.running_var, + mod.bn.eps, mod.bn.weight, mod.bn.bias) weight_post_process = mod.weight_fake_quant activation_post_process = mod.activation_post_process else: From f5e201e4e9fe946cd124ddcb4ed27b1b5b14386d Mon Sep 17 00:00:00 2001 From: Kevin Tse Date: Fri, 18 Feb 2022 07:08:03 -0800 Subject: [PATCH 154/199] [DataPipe] Adding usage examples for IterDataPipes (#73033) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73033 Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D34313793 Pulled By: NivekT fbshipit-source-id: 51125be2f79d73d02658b2b1c2691f96be8d4769 (cherry picked from commit 3e3c2df7c6a9f6cb51f2343254063487a091729a) --- torch/utils/data/datapipes/iter/callable.py | 15 +++++- .../data/datapipes/iter/combinatorics.py | 7 +++ torch/utils/data/datapipes/iter/combining.py | 48 +++++++++++++++++ torch/utils/data/datapipes/iter/filelister.py | 6 +++ torch/utils/data/datapipes/iter/fileopener.py | 8 +++ torch/utils/data/datapipes/iter/grouping.py | 53 +++++++++++++++++-- torch/utils/data/datapipes/iter/selecting.py | 9 ++++ .../utils/data/datapipes/iter/streamreader.py | 7 +++ torch/utils/data/datapipes/iter/utils.py | 10 ++-- torch/utils/data/dataset.py | 26 +++++---- 10 files changed, 170 insertions(+), 19 deletions(-) diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py index 200f0c35f06..ef7875fe569 100644 --- a/torch/utils/data/datapipes/iter/callable.py +++ b/torch/utils/data/datapipes/iter/callable.py @@ -33,6 +33,20 @@ class MapperIterDataPipe(IterDataPipe[T_co]): multiple indices, the left-most one is used, and other indices will be removed. - Integer is used for list/tuple. ``-1`` represents to append result at the end. - Key is used for dict. New key is acceptable. + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper, Mapper + >>> def add_one(x): + ... return x + 1 + >>> dp = IterableWrapper(range(10)) + >>> map_dp_1 = dp.map(add_one) # Invocation via functional form is preferred + >>> list(map_dp_1) + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + >>> # We discourage the usage of `lambda` functions as they are not serializable with `pickle` + >>> # Use `functools.partial` or explicitly define the function instead + >>> map_dp_2 = Mapper(dp, lambda x: x + 1) + >>> list(map_dp_2) + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] """ datapipe: IterDataPipe fn: Callable @@ -166,7 +180,6 @@ class CollatorIterDataPipe(MapperIterDataPipe): >>> ds = MyIterDataPipe(start=3, end=7) >>> print(list(ds)) [3, 4, 5, 6] - >>> def collate_fn(batch): ... return torch.tensor(batch, dtype=torch.float) ... 
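The examples added across these files compose; a small sketch chaining the functional forms documented in this patch, with `IterableWrapper` imported from torchdata exactly as in the docstrings above:

```
from torchdata.datapipes.iter import IterableWrapper

def add_one(x):
    return x + 1

def is_even(n):
    return n % 2 == 0

# map -> filter -> batch, each invoked via its registered functional name.
dp = IterableWrapper(range(10)).map(add_one).filter(is_even).batch(batch_size=2)
print(list(dp))  # [[2, 4], [6, 8], [10]]
```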
diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py index e08ad75e183..fb5d8a6e57a 100644 --- a/torch/utils/data/datapipes/iter/combinatorics.py +++ b/torch/utils/data/datapipes/iter/combinatorics.py @@ -67,6 +67,13 @@ class ShufflerIterDataPipe(IterDataPipe[T_co]): buffer_size: The buffer size for shuffling (default to ``10000``) unbatch_level: Specifies if it is necessary to unbatch source data before applying the shuffle + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper + >>> dp = IterableWrapper(range(10)) + >>> shuffle_dp = dp.shuffle() + [0, 4, 1, 6, 3, 2, 9, 5, 7, 8] + >>> list(shuffle_dp) """ datapipe: IterDataPipe[T_co] buffer_size: int diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py index 01cbc8cefad..c2c6dd4bbd4 100644 --- a/torch/utils/data/datapipes/iter/combining.py +++ b/torch/utils/data/datapipes/iter/combining.py @@ -21,6 +21,14 @@ class ConcaterIterDataPipe(IterDataPipe): Args: datapipes: Iterable DataPipes being concatenated + + Example: + >>> import random + >>> from torchdata.datapipes.iter import IterableWrapper + >>> dp1 = IterableWrapper(range(3)) + >>> dp2 = IterableWrapper(range(5)) + >>> list(dp1.concat(dp2)) + [0, 1, 2, 0, 1, 2, 3, 4] """ datapipes: Tuple[IterDataPipe] length: Optional[int] @@ -61,6 +69,15 @@ class ForkerIterDataPipe(IterDataPipe): buffer_size: this restricts how far ahead the leading child DataPipe can read relative to the slowest child DataPipe. Defaults to ``1000``. Use ``-1`` for the unlimited buffer. + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper + >>> source_dp = IterableWrapper(range(5)) + >>> dp1, dp2 = source_dp.fork(num_instances=2) + >>> list(dp1) + [0, 1, 2, 3, 4] + >>> list(dp2) + [0, 1, 2, 3, 4] """ def __new__(cls, datapipe: IterDataPipe, num_instances: int, buffer_size: int = 1000): if num_instances < 1: @@ -187,6 +204,25 @@ class DemultiplexerIterDataPipe(IterDataPipe): buffer_size: this defines the maximum number of inputs that the buffer can hold across all child DataPipes while waiting for their values to be yielded. Defaults to ``1000``. Use ``-1`` for the unlimited buffer. + + Examples: + >>> from torchdata.datapipes.iter import IterableWrapper + >>> def odd_or_even(n): + ... return n % 2 + >>> source_dp = IterableWrapper(range(5)) + >>> dp1, dp2 = source_dp.demux(num_instances=2, classifier_fn=odd_or_even) + >>> list(dp1) + [0, 2, 4] + >>> list(dp2) + [1, 3] + >>> # It can also filter out any element that gets `None` from the `classifier_fn` + >>> def odd_or_even_no_zero(n): + ... 
return n % 2 if n != 0 else None + >>> dp1, dp2 = source_dp.demux(num_instances=2, classifier_fn=odd_or_even_no_zero, drop_none=True) + >>> list(dp1) + [2, 4] + >>> list(dp2) + [1, 3] """ def __new__(cls, datapipe: IterDataPipe, num_instances: int, classifier_fn: Callable[[T_co], Optional[int]], drop_none: bool = False, buffer_size: int = 1000): @@ -326,6 +362,12 @@ class MultiplexerIterDataPipe(IterDataPipe): Args: datapipes: Iterable DataPipes that will take turn to yield their elements, until they are all exhausted + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper + >>> dp1, dp2, dp3 = IterableWrapper(range(5)), IterableWrapper(range(10, 15)), IterableWrapper(range(20, 25)) + >>> list(dp1.mux(dp2, dp3)) + [0, 10, 20, 1, 11, 21, 2, 12, 22, 3, 13, 23, 4, 14, 24] """ def __init__(self, *datapipes): self.datapipes = datapipes @@ -363,6 +405,12 @@ class ZipperIterDataPipe(IterDataPipe[Tuple[T_co]]): Args: *datapipes: Iterable DataPipes being aggregated + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper + >>> dp1, dp2, dp3 = IterableWrapper(range(5)), IterableWrapper(range(10, 15)), IterableWrapper(range(20, 25)) + >>> list(dp1.zip(dp2, dp3)) + [(0, 10, 20), (1, 11, 21), (2, 12, 22), (3, 13, 23), (4, 14, 24)] """ datapipes: Tuple[IterDataPipe] length: Optional[int] diff --git a/torch/utils/data/datapipes/iter/filelister.py b/torch/utils/data/datapipes/iter/filelister.py index 4de205e55f0..e75cc588180 100644 --- a/torch/utils/data/datapipes/iter/filelister.py +++ b/torch/utils/data/datapipes/iter/filelister.py @@ -17,6 +17,12 @@ class FileListerIterDataPipe(IterDataPipe[str]): non_deterministic: Whether to return pathname in sorted order or not. If ``False``, the results yielded from each root directory will be sorted length: Nominal length of the datapipe + + Example: + >>> from torchdata.datapipes.iter import FileLister + >>> dp = FileLister(root=".", recursive=True) + >>> list(dp) + ['example.py', './data/data.tar'] """ def __init__( diff --git a/torch/utils/data/datapipes/iter/fileopener.py b/torch/utils/data/datapipes/iter/fileopener.py index 6c10016dc75..1e92a5cc277 100644 --- a/torch/utils/data/datapipes/iter/fileopener.py +++ b/torch/utils/data/datapipes/iter/fileopener.py @@ -22,6 +22,14 @@ class FileOpenerIterDataPipe(IterDataPipe[Tuple[str, IOBase]]): Note: The opened file handles will be closed by Python's GC periodically. Users can choose to close them explicitly. 
+ + Example: + >>> from torchdata.datapipes.iter import FileLister, FileOpener, StreamReader + >>> dp = FileLister(root=".").filter(lambda fname: fname.endswith('.txt')) + >>> dp = FileOpener(dp) + >>> dp = StreamReader(dp) + >>> list(dp) + [('./abc.txt', 'abc')] """ def __init__( diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py index 7e92a49cae4..2e143e0c595 100644 --- a/torch/utils/data/datapipes/iter/grouping.py +++ b/torch/utils/data/datapipes/iter/grouping.py @@ -58,6 +58,13 @@ class BatcherIterDataPipe(IterDataPipe[DataChunk]): drop_last: Option to drop the last batch if it's not full wrapper_class: wrapper to apply onto each batch (type ``List``) before yielding, defaults to ``DataChunk`` + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper + >>> dp = IterableWrapper(range(10)) + >>> dp = dp.batch(batch_size=3, drop_last=True) + >>> list(dp) + [[0, 1, 2], [3, 4, 5], [6, 7, 8]] """ datapipe: IterDataPipe batch_size: int @@ -111,6 +118,16 @@ class UnBatcherIterDataPipe(IterDataPipe): datapipe: Iterable DataPipe being un-batched unbatch_level: Defaults to ``1`` (only flattening the top level). If set to ``2``, it will flatten the top two levels, and ``-1`` will flatten the entire DataPipe. + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper + >>> source_dp = IterableWrapper([[[0, 1], [2]], [[3, 4], [5]], [[6]]]) + >>> dp1 = source_dp.unbatch() + >>> list(dp1) + [[0, 1], [2], [3, 4], [5], [6]] + >>> dp2 = source_dp.unbatch(unbatch_level=2) + >>> list(dp2) + [0, 1, 2, 3, 4, 5, 6] """ def __init__(self, @@ -149,16 +166,42 @@ class UnBatcherIterDataPipe(IterDataPipe): class GrouperIterDataPipe(IterDataPipe[DataChunk]): r""" Groups data from input IterDataPipe by keys which are generated from ``group_key_fn``, - and yields a ``DataChunk`` with size ranging from ``guaranteed_group_size`` - to ``group_size`` (functional name: ``groupby``). + and yields a ``DataChunk`` with batch size up to ``group_size`` if defined (functional name: ``groupby``). + + The samples are read sequentially from the source ``datapipe``, and a batch of samples belonging to the same group + will be yielded as soon as the size of the batch reaches ``group_size``. When the buffer is full, + the DataPipe will yield the largest batch with the same key, provided that its size is larger + than ``guaranteed_group_size``. If its size is smaller, it will be dropped if ``drop_remaining=True``. + + After iterating through the entirety of source ``datapipe``, everything not dropped due to the buffer capacity + will be yielded from the buffer, even if the group sizes are smaller than ``guaranteed_group_size``. Args: datapipe: Iterable datapipe to be grouped group_key_fn: Function used to generate group key from the data of the source datapipe buffer_size: The size of buffer for ungrouped data - group_size: The size of each group - guaranteed_group_size: The guaranteed minimum group size - drop_remaining: Specifies if the group smaller than `guaranteed_group_size` will be dropped from buffer + group_size: The max size of each group, a batch is yielded as soon as it reaches this size + guaranteed_group_size: The guaranteed minimum group size to be yielded in case the buffer is full + drop_remaining: Specifies if the group smaller than ``guaranteed_group_size`` will be dropped from buffer + when the buffer is full + + Example: + >>> import os + >>> from torchdata.datapipes.iter import IterableWrapper + >>> def group_fn(file): + ... 
return os.path.basename(file).split(".")[0] + >>> source_dp = IterableWrapper(["a.png", "b.png", "a.json", "b.json", "a.jpg", "c.json"]) + >>> dp0 = source_dp.groupby(group_key_fn=group_fn) + >>> list(dp0) + [['a.png', 'a.json', 'a.jpg'], ['b.png', 'b.json'], ['c.json']] + >>> # A group is yielded as soon as its size equals to `group_size` + >>> dp1 = source_dp.groupby(group_key_fn=group_fn, group_size=2) + >>> list(dp1) + [['a.png', 'a.json'], ['b.png', 'b.json'], ['a.jpg'], ['c.json']] + >>> # Scenario where `buffer` is full, and group 'a' needs to be yielded since its size > `guaranteed_group_size` + >>> dp2 = source_dp.groupby(group_key_fn=group_fn, buffer_size=3, group_size=3, guaranteed_group_size=2) + >>> list(dp2) + [['a.png', 'a.json'], ['b.png', 'b.json'], ['a.jpg'], ['c.json']] """ def __init__(self, datapipe: IterDataPipe[T_co], diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py index 818a27ebcdf..074b8d32320 100644 --- a/torch/utils/data/datapipes/iter/selecting.py +++ b/torch/utils/data/datapipes/iter/selecting.py @@ -28,6 +28,15 @@ class FilterIterDataPipe(IterDataPipe[T_co]): datapipe: Iterable DataPipe being filtered filter_fn: Customized function mapping an element to a boolean. drop_empty_batches: By default, drops a batch if it is empty after filtering instead of keeping an empty list + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper + >>> def is_even(n): + ... return n % 2 == 0 + >>> dp = IterableWrapper(range(5)) + >>> filter_dp = dp.filter(filter_fn=is_even) + >>> list(filter_dp) + [0, 2, 4] """ datapipe: IterDataPipe filter_fn: Callable diff --git a/torch/utils/data/datapipes/iter/streamreader.py b/torch/utils/data/datapipes/iter/streamreader.py index 3a731f1e9a0..1f1a536c611 100644 --- a/torch/utils/data/datapipes/iter/streamreader.py +++ b/torch/utils/data/datapipes/iter/streamreader.py @@ -10,6 +10,13 @@ class StreamReaderIterDataPipe(IterDataPipe[Tuple[str, bytes]]): datapipe: Iterable DataPipe provides label/URL and byte stream chunk: Number of bytes to be read from stream per iteration. If ``None``, all bytes will be read util the EOF. + + Example: + >>> from torchdata.datapipes.iter import IterableWrapper, StreamReader + >>> from io import StringIO + >>> dp = IterableWrapper([("alphabet", StringIO("abcde"))]) + >>> list(StreamReader(dp, chunk=1)) + [('alphabet', 'a'), ('alphabet', 'b'), ('alphabet', 'c'), ('alphabet', 'd'), ('alphabet', 'e')] """ def __init__(self, datapipe, chunk=None): self.datapipe = datapipe diff --git a/torch/utils/data/datapipes/iter/utils.py b/torch/utils/data/datapipes/iter/utils.py index e3c57ccaae5..a5cf16aa573 100644 --- a/torch/utils/data/datapipes/iter/utils.py +++ b/torch/utils/data/datapipes/iter/utils.py @@ -13,9 +13,13 @@ class IterableWrapperIterDataPipe(IterDataPipe): iterator. The copy is made when the first element is read in ``iter()``. .. note:: - If ``deepcopy`` is explicitly set to ``False``, users should ensure - that the data pipeline doesn't contain any in-place operations over - the iterable instance to prevent data inconsistency across iterations. + If ``deepcopy`` is explicitly set to ``False``, users should ensure + that the data pipeline doesn't contain any in-place operations over + the iterable instance to prevent data inconsistency across iterations. 
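A minimal sketch of that caveat, assuming torchdata's ``IterableWrapper`` and an illustrative in-place ``map`` function: with ``deepcopy=False`` the pipeline yields the wrapped objects themselves, so an in-place mutation changes the source and a second pass over the pipeline sees different data.

    >>> from torchdata.datapipes.iter import IterableWrapper
    >>> source = [[0], [1]]
    >>> def bump(x):
    ...     x[0] += 1  # in-place mutation of the yielded element
    ...     return x
    >>> dp = IterableWrapper(source, deepcopy=False).map(bump)
    >>> list(dp)
    [[1], [2]]
    >>> list(dp)  # the second pass starts from the mutated source
    [[2], [3]]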
+
+    Example:
+        >>> from torchdata.datapipes.iter import IterableWrapper
+        >>> dp = IterableWrapper(range(10))
     """
     def __init__(self, iterable, deepcopy=True):
         self.iterable = iterable
diff --git a/torch/utils/data/dataset.py b/torch/utils/data/dataset.py
index 5411e7eacc7..fa68373fd96 100644
--- a/torch/utils/data/dataset.py
+++ b/torch/utils/data/dataset.py
@@ -87,7 +87,7 @@ class MapDataPipe(Dataset[T_co], metaclass=_DataPipeMeta):
     of :class:`~torch.utils.data.DataLoader`.

     These DataPipes can be invoked in two ways, using the class constructor or applying their
-    functional form onto an existing `MapDataPipe` (available to most but not all DataPipes).
+    functional form onto an existing `MapDataPipe` (recommended, available to most but not all DataPipes).

     Note:
         :class:`~torch.utils.data.DataLoader` by default constructs an index
@@ -97,12 +97,15 @@ class MapDataPipe(Dataset[T_co], metaclass=_DataPipeMeta):
     Example:
         >>> from torchdata.datapipes.map import SequenceWrapper, Mapper
         >>> dp = SequenceWrapper(range(10))
-        >>> map_dp_1 = dp.map(lambda x: x + 1)  # Using functional form
-        >>> list(map_dp_1)  # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        >>> map_dp_1 = dp.map(lambda x: x + 1)  # Using functional form (recommended)
+        >>> list(map_dp_1)
+        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
         >>> map_dp_2 = Mapper(dp, lambda x: x + 1)  # Using class constructor
-        >>> list(map_dp_2)  # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        >>> list(map_dp_2)
+        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
         >>> batch_dp = map_dp_1.batch(batch_size=2)
-        >>> list(batch_dp)  # [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
+        >>> list(batch_dp)
+        [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]
     """
     functions: Dict[str, Callable] = {}

@@ -257,7 +260,7 @@ class IterDataPipe(IterableDataset[T_co], metaclass=_DataPipeMeta):
     on its iterator.

     These DataPipes can be invoked in two ways, using the class constructor or applying their
-    functional form onto an existing `IterDataPipe` (available to most but not all DataPipes).
+    functional form onto an existing `IterDataPipe` (recommended, available to most but not all DataPipes).

     You can chain multiple `IterDataPipe` together to form a pipeline that will perform multiple
     operations in succession.
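For instance, a minimal sketch of such a pipeline (assuming torchdata's ``IterableWrapper``; the specific operations chained here are incidental):

    >>> from torchdata.datapipes.iter import IterableWrapper
    >>> pipeline = IterableWrapper(range(10)).filter(lambda x: x % 2 == 0).map(lambda x: x * 10).batch(batch_size=2)
    >>> list(pipeline)
    [[0, 20], [40, 60], [80]]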
@@ -276,11 +279,14 @@ class IterDataPipe(IterableDataset[T_co], metaclass=_DataPipeMeta): >>> from torchdata.datapipes.iter import IterableWrapper, Mapper >>> dp = IterableWrapper(range(10)) >>> map_dp_1 = Mapper(dp, lambda x: x + 1) # Using class constructor - >>> map_dp_2 = dp.map(lambda x: x + 1) # Using functional form - >>> list(map_dp_1) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] - >>> list(map_dp_2) # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + >>> map_dp_2 = dp.map(lambda x: x + 1) # Using functional form (recommended) + >>> list(map_dp_1) + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + >>> list(map_dp_2) + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] >>> filter_dp = map_dp_1.filter(lambda x: x % 2 == 0) - >>> list(filter_dp) # [2, 4, 6, 8, 10] + >>> list(filter_dp) + [2, 4, 6, 8, 10] """ functions: Dict[str, Callable] = {} reduce_ex_hook : Optional[Callable] = None From 86a961af8790978e2be78bf4613fe9dc38382528 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Fri, 18 Feb 2022 07:41:57 -0800 Subject: [PATCH 155/199] Revert D34250357: Sync lazy_tensor_staging back to master Test Plan: revert-hammer Differential Revision: D34250357 (https://github.com/pytorch/pytorch/commit/69389fb5423832272a36d3ef4bd2a39d10489507) Original commit changeset: aa7d589f6050 Original Phabricator Diff: D34250357 (https://github.com/pytorch/pytorch/commit/69389fb5423832272a36d3ef4bd2a39d10489507) fbshipit-source-id: 43f6da6986f7fc5189d641b7803adc5ada27194c (cherry picked from commit 3c930a5e4e909e1f78f574757f292ba9d608d094) --- test/cpp/lazy/test_cache.cpp | 2 +- test/cpp/lazy/test_ir.cpp | 2 +- test/cpp/lazy/test_ir_util.cpp | 2 +- tools/codegen/api/lazy.py | 53 ++----- tools/codegen/dest/lazy_ir.py | 147 ++++++------------- tools/codegen/dest/lazy_ts_lowering.py | 8 +- tools/codegen/gen_lazy_tensor.py | 22 ++- torch/csrc/lazy/core/config.cpp | 5 - torch/csrc/lazy/core/config.h | 1 - torch/csrc/lazy/core/ir.cpp | 27 +--- torch/csrc/lazy/core/ir.h | 41 +----- torch/csrc/lazy/core/lazy_graph_executor.cpp | 10 +- torch/csrc/lazy/core/shape.cpp | 8 +- torch/csrc/lazy/core/shape.h | 2 +- torch/csrc/lazy/ts_backend/ts_node.cpp | 18 +-- torch/csrc/lazy/ts_backend/ts_node.h | 2 +- 16 files changed, 102 insertions(+), 248 deletions(-) diff --git a/test/cpp/lazy/test_cache.cpp b/test/cpp/lazy/test_cache.cpp index a6da9bccbd2..033b6c21b1e 100644 --- a/test/cpp/lazy/test_cache.cpp +++ b/test/cpp/lazy/test_cache.cpp @@ -11,7 +11,7 @@ namespace lazy { class CacheNode : public Node { public: explicit CacheNode(const std::string& str) - : Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool /*bakeInSizes*/) -> hash_t { return Hash(str); }), + : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(str)), str_(str) {} ~CacheNode() override = default; diff --git a/test/cpp/lazy/test_ir.cpp b/test/cpp/lazy/test_ir.cpp index 326f7a9092c..78b94618c7f 100644 --- a/test/cpp/lazy/test_ir.cpp +++ b/test/cpp/lazy/test_ir.cpp @@ -12,7 +12,7 @@ namespace lazy { class TestLeafNode : public Node { public: explicit TestLeafNode(size_t param) - : Node(OpKind(), /* num_outputs */ 1, /* hash_func */[&](bool /*bakeInSizes*/) -> hash_t { return Hash(param); }), + : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(param)), param_(param) {} ~TestLeafNode() override = default; diff --git a/test/cpp/lazy/test_ir_util.cpp b/test/cpp/lazy/test_ir_util.cpp index bb29cff6f6b..5c216258f9a 100644 --- a/test/cpp/lazy/test_ir_util.cpp +++ b/test/cpp/lazy/test_ir_util.cpp @@ -12,7 +12,7 @@ namespace lazy { class IrUtilNode : public Node { public: explicit IrUtilNode() - : 
Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool /*bakeInSizes*/) -> hash_t { return Hash(0); }) {} + : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(0)) {} ~IrUtilNode() override = default; void AddOperand(Value v) { diff --git a/tools/codegen/api/lazy.py b/tools/codegen/api/lazy.py index ebbc72eb1fc..3fe83936eef 100644 --- a/tools/codegen/api/lazy.py +++ b/tools/codegen/api/lazy.py @@ -1,11 +1,12 @@ from typing import List, Union, Tuple from tools.codegen.model import (Type, BaseTy, BaseType, OptionalType, ListType, OperatorName, FunctionSchema, - Return, TensorOptionsArguments) -from tools.codegen.api.types import (CType, BaseCppType, BaseCType, OptionalCType, - NamedCType, deviceT, layoutT, + Return) +from tools.codegen.api.types import (BaseCppType, BaseCType, OptionalCType, + ConstRefCType, NamedCType, + MutRefCType, VectorCType, boolT, longT, doubleT, ListCType, stringT, - scalarT, scalarTypeT) + scalarT, scalarTypeT, ArrayRefCType, ArrayCType, TupleCType) valueT = BaseCppType('torch::lazy', 'Value') @@ -32,9 +33,7 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L if typ.name == BaseTy.Tensor: return BaseCType(valueT) elif typ.name == BaseTy.Scalar: - # at::scalar has special handling, - # and is wrapped in an IR value just like at::tensor - return BaseCType(valueT) + return BaseCType(scalarT) elif typ.name == BaseTy.ScalarType: return BaseCType(scalarTypeT) elif typ.name == BaseTy.int: @@ -45,10 +44,6 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L return BaseCType(doubleT) elif typ.name == BaseTy.str: return BaseCType(stringT) - elif typ.name == BaseTy.Device: - return BaseCType(deviceT) - elif typ.name == BaseTy.Layout: - return BaseCType(layoutT) else: raise AssertionError(f"TODO add support for type {repr(typ)}") elif isinstance(typ, OptionalType): @@ -63,36 +58,19 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L raise AssertionError(f"unrecognized type {repr(typ)}") -def isValueType(typ: CType) -> bool: +def isValueType(typ: Union[Type, BaseCType, OptionalCType, ConstRefCType, MutRefCType, + ListCType, ArrayRefCType, ArrayCType, VectorCType, TupleCType]) -> bool: """ Given a type, determine if it is a Value-like type. This is equivalent to being Tensor-like, but assumes the type has already been transformed. """ if isinstance(typ, BaseCType): - # I am regretting my naming conventions, but now we are wrapping at::scalar in - # lazy value, while preserving other 'scalar' types as scalars in the IR - return typ.type == valueT or typ.type == scalarT + return typ.type == valueT elif isinstance(typ, (OptionalCType, ListCType, VectorCType)): return isValueType(typ.elem) else: return False -def isWrappedScalarType(typ: Type) -> bool: - """ - Given a type, determine if it is a c10::scalar which we will wrap in a lazy Value. - Since we literally change the type from scalarT to valueT, information is lost. - This function helps build a list of wrapped scalars to save that information - """ - if isinstance(typ, BaseType): - # I am regretting my naming conventions, but now we are wrapping at::scalar in - # lazy value, while preserving other 'scalar' types as scalars in the IR - return typ.name == BaseTy.Scalar - elif isinstance(typ, (OptionalType, ListType)): - return isWrappedScalarType(typ.elem) - else: - return False - - # Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node. 
# Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML), # but carries type information from a native FunctionSchema modified for use with IR nodes, @@ -109,8 +87,6 @@ class LazyIrSchema: # TODO: Need to handle collisions with argument names at some point returns: Tuple['Return', ...] - wrapped_scalar_names: List[str] - def __init__(self, func: FunctionSchema): positional_arg_types = [] @@ -132,15 +108,14 @@ class LazyIrSchema: "tensor_options", "post_tensor_options_kwarg_only", "out"]: - curr_args = getattr(func.arguments, arg_field) - if curr_args is not None: - if isinstance(curr_args, TensorOptionsArguments): - curr_args = curr_args.all() - keyword_arg_types.extend([NamedCType(arg.name, process_ir_type(arg.type)) for arg in curr_args]) + if getattr(func.arguments, arg_field) is not None: + keyword_arg_types.extend([ + NamedCType( + arg.name, + process_ir_type(arg.type)) for arg in getattr(func.arguments, arg_field)]) self.keyword_arg_types = tuple(keyword_arg_types) self.name = func.name self.returns = func.returns - self.wrapped_scalar_names = [arg.name for arg in func.schema_order_arguments() if isWrappedScalarType(arg.type)] @property def node_name(self) -> str: diff --git a/tools/codegen/dest/lazy_ir.py b/tools/codegen/dest/lazy_ir.py index 58fc6862900..d41b4edcd8a 100644 --- a/tools/codegen/dest/lazy_ir.py +++ b/tools/codegen/dest/lazy_ir.py @@ -1,4 +1,3 @@ -from abc import ABC, abstractmethod from typing import List, Union from dataclasses import dataclass from tools.codegen.context import method_with_native_function @@ -10,23 +9,17 @@ import tools.codegen.api.dispatcher as dispatcher from tools.codegen.api.lazy import LazyIrSchema, isValueType from tools.codegen.dest.lazy_ts_lowering import ts_lowering_body -def node_ctor_arg_rvalue_string(arg: NamedCType, schema: LazyIrSchema) -> str: + +def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: """ Given a NamedCType from a lazy IR schema, generate a c++ string for materializing an rvalue of that arg for passing into a lazy Node constructor. """ - if isValueType(arg.type): if isinstance(arg.type, BaseCType): - if arg.name in schema.wrapped_scalar_names: - return f"torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen({arg.name})" return f"lazy_{arg.name}.GetIrValue()" elif isinstance(arg.type, OptionalCType): - if arg.name in schema.wrapped_scalar_names: - return f"{arg.name} ? " \ - f"c10::make_optional(torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen(*{arg.name})) : " \ - "c10::nullopt" return f"lazy_{arg.name} ? " \ f"c10::make_optional(lazy_{arg.name}.GetIrValue()) : " \ "c10::nullopt" @@ -42,55 +35,24 @@ def node_ctor_arg_rvalue_string(arg: NamedCType, schema: LazyIrSchema) -> str: else: return f"{arg.name}" -def node_ctor_inputs(schema: LazyIrSchema) -> str: +def node_ctor_inputs(func: LazyIrSchema) -> str: """ Produce a formatted string with the arguments as passed into the constructor of a node class. 
""" - node_ctor_values = [node_ctor_arg_rvalue_string(arg, schema) for arg in schema.filtered_types()] + node_ctor_values = [node_ctor_arg_rvalue_string(arg) for arg in func.filtered_types()] return ",\n ".join(node_ctor_values) -def gen_fallback_code(schema: LazyIrSchema, overload_name: str) -> str: - """ - Generate code that falls back to eager conditioned on a predicate - """ - fallback_args = ",\n ".join([str(arg.name) for arg in schema.filtered_types()]) - if len(overload_name): - aten_op_str = f"ATEN_OP2({schema.aten_name}, {overload_name})" - else: - aten_op_str = f"ATEN_OP({schema.aten_name})" - return f""" - if (force_eager_fallback({aten_symbol(schema)})) {{ - return at::native::call_fallback_fn<<c_eager_fallback, {aten_op_str}>::call( - {fallback_args} - ); - }} -""" - -def aten_symbol(schema: LazyIrSchema) -> str: - missing_interned_strings = { - 'sigmoid_backward', - } - if schema.aten_name in missing_interned_strings: - return f'c10::Symbol::fromQualString("aten::{schema.aten_name}")' - return f'at::aten::{schema.aten_name}' @dataclass(frozen=True) -class LazyIR(ABC): +class LazyIR: backend_index: BackendIndex node_base: str - lowering_function_type: str = "" - lowering_context_type: str = "" - lowering_return_type: str = "" @method_with_native_function def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func return self.gen(f) - @abstractmethod - def lowering_body(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> str: - pass - def gen(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: # for now, we just want one IR class decl and soon after also the method defs # and we use the functional version not out/inplace. @@ -101,9 +63,9 @@ class LazyIR(ABC): scalar_types = schema.filtered_types(values=False, scalars=True) node_ctor_args = ", ".join([f"const {i.cpp_type()}& {i.name}" for i in all_types]) - scalar_initializers = ",\n ".join([f"{t.name}({t.name})" for t in scalar_types]) + scalar_initializers = ",\n ".join([f"{t.name}_({t.name})" for t in scalar_types]) comma_if_scalar_initializers = ",\n" if len(scalar_initializers) else "" - scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name};" for t in scalar_types]) + scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name}_;" for t in scalar_types]) scalar_hashes = ", ".join([f"{f.name}" for f in scalar_types]) base_ctor_value_args_list = [] optional_values = [] @@ -121,20 +83,21 @@ class LazyIR(ABC): members_to_string = [] for t in scalar_types: if isinstance(t.type, OptionalCType): - members_to_string.append(f"""if ({t.name}.has_value()) {{ - ss << ", {t.name}=" << {t.name}.value(); + members_to_string.append(f"""if ({t.name}_.has_value()) {{ + ss << ", {t.name}=" << {t.name}_.value(); }} else {{ ss << ", {t.name}=null"; }}""") else: - members_to_string.append(f'ss << ", {t.name}=" << {t.name};') + members_to_string.append(f'ss << ", {t.name}=" << {t.name}_;') members_to_string_str = "\n ".join(members_to_string) return [f"""\ +// TODO(alanwaketan): Public members don't need to have _ suffix. 
class {schema.node_name} : public {self.node_base} {{ public: {schema.node_name}({node_ctor_args}, std::vector&& shapes) - : {self.node_base}(torch::lazy::OpKind({aten_symbol(schema)}), + : {self.node_base}(torch::lazy::OpKind(at::aten::{schema.aten_name}), {{{base_ctor_value_args}}}, std::move(shapes), /* num_outputs */ {len(func.returns)}, torch::lazy::MHash({scalar_hashes})){comma_if_scalar_initializers} @@ -146,14 +109,14 @@ class {schema.node_name} : public {self.node_base} {{ std::string ToString() const override {{ std::stringstream ss; - ss << {self.node_base}::ToString(); + ss << TsNode::ToString(); {members_to_string_str} return ss.str(); }} - {self.lowering_return_type} Lower({self.lowering_function_type} function, - {self.lowering_context_type} loctx) const override {{ - {self.lowering_body(f)} + torch::lazy::TSOpVector Lower(std::shared_ptr function, + torch::lazy::TSLoweringContext* loctx) const override {{ + {ts_lowering_body(f)} }} {scalar_decls} @@ -164,34 +127,21 @@ class {schema.node_name} : public {self.node_base} {{ """, ] -@dataclass(frozen=True) -class TSLazyIR(LazyIR): - lowering_function_type: str = "std::shared_ptr" - lowering_context_type: str = "torch::lazy::TSLoweringContext*" - lowering_return_type: str = "torch::lazy::TSOpVector" - - def lowering_body(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> str: - return ts_lowering_body(f) - - -def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str, schema: LazyIrSchema) -> str: +def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str) -> str: lazy_tensor_decls: List[str] = [] for t in value_types: - if t.name in schema.wrapped_scalar_names: - # no lazy tensor wrapper for scalars that are promoted to IR values - continue if isinstance(t.type, BaseCType): lazy_tensor_decls.append( f"{tensor_class} lazy_{t.name} = " - f"torch::lazy::GetLtcTensorOrCreateForWrappedNumber({t.name}, *common_device);") + f"GetLtcTensorOrCreateForWrappedNumber({t.name}, *device);") elif isinstance(t.type, OptionalCType): # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it # until we encounter a real world example. 
lazy_tensor_decls.append( - f" {tensor_class} lazy_{t.name} = torch::lazy::TryGetLtcTensor({t.name}.value_or(at::Tensor()));") + f" {tensor_class} lazy_{t.name} = TryGetLtcTensor({t.name}.value_or(at::Tensor()));") else: raise AssertionError("TODO not sure if there are other valid types to handle here") - return ("\n ").join(lazy_tensor_decls) + return "\n ".join(lazy_tensor_decls) @dataclass(frozen=True) class GenLazyNativeFuncDefinition: @@ -202,22 +152,17 @@ class GenLazyNativeFuncDefinition: @method_with_native_function def __call__(self, func: NativeFunction) -> List[str]: sig = kernel_signature(func, self.backend_index) - metadata = self.backend_index.get_kernel(func) - assert metadata is not None + + # Lazy IR stuff schema = LazyIrSchema(func.func) all_types = schema.filtered_types() value_types = schema.filtered_types(values=True, scalars=False) scalar_types = schema.filtered_types(values=False, scalars=True) returns_length = len(schema.returns) - fallback_str = gen_fallback_code(schema, overload_name=func.func.name.overload_name) - value_types_names = [f"{t.name}" for t in value_types if t.name not in schema.wrapped_scalar_names] - assert len(value_types_names) > 0, "Code below assumes there is at least one tensor arg" - get_device_str = f"""auto common_device = torch::lazy::GetBackendDevice({', '.join(value_types_names)}); - TORCH_INTERNAL_ASSERT(common_device); - """ - - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) + value_types_names = ", ".join([f"{t.name}" for t in value_types]) + get_device_str = f"""auto device = bridge::GetBackendDevice({value_types_names});""" + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) node_ctor_input_str = node_ctor_inputs(schema) # call the meta kernel if it exists, to compute output shape/dtype for our IR @@ -229,40 +174,37 @@ class GenLazyNativeFuncDefinition: shapes_str = ','.join([this_shape(i) for i in range(returns_length)]) meta_out = "std::vector shapes{" + shapes_str + "};" - # TODO: INTEGRATION POINT HERE: meta_str = f"""auto out_meta = at::meta::{schema.aten_name}({', '.join(str(t.name) for t in all_types)}); {meta_out}""" else: - shape_sig = ComputeShapeSignature(metadata.kernel, func) + shape_sig = ComputeShapeSignature(func) meta_str = f""" auto shapes = {shape_sig.shape_call};""" - meta_str += f""" TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});""" node_str = f"""auto node = torch::lazy::MakeNode({node_ctor_input_str}, std::move(shapes));""" - first_tensor_name = value_types_names[0] - bridge_str = """auto result = torch::lazy::CreateAtenFromLtcTensor( - torch::lazy::LazyTensor::Create(std::move(node), *common_device));""" + assert len(value_types) > 0, f"Only supporting tensor ops so far, none found in {sig}" + first_tensor = value_types[0] + bridge_str = f"""auto result = CreateAtenFromLtcTensor(lazy_{first_tensor.name}.CreateFrom(node));""" if returns_length > 1: bridge_str = f"""std::vector<{self.tensor_class}> lazy_tensors; for (int i = 0; i < {returns_length}; i++) {{ - lazy_tensors.push_back(torch::lazy::LazyTensor::Create(torch::lazy::Value(node, i), *common_device)); + lazy_tensors.push_back(lazy_{first_tensor.name}.CreateFrom(torch::lazy::Value(node, i))); }} - auto result = torch::lazy::TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" - - if schema.name.name.inplace or func.func.is_out_fn(): + auto result = TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" + if schema.name.name.inplace: assert returns_length == 1, "We assumed 
there was no such case where an op is an in-place variant " \ "and has tuple outputs." - bridge_str = f"""lazy_{first_tensor_name}.SetInPlaceIrValue(node); - auto& result = {first_tensor_name};""" + bridge_str = f"""lazy_{first_tensor.name}.SetInPlaceIrValue(node); + auto& result = {first_tensor.name};""" return [f"""\ - {sig.decl(name=f"{self.class_method_name}::{metadata.kernel}")} {{ - {fallback_str} + // TODO(alanwaketan): Quite a lot inefficient copy-by-value there. Let's optimize it. + {sig.decl(name=f"{self.class_method_name}::{schema.aten_name}")} {{ TORCH_LAZY_FN_COUNTER("lazy::"); {get_device_str} {lazy_tensor_decls_str} @@ -277,17 +219,17 @@ class ComputeShapeSignature: """ Here we use the base name as the suffix of the signature to avoid generating for in-place variants. """ - def __init__(self, kernel_name: str, f: NativeFunction): + @method_with_native_function + def __init__(self, f: NativeFunction): self.__schema = LazyIrSchema(f.func) self.__dispatch_args = ', '.join([a.decl() for a in dispatcher.arguments(f.func)]) self.__call_args = ", ".join([f"{t.name}" for t in self.__schema.filtered_types()]) - self.__kernel_name = kernel_name def __decl_suffix(self) -> str: - return f"{self.__kernel_name}({self.__dispatch_args})" + return f"{self.__schema.base_name}({self.__dispatch_args})" def __call_suffix(self) -> str: - return f"{self.__kernel_name}({self.__call_args})" + return f"{self.__schema.base_name}({self.__call_args})" @property def shape_decl(self) -> str: @@ -304,20 +246,19 @@ class GenLazyShapeInferenceDefinition: tensor_class: str @method_with_native_function - # def gen_lazy_shape_inference_decl(f: NativeFunction, backend_index: BackendIndex, tensor_class: str) -> List[str]: def __call__(self, f: NativeFunction) -> List[str]: sig = kernel_signature(f, self.backend_index) - metadata = self.backend_index.get_kernel(f) - assert metadata is not None + + # Lazy IR stuff schema = LazyIrSchema(f.func) value_types = schema.filtered_types(values=True, scalars=False) - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) node_ctor_input_str = node_ctor_inputs(schema) # Only generate shape/dtype fn for non-structured kernels, # since we just use the meta function for structured kernels if not f.structured and f.structured_delegate is None: - shape_sig = ComputeShapeSignature(metadata.kernel, f) + shape_sig = ComputeShapeSignature(f) return ["\n".join([f"{shape_sig.shape_decl};"])] else: return [] diff --git a/tools/codegen/dest/lazy_ts_lowering.py b/tools/codegen/dest/lazy_ts_lowering.py index 3f7701d5587..32d505cda7b 100644 --- a/tools/codegen/dest/lazy_ts_lowering.py +++ b/tools/codegen/dest/lazy_ts_lowering.py @@ -18,12 +18,13 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: continue emplace_arguments.append('loctx->GetOutputOp(operand(i++))') continue - emplace_arguments.append(f'"{value.name}", {value.name}') + emplace_arguments.append(f'"{value.name}", {value.name}_') emplace_arguments_str = "\n ".join( [f"arguments.emplace_back({a});" for a in emplace_arguments]) - emplace_kwarg_values = [f'"{t.name}", loctx->GetOutputOp(operand(i++))' for t in schema.keyword_values] - emplace_kwarg_scalars = [f'"{t.name}", {t.name}' for t in schema.keyword_scalars] + emplace_kwarg_values = [f'loctx->GetOutputOp(operand({i}))' for i in range(len(schema.keyword_values))] + emplace_kwarg_scalars = [f'"{t.name}", {t.name}_' for t in schema.keyword_scalars] 
+ assert len(schema.keyword_values) == 0, "TODO the logic for operand(i) is broken if there are kw values" emplace_kwarguments = "\n ".join( [f"kwarguments.emplace_back({a});" for a in emplace_kwarg_values + emplace_kwarg_scalars]) return f"""\ @@ -37,5 +38,6 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments); CHECK_EQ({schema.aten_name}_out.size(), {len(func.returns)}); + // TODO: need to call GenerateClone sometimes? Or else return LowerBuiltIn() directly return {schema.aten_name}_out; """ diff --git a/tools/codegen/gen_lazy_tensor.py b/tools/codegen/gen_lazy_tensor.py index 9705620fa2e..b2515d3d083 100644 --- a/tools/codegen/gen_lazy_tensor.py +++ b/tools/codegen/gen_lazy_tensor.py @@ -3,8 +3,7 @@ import argparse import os import yaml from collections import namedtuple -from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple, Type -from tools.codegen.dest.lazy_ir import LazyIR, TSLazyIR +from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple from tools.codegen.gen import get_grouped_native_functions, parse_native_yaml from tools.codegen.model import (FunctionSchema, NativeFunction, NativeFunctionsGroup, OperatorName) @@ -61,20 +60,20 @@ def main() -> None: parser.add_argument( '--node_base_hdr', type=str, default=None, help='Path to header file defining custom Lazy IR Node base class') parser.add_argument( - '--tensor_class', type=str, default="torch::lazy::LazyTensor", help='Name of backend specific custom Lazy Tensor class') + '--tensor_class', type=str, default="LazyTensor", help='Name of backend specific custom Lazy Tensor class') parser.add_argument( - '--tensor_class_hdr', type=str, default="torch/csrc/lazy/core/tensor.h", + '--tensor_class_hdr', type=str, default="lazy_tensor_core/csrc/tensor.h", help='Path to header file defining custom Lazy Tensor class') options = parser.parse_args() run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path, options.gen_ts_lowerings, options.node_base, options.node_base_hdr, - options.tensor_class, options.tensor_class_hdr, TSLazyIR) + options.tensor_class, options.tensor_class_hdr) def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[str], gen_ts_lowerings: bool, node_base: str, node_base_hdr: Optional[str], - tensor_class: str, tensor_class_hdr: str, lazy_ir_cls: Type[LazyIR]) -> None: + tensor_class: str, tensor_class_hdr: str) -> None: # Assumes that this file lives at PYTORCH_ROOT/tools/codegen/gen_backend_stubs.py pytorch_root = pathlib.Path(__file__).parent.parent.parent.absolute() @@ -161,13 +160,11 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st fm.write_with_template(f'{backend_key}NativeFunctions.cpp', 'DispatchKeyNativeFunctions.cpp', lambda: { 'includes': [f'#include <{path}>' for path in [ tensor_class_hdr, - "ATen/Functions.h", "ATen/MetaFunctions.h", - "ATen/Operators.h", - "torch/csrc/lazy/core/lazy_graph_executor.h", "torch/csrc/lazy/core/metrics.h", "torch/csrc/lazy/core/shape.h", - "lazy_tensor_core/csrc/ts_backend/aten_eager_fallback.h", + "lazy_tensor_core/csrc/aten_ltc_bridge.h", + "lazy_tensor_core/csrc/lazy_graph_executor.h", f"{output_dir}/{backend_key}NativeFunctions.h", f"{output_dir}/{backend_key}LazyIr.h", f"{output_dir}/{backend_key}ShapeInference.h", @@ -199,8 +196,7 @@ def run(source_yaml: str, output_dir: 
str, dry_run: bool, impl_path: Optional[st 'func_declarations': list(concat_map_codegen( dest.GenLazyShapeInferenceDefinition(backend_indices[backend_key], tensor_class), - grouped_native_functions, - codegenInplaceVariant=True, + grouped_native_functions )), }) # Generate IR node classes @@ -221,7 +217,7 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st 'DispatchKey': backend_key, 'dispatch_namespace': backend_key.lower(), 'ir_declarations': list(concat_map_codegen( - lazy_ir_cls(backend_indices[backend_key], node_base), + dest.LazyIR(backend_indices[backend_key], node_base), grouped_native_functions )), }) diff --git a/torch/csrc/lazy/core/config.cpp b/torch/csrc/lazy/core/config.cpp index b47054913e1..af86dd926d6 100644 --- a/torch/csrc/lazy/core/config.cpp +++ b/torch/csrc/lazy/core/config.cpp @@ -7,11 +7,6 @@ C10_DEFINE_bool( false, "Enable parameter aliasing support"); -C10_DEFINE_bool( - torch_lazy_use_thread_pool, - false, - "Use thread pool to schedule backend execution"); - C10_DEFINE_int( torch_lazy_compilation_cache_size, 1024, diff --git a/torch/csrc/lazy/core/config.h b/torch/csrc/lazy/core/config.h index fa6630123cd..beee5b4b214 100644 --- a/torch/csrc/lazy/core/config.h +++ b/torch/csrc/lazy/core/config.h @@ -3,7 +3,6 @@ C10_DECLARE_bool(torch_lazy_ir_debug); C10_DECLARE_bool(torch_lazy_param_aliasing); -C10_DECLARE_bool(torch_lazy_use_thread_pool); C10_DECLARE_int(torch_lazy_compilation_cache_size); C10_DECLARE_int(torch_lazy_device_data_cache_size); diff --git a/torch/csrc/lazy/core/ir.cpp b/torch/csrc/lazy/core/ir.cpp index a1726aacba6..63e6ee8744c 100644 --- a/torch/csrc/lazy/core/ir.cpp +++ b/torch/csrc/lazy/core/ir.cpp @@ -1,8 +1,6 @@ #include #include -C10_DEFINE_bool(ltc_enable_dynamic_shapes, false, "Whether dynamic shape is enabled"); - namespace torch { namespace lazy { @@ -25,14 +23,6 @@ hash_t Value::hash() const { return HashCombine(node->hash(), Hash(index)); } -hash_t Value::hash_with_sizes() const { - return HashCombine(node->hash_with_sizes(), Hash(index)); -} - -hash_t Value::hash_without_sizes() const { - return HashCombine(node->hash_without_sizes(), Hash(index)); -} - OpKind OpKind::Get(const std::string& name) { return OpKind(c10::Symbol::fromQualString(name)); } @@ -41,25 +31,18 @@ hash_t OpKind::hash() const { return StringHash(op.toQualString()); } -bool Node::enableDynamicShape() { - static bool enabled = std::getenv("LTC_ENABLE_DYNAMIC_SHAPES") != nullptr; - return enabled || FLAGS_ltc_enable_dynamic_shapes; -} - -Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn) +Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash) : op_(op), num_outputs_(num_outputs), node_hash_(node_hash), - dag_hash_without_sizes_(dag_hash_fn(false)), - dag_hash_with_sizes_(dag_hash_fn(true)), + dag_hash_(dag_hash), metadata_(GetMetaDataIfDebugging()) {} -Node::Node(OpKind op, size_t num_outputs, std::function node_hash_fn) +Node::Node(OpKind op, size_t num_outputs, hash_t node_hash) : op_(op), num_outputs_(num_outputs), - node_hash_(node_hash_fn(!enableDynamicShape())), - dag_hash_without_sizes_(node_hash_fn(false)), - dag_hash_with_sizes_(node_hash_fn(true)), + node_hash_(node_hash), + dag_hash_(node_hash), metadata_(GetMetaDataIfDebugging()) {} Node::~Node() = default; diff --git a/torch/csrc/lazy/core/ir.h b/torch/csrc/lazy/core/ir.h index 4132400bb65..6ca1df8d2fb 100644 --- a/torch/csrc/lazy/core/ir.h +++ b/torch/csrc/lazy/core/ir.h @@ -15,9 +15,6 @@ #include #include #include 
-#include - -C10_DECLARE_bool(ltc_enable_dynamic_shapes); namespace torch { namespace lazy { @@ -68,12 +65,9 @@ using OutputMap = std::unordered_map; // Represents an input/operand for a Node object. struct TORCH_API Value { Value() = default; - /* implicit */ Value(NodePtr&& node, size_t index = 0) : node(std::move(node)), index(index) {} - /* implicit */ Value(const NodePtr& node, size_t index = 0) : node(node), index(index) {} + /* implicit */ Value(NodePtr node, size_t index = 0) : node(std::move(node)), index(index) {} hash_t hash() const; - hash_t hash_with_sizes() const; - hash_t hash_without_sizes() const; operator bool() const { return node != nullptr; @@ -127,6 +121,7 @@ inline std::ostream& operator<<(std::ostream& stream, const OpKind& op) { using OpList = c10::ArrayRef; + // A node in the graph. Nodes for operations which requires extra data to be // stored for lowering, should inherit from this class and add operation // specific member there. For example, a constant might create a new @@ -135,18 +130,13 @@ using OpList = c10::ArrayRef; // client data handle in it. class TORCH_API Node { public: - static bool enableDynamicShape(); - // Creates a new node with the given op name. The op is a unique identifier // for the operation. The num_outputs tells how many outputs a given operation // generates. - // - // None leaf node's node_hash does not contains shape information always. - // So we pass in the hash value rather than a function. - Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn); + Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash); // Contructor used to create leaf nodes. - Node(OpKind op, size_t num_outputs, std::function node_hash_fn); + Node(OpKind op, size_t num_outputs, hash_t node_hash); virtual ~Node(); @@ -167,15 +157,7 @@ class TORCH_API Node { } hash_t hash() const { - return enableDynamicShape() ? dag_hash_without_sizes_ : dag_hash_with_sizes_; - } - - hash_t hash_without_sizes() const { - return dag_hash_without_sizes_; - } - - hash_t hash_with_sizes() const { - return dag_hash_with_sizes_; + return dag_hash_; } const MetaData& metadata() const { @@ -201,17 +183,8 @@ class TORCH_API Node { // The hash value of this node. hash_t node_hash_; - // dag_hash represents the hash value of the graph rooted at this node. There are 2 variants, one - // with sizes info and one without. We need 2 such hashes to support dynamic - // shape. Here are the logic to pick the hash in the 2 major scenarios that a hash is needed: - // - shape cache: in this case, we always use the dag hash with size info. This way, looking up the - // shape for one node does not get the shape for another node with the same rank but different sizes - // - lookup the compiled graph by a hash: in this case, we will use the dag hash - // WITHOUT size info if dynamic shape is enabled and use the dag hash WITH size info otherwise. - // The different requirement for the hash in these 2 scenarios forces us to maintain 2 - // different hashes. - hash_t dag_hash_without_sizes_; - hash_t dag_hash_with_sizes_; + // The hash value of the graph rooted at this node. + hash_t dag_hash_; // The IR specific metadata attached to the IR node. 
MetaData metadata_; // The IR framework user can attach a user defined metadata object deriving diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index 3599abb7b8d..9f504c935e9 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -462,7 +462,7 @@ void LazyGraphExecutor::SyncTensorsGraph( config.sync_ltc_data = sync_ltc_data; auto async = SyncTensorsGraphInternal(tensors, devices, config); - if (FLAGS_torch_lazy_use_thread_pool && wait && async != nullptr) { + if (wait && async != nullptr) { async->mwait.Wait(); } } @@ -972,11 +972,7 @@ std::shared_ptr LazyGraphExecutor:: } }; - if (FLAGS_torch_lazy_use_thread_pool) { - ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); - } else { - syncfn(); - } + ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); return async; } @@ -999,7 +995,7 @@ std::vector LazyGraphExecutor::GetTensorsFused( SyncTensorsConfig config; config.force_ltc_data = false; auto async = SyncTensorsGraphInternal(tensors, {}, config); - if (FLAGS_torch_lazy_use_thread_pool && async != nullptr) { + if (async != nullptr) { async->mwait.Wait(); } std::vector tensors_data = GatherTensorsData( diff --git a/torch/csrc/lazy/core/shape.cpp b/torch/csrc/lazy/core/shape.cpp index bd5ea5b75c9..2b7fd2c74b8 100644 --- a/torch/csrc/lazy/core/shape.cpp +++ b/torch/csrc/lazy/core/shape.cpp @@ -28,12 +28,8 @@ size_t Shape::numel() const { return elts; } -hash_t Shape::hash(bool bakeInSizes) const { - if (bakeInSizes) { - return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); - } else { - return HashCombine(Hash(scalar_type_), Hash(sizes_.size())); - } +hash_t Shape::hash() const { + return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); } } // namespace lazy diff --git a/torch/csrc/lazy/core/shape.h b/torch/csrc/lazy/core/shape.h index 9b34b90fec0..c67ff908833 100644 --- a/torch/csrc/lazy/core/shape.h +++ b/torch/csrc/lazy/core/shape.h @@ -25,7 +25,7 @@ class TORCH_API Shape { int64_t size(int64_t dim) const { return sizes_.at(dim); } void set_size(int64_t dim, int64_t size) { sizes_.at(dim) = size; } size_t numel() const; - hash_t hash(bool bakeInSizes) const; + hash_t hash() const; bool operator==(const Shape& other) const; diff --git a/torch/csrc/lazy/ts_backend/ts_node.cpp b/torch/csrc/lazy/ts_backend/ts_node.cpp index a7948e5cbec..d79dd999f81 100644 --- a/torch/csrc/lazy/ts_backend/ts_node.cpp +++ b/torch/csrc/lazy/ts_backend/ts_node.cpp @@ -28,15 +28,14 @@ void TsNodeSetShapeDeferred( throw std::runtime_error("Expected TsNode but could not dynamic cast"); } -hash_t OperandHashes(const OpList& operands, const hash_t& seed, bool bakeInSizes) { +hash_t OperandHashes(const OpList& operands, const hash_t& seed) { hash_t hash = seed; for (auto& operand : operands) { if (!operand) { hash = HashCombine(hash, static_cast(kNullOpt)); continue; } - auto operand_hash = bakeInSizes ? operand.hash_with_sizes() : operand.hash_without_sizes(); - hash = HashCombine(hash, operand_hash); + hash = HashCombine(hash, operand.hash()); } return hash; } @@ -49,7 +48,7 @@ TsNode::TsNode(OpKind op, OpList operands, std::vector&& shapes, // initialization to a separate function? 
/* node_hash */ HashCombine(op.hash(), hash_seed), /* dag_hash */ - [&](bool bakeInSizes) { return OperandHashes(operands, HashCombine(op.hash(), hash_seed), bakeInSizes); }), + OperandHashes(operands, HashCombine(op.hash(), hash_seed))), shapes_(shapes) { for (auto& operand : operands) { // Ideally, optional operands should be filtered by the leaf node classes, @@ -81,7 +80,7 @@ void TsNode::SetShapeDeferred( } TsNode::TsNode(OpKind op, Shape shape, size_t num_outputs, hash_t hash_seed) - : Node(op, num_outputs, [&](bool bakeInSizes) -> hash_t { return GetOpHash(op, shape, hash_seed, bakeInSizes); }) + : Node(op, num_outputs, GetOpHash(op, shape, hash_seed)) { shapes_.push_back(std::move(shape)); } @@ -99,11 +98,10 @@ ShapeCache* GetShapeCache() { Shape TsNode::GetOpShape( const std::function& shape_fn) const { - auto hash = hash_with_sizes(); ShapeCache* shape_cache = GetShapeCache(); - auto shape = shape_cache->Get(hash); + auto shape = shape_cache->Get(hash()); if (shape == nullptr) { - shape = shape_cache->Add(hash, + shape = shape_cache->Add(hash(), std::make_shared(shape_fn())); } return *shape; @@ -122,8 +120,8 @@ std::string TsNode::ToString() const { return ss.str(); } -hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes) { - hash_t h = HashCombine(op.hash(), shape.hash(bakeInSizes)); +hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed) { + hash_t h = HashCombine(op.hash(), shape.hash()); return HashCombine(h, hash_seed); } diff --git a/torch/csrc/lazy/ts_backend/ts_node.h b/torch/csrc/lazy/ts_backend/ts_node.h index 156444852d9..a6595a5337d 100644 --- a/torch/csrc/lazy/ts_backend/ts_node.h +++ b/torch/csrc/lazy/ts_backend/ts_node.h @@ -55,7 +55,7 @@ class TORCH_API TsNode : public lazy::Node { std::string ToString() const override; - static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes); + static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed); const std::vector& operands() const override { return operands_as_outputs_; From 7e35922a2cbc2eacfd9a11967d76db334b5b78d4 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 18 Feb 2022 16:49:33 +0000 Subject: [PATCH 156/199] Fix test tools This is a temp fix to get trunk green (uses newer stats). The reason the test failed before was because the tests referred to old stats that were deleted after their lifecycle in S3. However, we need to come up with a better way of testing this as this same failure will occur again in a few months. Pull Request resolved: https://github.com/pytorch/pytorch/pull/73082 --- tools/stats/test_history.py | 67 +++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/tools/stats/test_history.py b/tools/stats/test_history.py index 24678aabba9..d9a1e29ad8b 100755 --- a/tools/stats/test_history.py +++ b/tools/stats/test_history.py @@ -193,50 +193,45 @@ In multiline mode, each line next includes the name of a CircleCI job, followed by the time of the specified test in that job at that commit. 
Example: - $ tools/stats/test_history.py --mode=multiline --ref=594a66 --sha-length=8 --test=test_set_dir \ - --job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test - 2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc5_4_test 0.36s - 2021-02-10 11:13:34Z 594a66d7 pytorch_linux_xenial_py3_6_gcc7_test 0.573s errored - 2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc5_4_test 0.819s - 2021-02-10 10:13:25Z 9c0caf03 pytorch_linux_xenial_py3_6_gcc7_test 0.449s - 2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc5_4_test 0.361s - 2021-02-10 10:09:14Z 602434bc pytorch_linux_xenial_py3_6_gcc7_test 0.454s - 2021-02-10 10:09:10Z 2e35fe95 (no reports in S3) - 2021-02-10 10:09:07Z ff73be7e (no reports in S3) - 2021-02-10 10:05:39Z 74082f0d (no reports in S3) - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.414s - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc5_4_test 0.476s - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.377s - 2021-02-10 07:42:29Z 0620c96f pytorch_linux_xenial_py3_6_gcc7_test 0.326s + $ tools/stats/test_history.py --mode=multiline --ref=86a961af879 --sha-length=8 \ + --test=test_composite_compliance_dot_cpu_float32 \ + --job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1 + 2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc5.4-test-default1 0.001s + 2022-02-18 15:47:37Z 86a961af linux-xenial-py3.7-gcc7-test-default1 0.001s + 2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc5.4-test-default1 0.001s + 2022-02-18 15:12:34Z f5e201e4 linux-xenial-py3.7-gcc7-test-default1 0.001s + 2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc5.4-test-default1 0.001s + 2022-02-18 13:14:56Z 1c0df265 linux-xenial-py3.7-gcc7-test-default1 0.001s + 2022-02-18 13:14:56Z e73eaffd (no reports in S3) + 2022-02-18 06:29:12Z 710f12f5 linux-xenial-py3.7-gcc5.4-test-default1 0.001s Another multiline example, this time with the --all flag: - $ tools/stats/test_history.py --mode=multiline --all --ref=321b9 --delta=12 --sha-length=8 \ - --test=test_qr_square_many_batched_complex_cuda - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 424.284s - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 402.572s - 2021-01-07 10:04:56Z 321b9883 pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.164s - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_test2 436.732s - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda10_2_cudnn7_py3_slow_test 0.006s skipped - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda11_1_cudnn8_py3_gcc7_test 407.616s - 2021-01-06 20:58:28Z fcb69d2e pytorch_linux_xenial_cuda9_2_cudnn7_py3_gcc7_test 287.044s + $ tools/stats/test_history.py --mode=multiline --all --ref=86a961af879 --delta=12 --sha-length=8 \ + --test=test_composite_compliance_dot_cuda_float32 + 2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-default1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 linux-bionic-cuda10.2-py3.9-gcc7-test-slow1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 linux-xenial-cuda11.3-py3.7-gcc7-test-default1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 periodic-linux-bionic-cuda11.5-py3.7-gcc7-test-default1 0.001s skipped + 2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda10.2-py3-gcc7-slow-gradcheck-test-default1 0.001s 
skipped + 2022-02-18 03:49:46Z 69389fb5 periodic-linux-xenial-cuda11.1-py3.7-gcc7-debug-test-default1 0.001s skipped In columns mode, the name of the job isn't printed, but the order of the columns is guaranteed to match the order of the jobs passed on the command line. Example: - $ tools/stats/test_history.py --mode=columns --ref=3cf783 --sha-length=8 --test=test_set_dir \ - --job pytorch_linux_xenial_py3_6_gcc5_4_test --job pytorch_linux_xenial_py3_6_gcc7_test - 2021-02-10 12:18:50Z 3cf78395 0.644s 0.312s - 2021-02-10 11:13:34Z 594a66d7 0.360s errored - 2021-02-10 10:13:25Z 9c0caf03 0.819s 0.449s - 2021-02-10 10:09:14Z 602434bc 0.361s 0.454s - 2021-02-10 10:09:10Z 2e35fe95 - 2021-02-10 10:09:07Z ff73be7e - 2021-02-10 10:05:39Z 74082f0d - 2021-02-10 07:42:29Z 0620c96f 0.414s 0.377s (2 job re-runs omitted) - 2021-02-10 07:27:53Z 33afb5f1 0.381s 0.294s + $ tools/stats/test_history.py --mode=columns --ref=86a961af879 --sha-length=8 \ + --test=test_composite_compliance_dot_cpu_float32 \ + --job linux-xenial-py3.7-gcc5.4-test-default1 --job linux-xenial-py3.7-gcc7-test-default1 + 2022-02-18 15:47:37Z 86a961af 0.001s 0.001s + 2022-02-18 15:12:34Z f5e201e4 0.001s 0.001s + 2022-02-18 13:14:56Z 1c0df265 0.001s 0.001s + 2022-02-18 13:14:56Z e73eaffd + 2022-02-18 06:29:12Z 710f12f5 0.001s 0.001s + 2022-02-18 05:20:30Z 51b04f27 0.001s 0.001s + 2022-02-18 03:49:46Z 69389fb5 0.001s 0.001s + 2022-02-18 00:19:12Z 056b6260 0.001s 0.001s + 2022-02-17 23:58:32Z 39fb7714 0.001s 0.001s Minor note: in columns mode, a blank cell means that no report was found in S3, while the word "absent" means that a report was found but the From 477d1bd6cfa86e868e0e09648a6c1a08f2befaf2 Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 18 Feb 2022 08:45:20 -0800 Subject: [PATCH 157/199] Revert D34313425: [quant] Add ConvTranspose reference module Test Plan: revert-hammer Differential Revision: D34313425 (https://github.com/pytorch/pytorch/commit/710f12f58e2d24d583c3e3bab75ec8c169ebbf57) Original commit changeset: 3eeec1b24a51 Original Phabricator Diff: D34313425 (https://github.com/pytorch/pytorch/commit/710f12f58e2d24d583c3e3bab75ec8c169ebbf57) fbshipit-source-id: aecf9113d2e4cef3ccf4e1a9c4c33b07dc2ad385 (cherry picked from commit 3fcb9cd14da0a6efb564d8c116f80d6883527cfc) --- .../eager/test_quantize_eager_ptq.py | 125 ------------- torch/ao/quantization/quantize.py | 12 +- .../quantized/_reference/modules/__init__.py | 5 +- torch/nn/quantized/_reference/modules/conv.py | 169 +----------------- 4 files changed, 4 insertions(+), 307 deletions(-) diff --git a/test/quantization/eager/test_quantize_eager_ptq.py b/test/quantization/eager/test_quantize_eager_ptq.py index 9665d8554bc..6587740bdf9 100644 --- a/test/quantization/eager/test_quantize_eager_ptq.py +++ b/test/quantization/eager/test_quantize_eager_ptq.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn import torch.nn.quantized as nnq -import torch.nn.quantized._reference as nnqr from torch.nn.utils.rnn import PackedSequence from torch.ao.quantization import ( quantize, @@ -75,130 +74,6 @@ import unittest import numpy as np class TestQuantizeEagerOps(QuantizationTestCase): - def _test_reference_module_impl(self, - float_module_class, - quantized_module_class, - extra_module_kwargs, - input_size): - class M(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = float_module_class(**extra_module_kwargs) - self.quant = QuantStub() - self.dequant = DeQuantStub() - - def forward(self, x): - x = self.quant(x) - x = self.conv(x) - x = self.dequant(x) - 
return x - - class RefM(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv = float_module_class(**extra_module_kwargs) - self.quant1 = QuantStub() - self.dequant1 = DeQuantStub() - self.quant2 = QuantStub() - self.dequant2 = DeQuantStub() - - def forward(self, x): - x = self.quant1(x) - x = self.dequant1(x) - x = self.conv(x) - x = self.quant2(x) - x = self.dequant2(x) - return x - - qengine = 'fbgemm' - with override_quantized_engine(qengine): - data = torch.randn(*input_size, dtype=torch.float) - original_m = M() - original_ref_m = RefM() - - original_ref_m.conv.weight = torch.nn.Parameter(original_m.conv.weight.detach()) - original_ref_m.conv.bias = torch.nn.Parameter(original_m.conv.bias.detach()) - - original_m.qconfig = torch.quantization.default_qconfig - - m = prepare(original_m) - # calibration - m(data) - m = convert(m) - # check if the module is properly quantized - self.assertEqual(type(m.quant), nnq.Quantize) - self.assertEqual(type(m.conv), quantized_module_class) - self.assertEqual(type(m.dequant), nnq.DeQuantize) - res = m(data) - - # quantize the reference model - original_ref_m.eval() - original_ref_m.qconfig = torch.quantization.default_qconfig - - ref_m = prepare(original_ref_m) - ref_m(data) - reference_module_mapping = { - QuantStub: nnq.Quantize, - DeQuantStub: nnq.DeQuantize, - nn.Conv1d: nnqr.Conv1d, - nn.Conv2d: nnqr.Conv2d, - nn.Conv3d: nnqr.Conv3d, - nn.ConvTranspose1d: nnqr.ConvTranspose1d, - nn.ConvTranspose2d: nnqr.ConvTranspose2d, - nn.ConvTranspose3d: nnqr.ConvTranspose3d, - } - ref_m = convert(ref_m, mapping=reference_module_mapping) - ref_res = ref_m(data) - self.assertEqual(res, ref_res) - - def test_conv_1d(self): - self._test_reference_module_impl( - nn.Conv1d, - nnq.Conv1d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 1) - ) - - def test_conv_2d(self): - self._test_reference_module_impl( - nn.Conv2d, - nnq.Conv2d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 10, 10) - ) - - def test_conv_3d(self): - self._test_reference_module_impl( - nn.Conv3d, - nnq.Conv3d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 10, 10, 10) - ) - - def test_conv_transpose_1d(self): - self._test_reference_module_impl( - nn.ConvTranspose1d, - nnq.ConvTranspose1d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 1) - ) - - def test_conv_transpose_2d(self): - self._test_reference_module_impl( - nn.ConvTranspose2d, - nnq.ConvTranspose2d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 10, 10) - ) - - def test_conv_transpose_3d(self): - self._test_reference_module_impl( - nn.ConvTranspose3d, - nnq.ConvTranspose3d, - {'in_channels': 1, 'out_channels': 1, 'kernel_size': 1}, - (16, 1, 10, 10, 10) - ) - def _test_activation_op_impl( self, float_module_class, quantized_module_class, extra_module_kwargs): """ Implementation for testing common activation ops like leaky relu diff --git a/torch/ao/quantization/quantize.py b/torch/ao/quantization/quantize.py index fad2b8abe6e..5afff09b64b 100644 --- a/torch/ao/quantization/quantize.py +++ b/torch/ao/quantization/quantize.py @@ -16,7 +16,7 @@ from torch.ao.quantization.quantization_mappings import ( _has_special_act_post_process, _get_special_act_post_process, ) -from .utils import get_qparam_dict + from torch.ao.quantization.stubs import DeQuantStub, QuantWrapper from torch.ao.quantization.qconfig import ( add_module_to_qconfig_obs_ctr, @@ -565,15 +565,7 @@ def swap_module(mod, mapping, 
custom_module_class_mapping): new_mod = custom_module_class_mapping[type(mod)].from_observed(mod) swapped = True elif type(mod) in mapping: - qmod = mapping[type(mod)] - if hasattr(qmod, '_IS_REFERENCE') and qmod._IS_REFERENCE: - assert mod.qconfig is not None - weight_post_process = mod.qconfig.weight() - weight_post_process(mod.weight) - weight_qparams = get_qparam_dict(weight_post_process) - new_mod = qmod.from_float(mod, weight_qparams) - else: - new_mod = qmod.from_float(mod) + new_mod = mapping[type(mod)].from_float(mod) swapped = True if swapped: diff --git a/torch/nn/quantized/_reference/modules/__init__.py b/torch/nn/quantized/_reference/modules/__init__.py index efbefdbde60..441852c38f9 100644 --- a/torch/nn/quantized/_reference/modules/__init__.py +++ b/torch/nn/quantized/_reference/modules/__init__.py @@ -1,12 +1,9 @@ from .linear import Linear -from .conv import Conv1d, Conv2d, Conv3d, ConvTranspose1d, ConvTranspose2d, ConvTranspose3d +from .conv import Conv1d, Conv2d, Conv3d __all__ = [ 'Linear', 'Conv1d', 'Conv2d', 'Conv3d', - 'ConvTranspose1d', - 'ConvTranspose2d', - 'ConvTranspose3d', ] diff --git a/torch/nn/quantized/_reference/modules/conv.py b/torch/nn/quantized/_reference/modules/conv.py index 60aed0a91ac..ed151cb7f5e 100644 --- a/torch/nn/quantized/_reference/modules/conv.py +++ b/torch/nn/quantized/_reference/modules/conv.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -from typing import Optional, Dict, Any, List +from typing import Optional, Dict, Any from torch.nn.common_types import _size_1_t from .utils import _quantize_weight, _quantize_and_dequantize_weight from .utils import _save_weight_qparams @@ -14,7 +14,6 @@ class _ConvNd(torch.nn.modules.conv._ConvNd): this is useful when user want to use this module in other backends like Glow. """ __annotations__ = {"bias": Optional[torch.Tensor]} - _IS_REFERENCE = True def _save_to_state_dict(self, destination, prefix, keep_vars): super()._save_to_state_dict(destination, prefix, keep_vars) @@ -218,169 +217,3 @@ class Conv3d(_ConvNd, nn.Conv3d): @classmethod def from_float(cls, float_conv, weight_qparams): return _ConvNd.from_float(cls, float_conv, weight_qparams) - -class _ConvTransposeNd(_ConvNd, torch.nn.modules.conv._ConvTransposeNd): - """ A reference version of nn.quantized.ConvTranspose2d - we will not pack the parameters in this module, since weight packing is an - optimization for quantized backends supported in PyTorch (fbgemm/qnnpack), - this is useful when user want to use this module in other backends like Glow. 
- """ - @staticmethod - def from_float(cls, float_conv, weight_qparams): - qref_conv = cls( - float_conv.in_channels, - float_conv.out_channels, - float_conv.kernel_size, # type: ignore[arg-type] - float_conv.stride, # type: ignore[arg-type] - float_conv.padding, # type: ignore[arg-type] - float_conv.output_padding, # type: ignore[arg-type] - float_conv.groups, - float_conv.bias is not None, # type: ignore[arg-type] - float_conv.dilation, # type: ignore[arg-type] - float_conv.padding_mode, - device=float_conv.weight.device, - dtype=float_conv.weight.dtype, - weight_qparams=weight_qparams) - qref_conv.weight = torch.nn.Parameter(float_conv.weight.detach()) - if float_conv.bias is not None: - qref_conv.bias = torch.nn.Parameter(float_conv.bias.detach()) - return qref_conv - - -class ConvTranspose1d(_ConvTransposeNd, nn.ConvTranspose1d): - def __init__(self, - in_channels: int, - out_channels: int, - kernel_size: _size_1_t, - stride: _size_1_t = 1, - padding: _size_1_t = 0, - output_padding: _size_1_t = 0, - groups: int = 1, - bias: bool = True, - dilation: _size_1_t = 1, - padding_mode: str = "zeros", - device=None, - dtype=None, - weight_qparams: Optional[Dict[str, Any]] = None): - nn.ConvTranspose1d.__init__( - self, in_channels, out_channels, kernel_size, stride, padding, output_padding, - groups, bias, dilation, padding_mode, device, dtype) - self._init_weight_qparams(weight_qparams, device) - - def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: - """ - we have: - w(float) -- quant - dequant \ - x(float) ------------- F.convTranspose1d --- - In the full model, we will see - w(float) -- quant - *dequant \ - x -- quant --- *dequant -- *F.convTranspose1d --- *quant - dequant - and the backend should be able to fuse the ops with `*` into a quantized conv1d - """ - - assert isinstance(self.padding, tuple) - # One cannot replace List by Tuple or Sequence in "_output_padding" because - # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
- output_padding = self._output_padding( - input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] - - weight_dequant = self.get_weight() - result = F.conv_transpose1d( - x, weight_dequant, self.bias, self.stride, - self.padding, output_padding, self.groups, self.dilation) - return result - - def _get_name(self): - return "QuantizedConvTranspose1d(Reference)" - - @classmethod - def from_float(cls, float_conv, weight_qparams): - return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) - -class ConvTranspose2d(_ConvTransposeNd, nn.ConvTranspose2d): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, output_padding=0, - groups=1, bias=True, dilation=1, - padding_mode='zeros', - device=None, - dtype=None, - weight_qparams: Optional[Dict[str, Any]] = None): - - nn.ConvTranspose2d.__init__( - self, in_channels, out_channels, kernel_size, stride, padding, output_padding, - groups, bias, dilation, padding_mode, device, dtype) - self._init_weight_qparams(weight_qparams, device) - - def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: - """ - we have: - w(float) -- quant - dequant \ - x(float) ------------- F.convTranspose2d --- - In the full model, we will see - w(float) -- quant - *dequant \ - x -- quant --- *dequant -- *F.convTranspose2d --- *quant - dequant - and the backend should be able to fuse the ops with `*` into a quantized conv2d - """ - assert isinstance(self.padding, tuple) - # One cannot replace List by Tuple or Sequence in "_output_padding" because - # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. - - output_padding = self._output_padding( - input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] - - weight_dequant = self.get_weight() - result = F.conv_transpose2d( - x, weight_dequant, self.bias, self.stride, - self.padding, output_padding, self.groups, self.dilation) - - return result - - def _get_name(self): - return "QuantizedConvTranspose2d(Reference)" - - @classmethod - def from_float(cls, float_conv, weight_qparams): - return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) - -class ConvTranspose3d(_ConvTransposeNd, nn.ConvTranspose3d): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, output_padding=0, - groups=1, bias=True, dilation=1, - padding_mode="zeros", - device=None, - dtype=None, - weight_qparams: Optional[Dict[str, Any]] = None): - nn.ConvTranspose3d.__init__( - self, in_channels, out_channels, kernel_size, stride, padding, output_padding, - groups, bias, dilation, padding_mode, device, dtype) - self._init_weight_qparams(weight_qparams, device) - - def forward(self, x: torch.Tensor, output_size: Optional[List[int]] = None) -> torch.Tensor: - """ - we have: - w(float) -- quant - dequant \ - x(float) ------------- F.convTranspose3d --- - In the full model, we will see - w(float) -- quant - *dequant \ - x -- quant --- *dequant -- *F.convTranspose3d --- *quant - dequant - and the backend should be able to fuse the ops with `*` into a quantized conv3d - """ - - assert isinstance(self.padding, tuple) - # One cannot replace List by Tuple or Sequence in "_output_padding" because - # TorchScript does not support `Sequence[T]` or `Tuple[T, ...]`. 
- output_padding = self._output_padding( - input, output_size, self.stride, self.padding, self.kernel_size, self.dilation) # type: ignore[arg-type] - - weight_dequant = self.get_weight() - result = F.conv_transpose3d( - x, weight_dequant, self.bias, self.stride, - self.padding, output_padding, self.groups, self.dilation) - return result - - def _get_name(self): - return "QuantizedConvTranspose3d(Reference)" - - @classmethod - def from_float(cls, float_conv, weight_qparams): - return _ConvTransposeNd.from_float(cls, float_conv, weight_qparams) From 0951cb513a68f9229b35b37f9147904e2c2e4232 Mon Sep 17 00:00:00 2001 From: Alban Desmaison Date: Fri, 18 Feb 2022 09:13:11 -0800 Subject: [PATCH 158/199] Revert D34342689: Revert D34250357: Sync lazy_tensor_staging back to master Test Plan: revert-hammer Differential Revision: D34342689 Original commit changeset: 43f6da6986f7 Original Phabricator Diff: D34250357 (https://github.com/pytorch/pytorch/commit/69389fb5423832272a36d3ef4bd2a39d10489507) fbshipit-source-id: 8a3fb74877e719e9b9577b58027b4e7061a04ef0 (cherry picked from commit c749f08e7aac9075a98f0b9cb4494d857fe0b481) --- test/cpp/lazy/test_cache.cpp | 2 +- test/cpp/lazy/test_ir.cpp | 2 +- test/cpp/lazy/test_ir_util.cpp | 2 +- tools/codegen/api/lazy.py | 53 +++++-- tools/codegen/dest/lazy_ir.py | 147 +++++++++++++------ tools/codegen/dest/lazy_ts_lowering.py | 8 +- tools/codegen/gen_lazy_tensor.py | 22 +-- torch/csrc/lazy/core/config.cpp | 5 + torch/csrc/lazy/core/config.h | 1 + torch/csrc/lazy/core/ir.cpp | 27 +++- torch/csrc/lazy/core/ir.h | 41 +++++- torch/csrc/lazy/core/lazy_graph_executor.cpp | 10 +- torch/csrc/lazy/core/shape.cpp | 8 +- torch/csrc/lazy/core/shape.h | 2 +- torch/csrc/lazy/ts_backend/ts_node.cpp | 18 ++- torch/csrc/lazy/ts_backend/ts_node.h | 2 +- 16 files changed, 248 insertions(+), 102 deletions(-) diff --git a/test/cpp/lazy/test_cache.cpp b/test/cpp/lazy/test_cache.cpp index 033b6c21b1e..a6da9bccbd2 100644 --- a/test/cpp/lazy/test_cache.cpp +++ b/test/cpp/lazy/test_cache.cpp @@ -11,7 +11,7 @@ namespace lazy { class CacheNode : public Node { public: explicit CacheNode(const std::string& str) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(str)), + : Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool /*bakeInSizes*/) -> hash_t { return Hash(str); }), str_(str) {} ~CacheNode() override = default; diff --git a/test/cpp/lazy/test_ir.cpp b/test/cpp/lazy/test_ir.cpp index 78b94618c7f..326f7a9092c 100644 --- a/test/cpp/lazy/test_ir.cpp +++ b/test/cpp/lazy/test_ir.cpp @@ -12,7 +12,7 @@ namespace lazy { class TestLeafNode : public Node { public: explicit TestLeafNode(size_t param) - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(param)), + : Node(OpKind(), /* num_outputs */ 1, /* hash_func */[&](bool /*bakeInSizes*/) -> hash_t { return Hash(param); }), param_(param) {} ~TestLeafNode() override = default; diff --git a/test/cpp/lazy/test_ir_util.cpp b/test/cpp/lazy/test_ir_util.cpp index 5c216258f9a..bb29cff6f6b 100644 --- a/test/cpp/lazy/test_ir_util.cpp +++ b/test/cpp/lazy/test_ir_util.cpp @@ -12,7 +12,7 @@ namespace lazy { class IrUtilNode : public Node { public: explicit IrUtilNode() - : Node(OpKind(), /* num_outputs */ 1, /* hash_seed */ Hash(0)) {} + : Node(OpKind(), /* num_outputs */ 1, /* hash_func */ [&](bool /*bakeInSizes*/) -> hash_t { return Hash(0); }) {} ~IrUtilNode() override = default; void AddOperand(Value v) { diff --git a/tools/codegen/api/lazy.py b/tools/codegen/api/lazy.py index 3fe83936eef..ebbc72eb1fc 100644 --- 
a/tools/codegen/api/lazy.py +++ b/tools/codegen/api/lazy.py @@ -1,12 +1,11 @@ from typing import List, Union, Tuple from tools.codegen.model import (Type, BaseTy, BaseType, OptionalType, ListType, OperatorName, FunctionSchema, - Return) -from tools.codegen.api.types import (BaseCppType, BaseCType, OptionalCType, - ConstRefCType, NamedCType, - MutRefCType, + Return, TensorOptionsArguments) +from tools.codegen.api.types import (CType, BaseCppType, BaseCType, OptionalCType, + NamedCType, deviceT, layoutT, VectorCType, boolT, longT, doubleT, ListCType, stringT, - scalarT, scalarTypeT, ArrayRefCType, ArrayCType, TupleCType) + scalarT, scalarTypeT) valueT = BaseCppType('torch::lazy', 'Value') @@ -33,7 +32,9 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L if typ.name == BaseTy.Tensor: return BaseCType(valueT) elif typ.name == BaseTy.Scalar: - return BaseCType(scalarT) + # at::scalar has special handling, + # and is wrapped in an IR value just like at::tensor + return BaseCType(valueT) elif typ.name == BaseTy.ScalarType: return BaseCType(scalarTypeT) elif typ.name == BaseTy.int: @@ -44,6 +45,10 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L return BaseCType(doubleT) elif typ.name == BaseTy.str: return BaseCType(stringT) + elif typ.name == BaseTy.Device: + return BaseCType(deviceT) + elif typ.name == BaseTy.Layout: + return BaseCType(layoutT) else: raise AssertionError(f"TODO add support for type {repr(typ)}") elif isinstance(typ, OptionalType): @@ -58,19 +63,36 @@ def process_ir_type(typ: Type) -> Union[BaseCType, VectorCType, OptionalCType, L raise AssertionError(f"unrecognized type {repr(typ)}") -def isValueType(typ: Union[Type, BaseCType, OptionalCType, ConstRefCType, MutRefCType, - ListCType, ArrayRefCType, ArrayCType, VectorCType, TupleCType]) -> bool: +def isValueType(typ: CType) -> bool: """ Given a type, determine if it is a Value-like type. This is equivalent to being Tensor-like, but assumes the type has already been transformed. """ if isinstance(typ, BaseCType): - return typ.type == valueT + # I am regretting my naming conventions, but now we are wrapping at::scalar in + # lazy value, while preserving other 'scalar' types as scalars in the IR + return typ.type == valueT or typ.type == scalarT elif isinstance(typ, (OptionalCType, ListCType, VectorCType)): return isValueType(typ.elem) else: return False +def isWrappedScalarType(typ: Type) -> bool: + """ + Given a type, determine if it is a c10::scalar which we will wrap in a lazy Value. + Since we literally change the type from scalarT to valueT, information is lost. + This function helps build a list of wrapped scalars to save that information + """ + if isinstance(typ, BaseType): + # I am regretting my naming conventions, but now we are wrapping at::scalar in + # lazy value, while preserving other 'scalar' types as scalars in the IR + return typ.name == BaseTy.Scalar + elif isinstance(typ, (OptionalType, ListType)): + return isWrappedScalarType(typ.elem) + else: + return False + + # Inspired by a FunctionSchema object, a LazyIrSchema holds the schema of a Lazy IR node. # Unlike a FunctionSchema, it has no round-trippable string form (relating to the YAML), # but carries type information from a native FunctionSchema modified for use with IR nodes, @@ -87,6 +109,8 @@ class LazyIrSchema: # TODO: Need to handle collisions with argument names at some point returns: Tuple['Return', ...] 
+ wrapped_scalar_names: List[str] + def __init__(self, func: FunctionSchema): positional_arg_types = [] @@ -108,14 +132,15 @@ class LazyIrSchema: "tensor_options", "post_tensor_options_kwarg_only", "out"]: - if getattr(func.arguments, arg_field) is not None: - keyword_arg_types.extend([ - NamedCType( - arg.name, - process_ir_type(arg.type)) for arg in getattr(func.arguments, arg_field)]) + curr_args = getattr(func.arguments, arg_field) + if curr_args is not None: + if isinstance(curr_args, TensorOptionsArguments): + curr_args = curr_args.all() + keyword_arg_types.extend([NamedCType(arg.name, process_ir_type(arg.type)) for arg in curr_args]) self.keyword_arg_types = tuple(keyword_arg_types) self.name = func.name self.returns = func.returns + self.wrapped_scalar_names = [arg.name for arg in func.schema_order_arguments() if isWrappedScalarType(arg.type)] @property def node_name(self) -> str: diff --git a/tools/codegen/dest/lazy_ir.py b/tools/codegen/dest/lazy_ir.py index d41b4edcd8a..58fc6862900 100644 --- a/tools/codegen/dest/lazy_ir.py +++ b/tools/codegen/dest/lazy_ir.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from typing import List, Union from dataclasses import dataclass from tools.codegen.context import method_with_native_function @@ -9,17 +10,23 @@ import tools.codegen.api.dispatcher as dispatcher from tools.codegen.api.lazy import LazyIrSchema, isValueType from tools.codegen.dest.lazy_ts_lowering import ts_lowering_body - -def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: +def node_ctor_arg_rvalue_string(arg: NamedCType, schema: LazyIrSchema) -> str: """ Given a NamedCType from a lazy IR schema, generate a c++ string for materializing an rvalue of that arg for passing into a lazy Node constructor. """ + if isValueType(arg.type): if isinstance(arg.type, BaseCType): + if arg.name in schema.wrapped_scalar_names: + return f"torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen({arg.name})" return f"lazy_{arg.name}.GetIrValue()" elif isinstance(arg.type, OptionalCType): + if arg.name in schema.wrapped_scalar_names: + return f"{arg.name} ? " \ + f"c10::make_optional(torch::lazy::LazyGraphExecutor::Get()->GetIrValueForScalarFromCodegen(*{arg.name})) : " \ + "c10::nullopt" return f"lazy_{arg.name} ? " \ f"c10::make_optional(lazy_{arg.name}.GetIrValue()) : " \ "c10::nullopt" @@ -35,24 +42,55 @@ def node_ctor_arg_rvalue_string(arg: NamedCType) -> str: else: return f"{arg.name}" -def node_ctor_inputs(func: LazyIrSchema) -> str: +def node_ctor_inputs(schema: LazyIrSchema) -> str: """ Produce a formatted string with the arguments as passed into the constructor of a node class. 
""" - node_ctor_values = [node_ctor_arg_rvalue_string(arg) for arg in func.filtered_types()] + node_ctor_values = [node_ctor_arg_rvalue_string(arg, schema) for arg in schema.filtered_types()] return ",\n ".join(node_ctor_values) +def gen_fallback_code(schema: LazyIrSchema, overload_name: str) -> str: + """ + Generate code that falls back to eager conditioned on a predicate + """ + fallback_args = ",\n ".join([str(arg.name) for arg in schema.filtered_types()]) + if len(overload_name): + aten_op_str = f"ATEN_OP2({schema.aten_name}, {overload_name})" + else: + aten_op_str = f"ATEN_OP({schema.aten_name})" + return f""" + if (force_eager_fallback({aten_symbol(schema)})) {{ + return at::native::call_fallback_fn<<c_eager_fallback, {aten_op_str}>::call( + {fallback_args} + ); + }} +""" + +def aten_symbol(schema: LazyIrSchema) -> str: + missing_interned_strings = { + 'sigmoid_backward', + } + if schema.aten_name in missing_interned_strings: + return f'c10::Symbol::fromQualString("aten::{schema.aten_name}")' + return f'at::aten::{schema.aten_name}' @dataclass(frozen=True) -class LazyIR: +class LazyIR(ABC): backend_index: BackendIndex node_base: str + lowering_function_type: str = "" + lowering_context_type: str = "" + lowering_return_type: str = "" @method_with_native_function def __call__(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: func = f.functional.func if isinstance(f, NativeFunctionsGroup) else f.func return self.gen(f) + @abstractmethod + def lowering_body(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> str: + pass + def gen(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> List[str]: # for now, we just want one IR class decl and soon after also the method defs # and we use the functional version not out/inplace. @@ -63,9 +101,9 @@ class LazyIR: scalar_types = schema.filtered_types(values=False, scalars=True) node_ctor_args = ", ".join([f"const {i.cpp_type()}& {i.name}" for i in all_types]) - scalar_initializers = ",\n ".join([f"{t.name}_({t.name})" for t in scalar_types]) + scalar_initializers = ",\n ".join([f"{t.name}({t.name})" for t in scalar_types]) comma_if_scalar_initializers = ",\n" if len(scalar_initializers) else "" - scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name}_;" for t in scalar_types]) + scalar_decls = "\n ".join([f"{t.cpp_type()} {t.name};" for t in scalar_types]) scalar_hashes = ", ".join([f"{f.name}" for f in scalar_types]) base_ctor_value_args_list = [] optional_values = [] @@ -83,21 +121,20 @@ class LazyIR: members_to_string = [] for t in scalar_types: if isinstance(t.type, OptionalCType): - members_to_string.append(f"""if ({t.name}_.has_value()) {{ - ss << ", {t.name}=" << {t.name}_.value(); + members_to_string.append(f"""if ({t.name}.has_value()) {{ + ss << ", {t.name}=" << {t.name}.value(); }} else {{ ss << ", {t.name}=null"; }}""") else: - members_to_string.append(f'ss << ", {t.name}=" << {t.name}_;') + members_to_string.append(f'ss << ", {t.name}=" << {t.name};') members_to_string_str = "\n ".join(members_to_string) return [f"""\ -// TODO(alanwaketan): Public members don't need to have _ suffix. 
class {schema.node_name} : public {self.node_base} {{ public: {schema.node_name}({node_ctor_args}, std::vector&& shapes) - : {self.node_base}(torch::lazy::OpKind(at::aten::{schema.aten_name}), + : {self.node_base}(torch::lazy::OpKind({aten_symbol(schema)}), {{{base_ctor_value_args}}}, std::move(shapes), /* num_outputs */ {len(func.returns)}, torch::lazy::MHash({scalar_hashes})){comma_if_scalar_initializers} @@ -109,14 +146,14 @@ class {schema.node_name} : public {self.node_base} {{ std::string ToString() const override {{ std::stringstream ss; - ss << TsNode::ToString(); + ss << {self.node_base}::ToString(); {members_to_string_str} return ss.str(); }} - torch::lazy::TSOpVector Lower(std::shared_ptr function, - torch::lazy::TSLoweringContext* loctx) const override {{ - {ts_lowering_body(f)} + {self.lowering_return_type} Lower({self.lowering_function_type} function, + {self.lowering_context_type} loctx) const override {{ + {self.lowering_body(f)} }} {scalar_decls} @@ -127,21 +164,34 @@ class {schema.node_name} : public {self.node_base} {{ """, ] -def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str) -> str: +@dataclass(frozen=True) +class TSLazyIR(LazyIR): + lowering_function_type: str = "std::shared_ptr" + lowering_context_type: str = "torch::lazy::TSLoweringContext*" + lowering_return_type: str = "torch::lazy::TSOpVector" + + def lowering_body(self, f: Union[NativeFunctionsGroup, NativeFunction]) -> str: + return ts_lowering_body(f) + + +def lazy_tensor_decls(value_types: List[NamedCType], tensor_class: str, schema: LazyIrSchema) -> str: lazy_tensor_decls: List[str] = [] for t in value_types: + if t.name in schema.wrapped_scalar_names: + # no lazy tensor wrapper for scalars that are promoted to IR values + continue if isinstance(t.type, BaseCType): lazy_tensor_decls.append( f"{tensor_class} lazy_{t.name} = " - f"GetLtcTensorOrCreateForWrappedNumber({t.name}, *device);") + f"torch::lazy::GetLtcTensorOrCreateForWrappedNumber({t.name}, *common_device);") elif isinstance(t.type, OptionalCType): # TODO(alanwaketan): Maybe we want to apply GetLtcTensorOrCreateForWrappedNumber here, but hold it # until we encounter a real world example. 
lazy_tensor_decls.append( - f" {tensor_class} lazy_{t.name} = TryGetLtcTensor({t.name}.value_or(at::Tensor()));") + f" {tensor_class} lazy_{t.name} = torch::lazy::TryGetLtcTensor({t.name}.value_or(at::Tensor()));") else: raise AssertionError("TODO not sure if there are other valid types to handle here") - return "\n ".join(lazy_tensor_decls) + return ("\n ").join(lazy_tensor_decls) @dataclass(frozen=True) class GenLazyNativeFuncDefinition: @@ -152,17 +202,22 @@ class GenLazyNativeFuncDefinition: @method_with_native_function def __call__(self, func: NativeFunction) -> List[str]: sig = kernel_signature(func, self.backend_index) - - # Lazy IR stuff + metadata = self.backend_index.get_kernel(func) + assert metadata is not None schema = LazyIrSchema(func.func) all_types = schema.filtered_types() value_types = schema.filtered_types(values=True, scalars=False) scalar_types = schema.filtered_types(values=False, scalars=True) returns_length = len(schema.returns) - value_types_names = ", ".join([f"{t.name}" for t in value_types]) - get_device_str = f"""auto device = bridge::GetBackendDevice({value_types_names});""" - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) + fallback_str = gen_fallback_code(schema, overload_name=func.func.name.overload_name) + value_types_names = [f"{t.name}" for t in value_types if t.name not in schema.wrapped_scalar_names] + assert len(value_types_names) > 0, "Code below assumes there is at least one tensor arg" + get_device_str = f"""auto common_device = torch::lazy::GetBackendDevice({', '.join(value_types_names)}); + TORCH_INTERNAL_ASSERT(common_device); + """ + + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) node_ctor_input_str = node_ctor_inputs(schema) # call the meta kernel if it exists, to compute output shape/dtype for our IR @@ -174,37 +229,40 @@ class GenLazyNativeFuncDefinition: shapes_str = ','.join([this_shape(i) for i in range(returns_length)]) meta_out = "std::vector shapes{" + shapes_str + "};" + # TODO: INTEGRATION POINT HERE: meta_str = f"""auto out_meta = at::meta::{schema.aten_name}({', '.join(str(t.name) for t in all_types)}); {meta_out}""" else: - shape_sig = ComputeShapeSignature(func) + shape_sig = ComputeShapeSignature(metadata.kernel, func) meta_str = f""" auto shapes = {shape_sig.shape_call};""" + meta_str += f""" TORCH_INTERNAL_ASSERT(shapes.size() == {returns_length});""" node_str = f"""auto node = torch::lazy::MakeNode({node_ctor_input_str}, std::move(shapes));""" + first_tensor_name = value_types_names[0] + bridge_str = """auto result = torch::lazy::CreateAtenFromLtcTensor( + torch::lazy::LazyTensor::Create(std::move(node), *common_device));""" - assert len(value_types) > 0, f"Only supporting tensor ops so far, none found in {sig}" - first_tensor = value_types[0] - bridge_str = f"""auto result = CreateAtenFromLtcTensor(lazy_{first_tensor.name}.CreateFrom(node));""" if returns_length > 1: bridge_str = f"""std::vector<{self.tensor_class}> lazy_tensors; for (int i = 0; i < {returns_length}; i++) {{ - lazy_tensors.push_back(lazy_{first_tensor.name}.CreateFrom(torch::lazy::Value(node, i))); + lazy_tensors.push_back(torch::lazy::LazyTensor::Create(torch::lazy::Value(node, i), *common_device)); }} - auto result = TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" - if schema.name.name.inplace: + auto result = torch::lazy::TupleAtenFromLtcTensors<{returns_length}>(lazy_tensors);""" + + if schema.name.name.inplace or func.func.is_out_fn(): assert returns_length == 1, "We assumed 
there was no such case where an op is an in-place variant " \ "and has tuple outputs." - bridge_str = f"""lazy_{first_tensor.name}.SetInPlaceIrValue(node); - auto& result = {first_tensor.name};""" + bridge_str = f"""lazy_{first_tensor_name}.SetInPlaceIrValue(node); + auto& result = {first_tensor_name};""" return [f"""\ - // TODO(alanwaketan): Quite a lot inefficient copy-by-value there. Let's optimize it. - {sig.decl(name=f"{self.class_method_name}::{schema.aten_name}")} {{ + {sig.decl(name=f"{self.class_method_name}::{metadata.kernel}")} {{ + {fallback_str} TORCH_LAZY_FN_COUNTER("lazy::"); {get_device_str} {lazy_tensor_decls_str} @@ -219,17 +277,17 @@ class ComputeShapeSignature: """ Here we use the base name as the suffix of the signature to avoid generating for in-place variants. """ - @method_with_native_function - def __init__(self, f: NativeFunction): + def __init__(self, kernel_name: str, f: NativeFunction): self.__schema = LazyIrSchema(f.func) self.__dispatch_args = ', '.join([a.decl() for a in dispatcher.arguments(f.func)]) self.__call_args = ", ".join([f"{t.name}" for t in self.__schema.filtered_types()]) + self.__kernel_name = kernel_name def __decl_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__dispatch_args})" + return f"{self.__kernel_name}({self.__dispatch_args})" def __call_suffix(self) -> str: - return f"{self.__schema.base_name}({self.__call_args})" + return f"{self.__kernel_name}({self.__call_args})" @property def shape_decl(self) -> str: @@ -246,19 +304,20 @@ class GenLazyShapeInferenceDefinition: tensor_class: str @method_with_native_function + # def gen_lazy_shape_inference_decl(f: NativeFunction, backend_index: BackendIndex, tensor_class: str) -> List[str]: def __call__(self, f: NativeFunction) -> List[str]: sig = kernel_signature(f, self.backend_index) - - # Lazy IR stuff + metadata = self.backend_index.get_kernel(f) + assert metadata is not None schema = LazyIrSchema(f.func) value_types = schema.filtered_types(values=True, scalars=False) - lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class) + lazy_tensor_decls_str = lazy_tensor_decls(value_types, self.tensor_class, schema) node_ctor_input_str = node_ctor_inputs(schema) # Only generate shape/dtype fn for non-structured kernels, # since we just use the meta function for structured kernels if not f.structured and f.structured_delegate is None: - shape_sig = ComputeShapeSignature(f) + shape_sig = ComputeShapeSignature(metadata.kernel, f) return ["\n".join([f"{shape_sig.shape_decl};"])] else: return [] diff --git a/tools/codegen/dest/lazy_ts_lowering.py b/tools/codegen/dest/lazy_ts_lowering.py index 32d505cda7b..3f7701d5587 100644 --- a/tools/codegen/dest/lazy_ts_lowering.py +++ b/tools/codegen/dest/lazy_ts_lowering.py @@ -18,13 +18,12 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: continue emplace_arguments.append('loctx->GetOutputOp(operand(i++))') continue - emplace_arguments.append(f'"{value.name}", {value.name}_') + emplace_arguments.append(f'"{value.name}", {value.name}') emplace_arguments_str = "\n ".join( [f"arguments.emplace_back({a});" for a in emplace_arguments]) - emplace_kwarg_values = [f'loctx->GetOutputOp(operand({i}))' for i in range(len(schema.keyword_values))] - emplace_kwarg_scalars = [f'"{t.name}", {t.name}_' for t in schema.keyword_scalars] - assert len(schema.keyword_values) == 0, "TODO the logic for operand(i) is broken if there are kw values" + emplace_kwarg_values = [f'"{t.name}", loctx->GetOutputOp(operand(i++))' for t in 
schema.keyword_values] + emplace_kwarg_scalars = [f'"{t.name}", {t.name}' for t in schema.keyword_scalars] emplace_kwarguments = "\n ".join( [f"kwarguments.emplace_back({a});" for a in emplace_kwarg_values + emplace_kwarg_scalars]) return f"""\ @@ -38,6 +37,5 @@ def ts_lowering_body(f: Union[NativeFunctionsGroup, NativeFunction]) -> str: torch::lazy::TSOpVector {schema.aten_name}_out = torch::lazy::LowerTSBuiltin(function, op().op, arguments, kwarguments); CHECK_EQ({schema.aten_name}_out.size(), {len(func.returns)}); - // TODO: need to call GenerateClone sometimes? Or else return LowerBuiltIn() directly return {schema.aten_name}_out; """ diff --git a/tools/codegen/gen_lazy_tensor.py b/tools/codegen/gen_lazy_tensor.py index b2515d3d083..9705620fa2e 100644 --- a/tools/codegen/gen_lazy_tensor.py +++ b/tools/codegen/gen_lazy_tensor.py @@ -3,7 +3,8 @@ import argparse import os import yaml from collections import namedtuple -from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple +from typing import List, Dict, Union, Sequence, Optional, Callable, Iterable, Iterator, Tuple, Type +from tools.codegen.dest.lazy_ir import LazyIR, TSLazyIR from tools.codegen.gen import get_grouped_native_functions, parse_native_yaml from tools.codegen.model import (FunctionSchema, NativeFunction, NativeFunctionsGroup, OperatorName) @@ -60,20 +61,20 @@ def main() -> None: parser.add_argument( '--node_base_hdr', type=str, default=None, help='Path to header file defining custom Lazy IR Node base class') parser.add_argument( - '--tensor_class', type=str, default="LazyTensor", help='Name of backend specific custom Lazy Tensor class') + '--tensor_class', type=str, default="torch::lazy::LazyTensor", help='Name of backend specific custom Lazy Tensor class') parser.add_argument( - '--tensor_class_hdr', type=str, default="lazy_tensor_core/csrc/tensor.h", + '--tensor_class_hdr', type=str, default="torch/csrc/lazy/core/tensor.h", help='Path to header file defining custom Lazy Tensor class') options = parser.parse_args() run(options.source_yaml, options.output_dir, options.dry_run, options.impl_path, options.gen_ts_lowerings, options.node_base, options.node_base_hdr, - options.tensor_class, options.tensor_class_hdr) + options.tensor_class, options.tensor_class_hdr, TSLazyIR) def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[str], gen_ts_lowerings: bool, node_base: str, node_base_hdr: Optional[str], - tensor_class: str, tensor_class_hdr: str) -> None: + tensor_class: str, tensor_class_hdr: str, lazy_ir_cls: Type[LazyIR]) -> None: # Assumes that this file lives at PYTORCH_ROOT/tools/codegen/gen_backend_stubs.py pytorch_root = pathlib.Path(__file__).parent.parent.parent.absolute() @@ -160,11 +161,13 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st fm.write_with_template(f'{backend_key}NativeFunctions.cpp', 'DispatchKeyNativeFunctions.cpp', lambda: { 'includes': [f'#include <{path}>' for path in [ tensor_class_hdr, + "ATen/Functions.h", "ATen/MetaFunctions.h", + "ATen/Operators.h", + "torch/csrc/lazy/core/lazy_graph_executor.h", "torch/csrc/lazy/core/metrics.h", "torch/csrc/lazy/core/shape.h", - "lazy_tensor_core/csrc/aten_ltc_bridge.h", - "lazy_tensor_core/csrc/lazy_graph_executor.h", + "lazy_tensor_core/csrc/ts_backend/aten_eager_fallback.h", f"{output_dir}/{backend_key}NativeFunctions.h", f"{output_dir}/{backend_key}LazyIr.h", f"{output_dir}/{backend_key}ShapeInference.h", @@ -196,7 +199,8 @@ def run(source_yaml: str, output_dir: 
str, dry_run: bool, impl_path: Optional[st 'func_declarations': list(concat_map_codegen( dest.GenLazyShapeInferenceDefinition(backend_indices[backend_key], tensor_class), - grouped_native_functions + grouped_native_functions, + codegenInplaceVariant=True, )), }) # Generate IR node classes @@ -217,7 +221,7 @@ def run(source_yaml: str, output_dir: str, dry_run: bool, impl_path: Optional[st 'DispatchKey': backend_key, 'dispatch_namespace': backend_key.lower(), 'ir_declarations': list(concat_map_codegen( - dest.LazyIR(backend_indices[backend_key], node_base), + lazy_ir_cls(backend_indices[backend_key], node_base), grouped_native_functions )), }) diff --git a/torch/csrc/lazy/core/config.cpp b/torch/csrc/lazy/core/config.cpp index af86dd926d6..b47054913e1 100644 --- a/torch/csrc/lazy/core/config.cpp +++ b/torch/csrc/lazy/core/config.cpp @@ -7,6 +7,11 @@ C10_DEFINE_bool( false, "Enable parameter aliasing support"); +C10_DEFINE_bool( + torch_lazy_use_thread_pool, + false, + "Use thread pool to schedule backend execution"); + C10_DEFINE_int( torch_lazy_compilation_cache_size, 1024, diff --git a/torch/csrc/lazy/core/config.h b/torch/csrc/lazy/core/config.h index beee5b4b214..fa6630123cd 100644 --- a/torch/csrc/lazy/core/config.h +++ b/torch/csrc/lazy/core/config.h @@ -3,6 +3,7 @@ C10_DECLARE_bool(torch_lazy_ir_debug); C10_DECLARE_bool(torch_lazy_param_aliasing); +C10_DECLARE_bool(torch_lazy_use_thread_pool); C10_DECLARE_int(torch_lazy_compilation_cache_size); C10_DECLARE_int(torch_lazy_device_data_cache_size); diff --git a/torch/csrc/lazy/core/ir.cpp b/torch/csrc/lazy/core/ir.cpp index 63e6ee8744c..a1726aacba6 100644 --- a/torch/csrc/lazy/core/ir.cpp +++ b/torch/csrc/lazy/core/ir.cpp @@ -1,6 +1,8 @@ #include #include +C10_DEFINE_bool(ltc_enable_dynamic_shapes, false, "Whether dynamic shape is enabled"); + namespace torch { namespace lazy { @@ -23,6 +25,14 @@ hash_t Value::hash() const { return HashCombine(node->hash(), Hash(index)); } +hash_t Value::hash_with_sizes() const { + return HashCombine(node->hash_with_sizes(), Hash(index)); +} + +hash_t Value::hash_without_sizes() const { + return HashCombine(node->hash_without_sizes(), Hash(index)); +} + OpKind OpKind::Get(const std::string& name) { return OpKind(c10::Symbol::fromQualString(name)); } @@ -31,18 +41,25 @@ hash_t OpKind::hash() const { return StringHash(op.toQualString()); } -Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash) +bool Node::enableDynamicShape() { + static bool enabled = std::getenv("LTC_ENABLE_DYNAMIC_SHAPES") != nullptr; + return enabled || FLAGS_ltc_enable_dynamic_shapes; +} + +Node::Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn) : op_(op), num_outputs_(num_outputs), node_hash_(node_hash), - dag_hash_(dag_hash), + dag_hash_without_sizes_(dag_hash_fn(false)), + dag_hash_with_sizes_(dag_hash_fn(true)), metadata_(GetMetaDataIfDebugging()) {} -Node::Node(OpKind op, size_t num_outputs, hash_t node_hash) +Node::Node(OpKind op, size_t num_outputs, std::function node_hash_fn) : op_(op), num_outputs_(num_outputs), - node_hash_(node_hash), - dag_hash_(node_hash), + node_hash_(node_hash_fn(!enableDynamicShape())), + dag_hash_without_sizes_(node_hash_fn(false)), + dag_hash_with_sizes_(node_hash_fn(true)), metadata_(GetMetaDataIfDebugging()) {} Node::~Node() = default; diff --git a/torch/csrc/lazy/core/ir.h b/torch/csrc/lazy/core/ir.h index 6ca1df8d2fb..4132400bb65 100644 --- a/torch/csrc/lazy/core/ir.h +++ b/torch/csrc/lazy/core/ir.h @@ -15,6 +15,9 @@ #include #include #include 
+#include + +C10_DECLARE_bool(ltc_enable_dynamic_shapes); namespace torch { namespace lazy { @@ -65,9 +68,12 @@ using OutputMap = std::unordered_map; // Represents an input/operand for a Node object. struct TORCH_API Value { Value() = default; - /* implicit */ Value(NodePtr node, size_t index = 0) : node(std::move(node)), index(index) {} + /* implicit */ Value(NodePtr&& node, size_t index = 0) : node(std::move(node)), index(index) {} + /* implicit */ Value(const NodePtr& node, size_t index = 0) : node(node), index(index) {} hash_t hash() const; + hash_t hash_with_sizes() const; + hash_t hash_without_sizes() const; operator bool() const { return node != nullptr; @@ -121,7 +127,6 @@ inline std::ostream& operator<<(std::ostream& stream, const OpKind& op) { using OpList = c10::ArrayRef; - // A node in the graph. Nodes for operations which requires extra data to be // stored for lowering, should inherit from this class and add operation // specific member there. For example, a constant might create a new @@ -130,13 +135,18 @@ using OpList = c10::ArrayRef; // client data handle in it. class TORCH_API Node { public: + static bool enableDynamicShape(); + // Creates a new node with the given op name. The op is a unique identifier // for the operation. The num_outputs tells how many outputs a given operation // generates. - Node(OpKind op, size_t num_outputs, hash_t node_hash, hash_t dag_hash); + // + // None leaf node's node_hash does not contains shape information always. + // So we pass in the hash value rather than a function. + Node(OpKind op, size_t num_outputs, hash_t node_hash, std::function dag_hash_fn); // Contructor used to create leaf nodes. - Node(OpKind op, size_t num_outputs, hash_t node_hash); + Node(OpKind op, size_t num_outputs, std::function node_hash_fn); virtual ~Node(); @@ -157,7 +167,15 @@ class TORCH_API Node { } hash_t hash() const { - return dag_hash_; + return enableDynamicShape() ? dag_hash_without_sizes_ : dag_hash_with_sizes_; + } + + hash_t hash_without_sizes() const { + return dag_hash_without_sizes_; + } + + hash_t hash_with_sizes() const { + return dag_hash_with_sizes_; } const MetaData& metadata() const { @@ -183,8 +201,17 @@ class TORCH_API Node { // The hash value of this node. hash_t node_hash_; - // The hash value of the graph rooted at this node. - hash_t dag_hash_; + // dag_hash represents the hash value of the graph rooted at this node. There are 2 variants, one + // with sizes info and one without. We need 2 such hashes to support dynamic + // shape. Here are the logic to pick the hash in the 2 major scenarios that a hash is needed: + // - shape cache: in this case, we always use the dag hash with size info. This way, looking up the + // shape for one node does not get the shape for another node with the same rank but different sizes + // - lookup the compiled graph by a hash: in this case, we will use the dag hash + // WITHOUT size info if dynamic shape is enabled and use the dag hash WITH size info otherwise. + // The different requirement for the hash in these 2 scenarios forces us to maintain 2 + // different hashes. + hash_t dag_hash_without_sizes_; + hash_t dag_hash_with_sizes_; // The IR specific metadata attached to the IR node. 
MetaData metadata_; // The IR framework user can attach a user defined metadata object deriving diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index 9f504c935e9..3599abb7b8d 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -462,7 +462,7 @@ void LazyGraphExecutor::SyncTensorsGraph( config.sync_ltc_data = sync_ltc_data; auto async = SyncTensorsGraphInternal(tensors, devices, config); - if (wait && async != nullptr) { + if (FLAGS_torch_lazy_use_thread_pool && wait && async != nullptr) { async->mwait.Wait(); } } @@ -972,7 +972,11 @@ std::shared_ptr LazyGraphExecutor:: } }; - ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); + if (FLAGS_torch_lazy_use_thread_pool) { + ScheduleIoClosure(async->mwait.Completer(std::move(syncfn))); + } else { + syncfn(); + } return async; } @@ -995,7 +999,7 @@ std::vector LazyGraphExecutor::GetTensorsFused( SyncTensorsConfig config; config.force_ltc_data = false; auto async = SyncTensorsGraphInternal(tensors, {}, config); - if (async != nullptr) { + if (FLAGS_torch_lazy_use_thread_pool && async != nullptr) { async->mwait.Wait(); } std::vector tensors_data = GatherTensorsData( diff --git a/torch/csrc/lazy/core/shape.cpp b/torch/csrc/lazy/core/shape.cpp index 2b7fd2c74b8..bd5ea5b75c9 100644 --- a/torch/csrc/lazy/core/shape.cpp +++ b/torch/csrc/lazy/core/shape.cpp @@ -28,8 +28,12 @@ size_t Shape::numel() const { return elts; } -hash_t Shape::hash() const { - return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); +hash_t Shape::hash(bool bakeInSizes) const { + if (bakeInSizes) { + return HashCombine(Hash(scalar_type_), DataHash(sizes_.data(), sizes_.size() * sizeof(int64_t))); + } else { + return HashCombine(Hash(scalar_type_), Hash(sizes_.size())); + } } } // namespace lazy diff --git a/torch/csrc/lazy/core/shape.h b/torch/csrc/lazy/core/shape.h index c67ff908833..9b34b90fec0 100644 --- a/torch/csrc/lazy/core/shape.h +++ b/torch/csrc/lazy/core/shape.h @@ -25,7 +25,7 @@ class TORCH_API Shape { int64_t size(int64_t dim) const { return sizes_.at(dim); } void set_size(int64_t dim, int64_t size) { sizes_.at(dim) = size; } size_t numel() const; - hash_t hash() const; + hash_t hash(bool bakeInSizes) const; bool operator==(const Shape& other) const; diff --git a/torch/csrc/lazy/ts_backend/ts_node.cpp b/torch/csrc/lazy/ts_backend/ts_node.cpp index d79dd999f81..a7948e5cbec 100644 --- a/torch/csrc/lazy/ts_backend/ts_node.cpp +++ b/torch/csrc/lazy/ts_backend/ts_node.cpp @@ -28,14 +28,15 @@ void TsNodeSetShapeDeferred( throw std::runtime_error("Expected TsNode but could not dynamic cast"); } -hash_t OperandHashes(const OpList& operands, const hash_t& seed) { +hash_t OperandHashes(const OpList& operands, const hash_t& seed, bool bakeInSizes) { hash_t hash = seed; for (auto& operand : operands) { if (!operand) { hash = HashCombine(hash, static_cast(kNullOpt)); continue; } - hash = HashCombine(hash, operand.hash()); + auto operand_hash = bakeInSizes ? operand.hash_with_sizes() : operand.hash_without_sizes(); + hash = HashCombine(hash, operand_hash); } return hash; } @@ -48,7 +49,7 @@ TsNode::TsNode(OpKind op, OpList operands, std::vector&& shapes, // initialization to a separate function? 
/* node_hash */ HashCombine(op.hash(), hash_seed), /* dag_hash */ - OperandHashes(operands, HashCombine(op.hash(), hash_seed))), + [&](bool bakeInSizes) { return OperandHashes(operands, HashCombine(op.hash(), hash_seed), bakeInSizes); }), shapes_(shapes) { for (auto& operand : operands) { // Ideally, optional operands should be filtered by the leaf node classes, @@ -80,7 +81,7 @@ void TsNode::SetShapeDeferred( } TsNode::TsNode(OpKind op, Shape shape, size_t num_outputs, hash_t hash_seed) - : Node(op, num_outputs, GetOpHash(op, shape, hash_seed)) + : Node(op, num_outputs, [&](bool bakeInSizes) -> hash_t { return GetOpHash(op, shape, hash_seed, bakeInSizes); }) { shapes_.push_back(std::move(shape)); } @@ -98,10 +99,11 @@ ShapeCache* GetShapeCache() { Shape TsNode::GetOpShape( const std::function& shape_fn) const { + auto hash = hash_with_sizes(); ShapeCache* shape_cache = GetShapeCache(); - auto shape = shape_cache->Get(hash()); + auto shape = shape_cache->Get(hash); if (shape == nullptr) { - shape = shape_cache->Add(hash(), + shape = shape_cache->Add(hash, std::make_shared(shape_fn())); } return *shape; @@ -120,8 +122,8 @@ std::string TsNode::ToString() const { return ss.str(); } -hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed) { - hash_t h = HashCombine(op.hash(), shape.hash()); +hash_t TsNode::GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes) { + hash_t h = HashCombine(op.hash(), shape.hash(bakeInSizes)); return HashCombine(h, hash_seed); } diff --git a/torch/csrc/lazy/ts_backend/ts_node.h b/torch/csrc/lazy/ts_backend/ts_node.h index a6595a5337d..156444852d9 100644 --- a/torch/csrc/lazy/ts_backend/ts_node.h +++ b/torch/csrc/lazy/ts_backend/ts_node.h @@ -55,7 +55,7 @@ class TORCH_API TsNode : public lazy::Node { std::string ToString() const override; - static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed); + static hash_t GetOpHash(OpKind op, const Shape& shape, hash_t hash_seed, bool bakeInSizes); const std::vector& operands() const override { return operands_as_outputs_; From 99427654aa86d052420f18b03ee9aa9abcf7e6d0 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 18 Feb 2022 17:54:47 +0000 Subject: [PATCH 159/199] Use "large" macos for binary builds Hopefully it will fix the timeout Pull Request resolved: https://github.com/pytorch/pytorch/pull/73089 --- .circleci/config.yml | 1 + .circleci/verbatim-sources/job-specs/binary-job-specs.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7310a4f853c..aa59ec758f2 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -847,6 +847,7 @@ jobs: <<: *binary_mac_params macos: xcode: "12.0" + resource_class: "large" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout diff --git a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml index 5dd8dab85c9..ab60b0d372d 100644 --- a/.circleci/verbatim-sources/job-specs/binary-job-specs.yml +++ b/.circleci/verbatim-sources/job-specs/binary-job-specs.yml @@ -161,6 +161,7 @@ <<: *binary_mac_params macos: xcode: "12.0" + resource_class: "large" steps: # See Note [Workspace for CircleCI scripts] in job-specs-setup.yml - checkout From 38d37436f21c6bd301bb7fd0467e05b76dcea14d Mon Sep 17 00:00:00 2001 From: Linbin Yu Date: Fri, 18 Feb 2022 17:58:37 +0000 Subject: [PATCH 160/199] Add all 4 android ABIs to android build script by default The android build script 
(./scripts/build_pytorch_android.sh) is broken if user didn't provide the abi list. ./scripts/build_pytorch_android.sh ==> failed due to the so file for armv7 cannot be found ./scripts/build_pytorch_android.sh x86,x86_64 ==> works This is because by default we will build 4 ABIs: https://github.com/pytorch/pytorch/blob/0ca0e02685a9d033ac4f04e2fa5c8ba6dbc5ae50/android/gradle.properties#L1 but only one is provided in this script. Pull Request resolved: https://github.com/pytorch/pytorch/pull/73063 --- android/common.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/android/common.sh b/android/common.sh index ab1cb5ff43c..1fee30bdc38 100644 --- a/android/common.sh +++ b/android/common.sh @@ -29,7 +29,8 @@ check_gradle() { } parse_abis_list() { - ABIS_LIST="x86" + # sync with https://github.com/pytorch/pytorch/blob/0ca0e02685a9d033ac4f04e2fa5c8ba6dbc5ae50/android/gradle.properties#L1 + ABIS_LIST="armeabi-v7a,arm64-v8a,x86,x86_64" CUSTOM_ABIS_LIST=false if [ $# -gt 0 ]; then ABIS_LIST=$1 From 08510ba5e4ae0b53b67f0fbbc9f53b35aec9902c Mon Sep 17 00:00:00 2001 From: Jane Xu Date: Fri, 18 Feb 2022 18:19:03 +0000 Subject: [PATCH 161/199] Disable test history as it's fragile Related to #73083 Pull Request resolved: https://github.com/pytorch/pytorch/pull/73093 --- tools/test/test_test_history.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/test/test_test_history.py b/tools/test/test_test_history.py index 8863c24a5d5..1b8b5c9c60e 100644 --- a/tools/test/test_test_history.py +++ b/tools/test/test_test_history.py @@ -53,6 +53,7 @@ def parse_description(description: str) -> List[Example]: return examples +@unittest.skip("Skipping as this test is fragile, issue #73083") class TestTestHistory(unittest.TestCase): maxDiff = None From a6517c20cf423c9f6e35f77b880bfe98e0ce86e3 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Wed, 16 Feb 2022 18:00:59 -0800 Subject: [PATCH 162/199] [ONNX] Improve Expand shape inference (#69264) Extend shape inference support for `Expand`, when value of argument `shape` is unknown. Infer the rank of the output of `Expand`, and set shape to dynamic, if shape of argument `shape` is known. Without this, shape inference aborts, and falls back to the static shape provided by tracer, which is incorrect in many cases. 
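A typical case is an expand whose target shape comes from a runtime Shape node rather than a constant, for example via `expand_as`. A minimal sketch of such an export (module and axis names are illustrative, not taken from the tests below):

    import io
    import torch

    class ExpandAs(torch.nn.Module):
        def forward(self, x, y):
            # y's sizes are only known at run time, so the exporter can infer
            # the rank of the expanded output but not its concrete sizes.
            return x.expand_as(y)

    torch.onnx.export(
        ExpandAs(), (torch.ones(2, 4), torch.randn(3, 2, 4)), io.BytesIO(),
        input_names=["x", "y"],
        dynamic_axes={"y": {0: "d0", 1: "d1", 2: "d2"}},
    )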
Co-authored-by: BowenBao Pull Request resolved: https://github.com/pytorch/pytorch/pull/72985 --- .../onnx/test_pytorch_onnx_shape_inference.py | 9 +++++++++ .../jit/passes/onnx/shape_type_inference.cpp | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_shape_inference.py b/test/onnx/test_pytorch_onnx_shape_inference.py index 9319670f977..3808de1ec25 100644 --- a/test/onnx/test_pytorch_onnx_shape_inference.py +++ b/test/onnx/test_pytorch_onnx_shape_inference.py @@ -114,5 +114,14 @@ class TestONNXShapeInference(unittest.TestCase): slice = g.op("Slice", input, start_input, end, axis, step) self.run_test(g, slice.node(), expect_tensor(None, shape=(None, None))) + def test_expand(self): + g = self.create_empty_graph() + input = g.addInput() + constant = self.insert_tensor_constant(g, torch.ones(2, 4)) + input.setType(constant.type().with_sizes([None, None])) + shape = g.op("Shape", input) + expand = g.op("Expand", constant, shape) + self.run_test(g, expand.node(), expect_tensor("Float", shape=(None, None))) + if __name__ == '__main__': unittest.main() diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 7219d2d9de6..167e401adfb 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -1374,6 +1374,8 @@ void ComputeConstant(Node* n, int opset_version) { if (input0_shape_size.has_value()) { auto input0_shape_value = input0_shape_size.value(); if (ConstantValueMap::HasValue(n->input(1)->debugName())) { + // When value of `shape` is statically known, + // output shape can be computed. auto shape_temp = ConstantValueMap::GetValueInto1DInt64Vector( n->input(1)->debugName()); auto final_shape = @@ -1381,6 +1383,23 @@ void ComputeConstant(Node* n, int opset_version) { if (final_shape.has_value()) { UpdateShape(n->output(), final_shape.value()); } + } else if ( + auto expand_shape = + ConstantValueMap::GetShapeInto1DInt64VectorWithOneUnknown( + n->input(1)->debugName())) { + // When shape of `shape` is statically known, + // output rank can be computed. + TORCH_INTERNAL_ASSERT( + expand_shape.value().size() == 1, + "`Shape` input to `Expand` should be a 1-D tensor. 
Instead got rank ", + expand_shape.value().size()); + if (expand_shape.value()[0] > 0) { + std::vector final_shape; + for (const auto i : c10::irange(expand_shape.value()[0])) { + final_shape.emplace_back(c10::ShapeSymbol::newSymbol()); + } + UpdateShape(n->output(), c10::SymbolicShape(final_shape)); + } } } } From 32f6a1e2a2e0afb0f94c7f2cfc84fdaae5c90f85 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Thu, 17 Feb 2022 10:45:23 -0800 Subject: [PATCH 163/199] [ONNX] First version of quantized model export: Support quantized.Linear (#69232) Co-authored-by: David Fan Pull Request resolved: https://github.com/pytorch/pytorch/pull/72986 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 26 +- torch/_C/__init__.pyi.in | 3 +- torch/csrc/jit/passes/onnx/helper.h | 7 + torch/csrc/jit/passes/onnx/peephole.cpp | 19 ++ .../jit/passes/onnx/scalar_type_analysis.cpp | 8 +- .../passes/onnx/unpack_quantized_weights.cpp | 246 +++++++++++++----- .../passes/onnx/unpack_quantized_weights.h | 4 +- torch/csrc/onnx/init.cpp | 5 +- torch/onnx/symbolic_helper.py | 4 + torch/onnx/symbolic_opset10.py | 40 +++ torch/onnx/symbolic_opset9.py | 6 + torch/onnx/symbolic_registry.py | 2 +- torch/onnx/utils.py | 35 ++- 13 files changed, 318 insertions(+), 87 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 283968612a0..b912aa78ae6 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -41,6 +41,7 @@ from collections import OrderedDict from torch.nn.utils.rnn import PackedSequence from torch.onnx import CheckerError, register_custom_op_symbolic, unregister_custom_op_symbolic from torch.onnx.symbolic_helper import _unimplemented +from torch.onnx.utils import unpack_quantized_tensor def flatten_tuples(elem): @@ -108,9 +109,16 @@ def inline_flatten_list(inputs, res_list): return res_list +def unpack_to_numpy(value): + value_unpacked = [] + for value_ in value: + value_unpacked.extend(unpack_quantized_tensor(value_)) + value_final = [to_numpy(v) for v in value_unpacked] + return value_final + + def run_ort(ort_sess, input): - input = flatten_tuples(input) - input = to_numpy(input) + input = unpack_to_numpy(flatten_tuples(input)) ort_inputs = dict((ort_sess.get_inputs()[i].name, input) for i, input in enumerate(input)) ort_outs = ort_sess.run(None, ort_inputs) return inline_flatten_list(ort_outs, []) @@ -118,7 +126,7 @@ def run_ort(ort_sess, input): def ort_compare_with_pytorch(ort_outs, output, rtol, atol): output, _ = torch.jit._flatten(output) - outputs = [to_numpy(outp) for outp in output] + outputs = unpack_to_numpy(output) # compare onnxruntime and PyTorch results assert len(outputs) == len(ort_outs), "number of outputs differ" @@ -10256,6 +10264,18 @@ class TestONNXRuntime(unittest.TestCase): loaded_model = onnx.load_from_string(f.getvalue()) self.assertEqual(loaded_model.graph.output[0].type.tensor_type.shape.dim[1].dim_value, 128) + @skipIfUnsupportedMinOpsetVersion(10) + def test_quantized_linear(self): + model = torch.nn.quantized.Linear(1, 2) + input = torch.rand(1, 1) + input_tensor = torch.quantize_per_tensor(input, 1, 0, torch.quint8) + # Currently, we need convert the model to ScriptModule before export. + # The reason is that PackedParams contains int (not tensor). + # Then it fails when the exporter calls _trace_and_get_graph_from_model(). 
+ # TODO: https://msdata.visualstudio.com/Vienna/_workitems/edit/1547858 + self.run_test(torch.jit.trace(model, input_tensor), (input_tensor,)) + self.run_test(torch.jit.script(model), (input_tensor,)) + def make_test(name, base, layer, bidirectional, initial_state, variable_length, dropout, script_test_min_opset_version, **extra_kwargs): diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index ae77710cdbc..60ebbde3030 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -326,7 +326,8 @@ def _jit_pass_onnx_remove_print(graph: Graph) -> None: ... def _jit_pass_onnx_preprocess_caffe2(graph: Graph) -> None: ... def _jit_pass_onnx_unpack_quantized_weights( graph: Graph, - paramsDict: Dict[str, IValue] + paramsDict: Dict[str, IValue], + caffe2: _bool ) -> Dict[str, IValue]: ... def _jit_pass_onnx_quantization_insert_permutes( graph: Graph, diff --git a/torch/csrc/jit/passes/onnx/helper.h b/torch/csrc/jit/passes/onnx/helper.h index 7891ea3d6bc..741eb2f1784 100644 --- a/torch/csrc/jit/passes/onnx/helper.h +++ b/torch/csrc/jit/passes/onnx/helper.h @@ -61,5 +61,12 @@ Node* transformToONNXConcatNode( bool need_new_input, int opset_version); +class ScalarTypeHashFunction { + public: + size_t operator()(const c10::ScalarType& type) const { + return static_cast(type); + } +}; + } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/passes/onnx/peephole.cpp b/torch/csrc/jit/passes/onnx/peephole.cpp index b873bcd6a0b..0282070c748 100644 --- a/torch/csrc/jit/passes/onnx/peephole.cpp +++ b/torch/csrc/jit/passes/onnx/peephole.cpp @@ -761,6 +761,24 @@ static void fuseListConstructListUnpack(Block* b) { } } +// https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter#quantized-model-export +static void eraseTupleConstruct(Block* block) { + size_t index = 0; + // TupleConstruct is generated from the symbolics in quantized domain, and consumed + // by other quantized operators. The remained TupleConstruct should be at the output of the blocks. 
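+  // Each matching output is erased and replaced, in place, by the elements of
+  // its TupleConstruct, so the block returns flattened values rather than a tuple.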
+ for (auto* output : block->outputs()) { + auto output_node = output->node(); + if (output_node->kind() == prim::TupleConstruct) { + block->eraseOutput(index); + size_t input_index = 0; + for (auto* input: output_node->inputs()) { + block->insertOutput(index + (input_index++), input); + } + } + index++; + } +} + void removeMaxPoolUnusedOutput(Block* b) { for (auto it = b->nodes().begin(), end = b->nodes().end(); it != end; ++it) { auto n = *it; @@ -1025,6 +1043,7 @@ void PeepholeOptimizeONNX( fuseListConstructListUnpack(graph->block()); fuseLogSoftmaxNllLoss(graph->block()); eraseListConstruct(graph->block(), opset_version); + eraseTupleConstruct(graph->block()); EliminateDeadCode( graph->block(), true, diff --git a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp index 61b43fa6b7a..31a727c13ba 100644 --- a/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp +++ b/torch/csrc/jit/passes/onnx/scalar_type_analysis.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace torch { namespace jit { @@ -11,13 +12,6 @@ using namespace ::c10::onnx; } namespace { -class ScalarTypeHashFunction { - public: - size_t operator()(const c10::ScalarType& type) const { - return static_cast(type); - } -}; - const int ONNX_OPSET_14 = 14; static const std::unordered_map diff --git a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp index 0367ad6972c..bffb5161327 100644 --- a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp +++ b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.cpp @@ -9,12 +9,9 @@ #include #include -#ifndef AT_PER_OPERATOR_HEADERS +// TODO: Switch to per operator headers after +// https://github.com/pytorch/pytorch/pull/68693 is merged #include -#else -#include -#include -#endif #include @@ -104,7 +101,7 @@ double getScaleFromInput(Node* input_node) { input_name); } -Node* CreateQuantizedWeights( +Node* CreateQuantizedWeightsCaffe2( std::string data, std::shared_ptr& graph, std::vector shapes, @@ -118,7 +115,7 @@ Node* CreateQuantizedWeights( return const_node; } -Node* CreateQuantizedBias( +Node* CreateQuantizedBiasCaffe2( std::vector data, std::shared_ptr& graph, std::vector shapes, @@ -132,6 +129,51 @@ Node* CreateQuantizedBias( return const_node; } +std::vector CreateQuantizedWeights( + std::vector data, + std::shared_ptr& graph, + std::vector shapes, + float scale, + int64_t zero_point) { + Node* const_node_1 = graph->create(prim::Constant); + auto const_value = at::from_blob(data.data(), c10::IntArrayRef(shapes), at::kFloat).to(at::kCPU); + auto options = c10::TensorOptions().dtype(at::kFloat).device(at::kCPU); + at::Tensor const_value_copy = at::empty(c10::IntArrayRef(shapes), options); + const_value.copy_(const_value); + const_node_1->t_(Symbol::attr("value"), const_value_copy); + + Node* const_node_2 = graph->create(prim::Constant); + std::vector scale_v{scale}; + std::vector scale_shapes{1}; + auto const_shape = at::from_blob(scale_v.data(), c10::IntArrayRef(scale_shapes), at::kFloat).to(at::kCPU); + at::Tensor const_shape_copy = at::empty(c10::IntArrayRef(scale_shapes), options); + const_shape_copy.copy_(const_shape); + const_node_2->t_(Symbol::attr("value"), const_shape_copy); + + Node* const_node_3 = graph->create(prim::Constant); + std::vector zero_point_v{zero_point}; + std::vector zero_shapes{1}; + auto const_zero = at::from_blob(zero_point_v.data(), c10::IntArrayRef(zero_shapes), at::kInt).to(at::kCPU); + at::Tensor const_zero_copy = 
at::empty(c10::IntArrayRef(zero_shapes), options); + const_zero_copy.copy_(const_zero); + const_node_3->t_(Symbol::attr("value"), const_zero_copy); + + return {const_node_1, const_node_2, const_node_3}; +} + +Node* CreateQuantizedBias( + std::vector data, + std::shared_ptr& graph, + std::vector shapes) { + Node* const_node_1 = graph->create(prim::Constant); + auto const_bias = at::from_blob(data.data(), c10::IntArrayRef(shapes), at::kFloat).to(at::kCPU); + auto options = c10::TensorOptions().dtype(at::kFloat).device(at::kCPU); + at::Tensor const_bias_copy = at::empty(c10::IntArrayRef(shapes), options); + const_bias_copy.copy_(const_bias); + const_node_1->t_(Symbol::attr("value"), const_bias_copy); + return const_node_1; +} + Node* createIntTuple( const std::vector& is, std::shared_ptr& graph) { @@ -158,7 +200,8 @@ void unpackQuantizedWeightsHelper( std::map& paramsDict, const std::string& pattern, const std::string& unpack_fn, - QuantizedParamsType params_type) { + QuantizedParamsType params_type, + bool caffe2=true) { Graph pattern_graph; std::unordered_map vmap; parseIR(pattern, &pattern_graph, vmap); @@ -368,26 +411,41 @@ void unpackQuantizedWeightsHelper( const int64_t weight_zp = unpacked_weight.q_zero_point() + 128; const int64_t wt_numel = unpacked_weight.numel(); - // Create caffe2::Int8GivenTensorFill node - std::ostringstream os; - for (const auto i : c10::irange(wt_numel)) { - os << static_cast(inp_data[i] + 128); + if (caffe2) { + // Create caffe2::Int8GivenTensorFill node + std::ostringstream os; + for (const auto i : c10::irange(wt_numel)) { + os << static_cast(inp_data[i] + 128); + } + Node* c2_weight = CreateQuantizedWeightsCaffe2( + os.str(), graph, wt_sizes, unpacked_weight.q_scale(), weight_zp); + graph->setInsertPoint(qlinear_node); + c2_weight->insertBefore(qlinear_node); + qlinear_node->insertInput(1, c2_weight->output()); + } else { + std::vector unpacked_weight_values; + unpacked_weight_values.reserve(unpacked_weight.numel()); + auto unpacked_weight_data = reinterpret_cast(unpacked_weight.data_ptr()); + for (const auto i : c10::irange(unpacked_weight.numel())) { + unpacked_weight_values.push_back(static_cast(unpacked_weight_data[i])); + } + std::vector c2_weight = CreateQuantizedWeights( + unpacked_weight_values, graph, wt_sizes, static_cast(unpacked_weight.q_scale()), weight_zp); + graph->setInsertPoint(qlinear_node); + c2_weight[0]->insertBefore(qlinear_node); + qlinear_node->insertInput(1, c2_weight[0]->output()); + c2_weight[1]->insertBefore(qlinear_node); + qlinear_node->insertInput(2, c2_weight[1]->output()); + c2_weight[2]->insertBefore(qlinear_node); + qlinear_node->insertInput(3, c2_weight[2]->output()); } - Node* c2_weight = CreateQuantizedWeights( - os.str(), graph, wt_sizes, unpacked_weight.q_scale(), weight_zp); - graph->setInsertPoint(qlinear_node); - c2_weight->insertBefore(qlinear_node); - qlinear_node->insertInput(1, c2_weight->output()); - // Add bias at::Tensor original_bias; if (bias.has_value()) { original_bias = bias.value(); original_bias.set_requires_grad(false); } else { - // Caffe2 ops always expect bias tensor so if not present create empty - // tensor. 
int64_t bias_size = unpacked_weight.size(0); original_bias = at::zeros(bias_size, unpacked_weight.options().dtype(at::kFloat)); @@ -402,24 +460,42 @@ void unpackQuantizedWeightsHelper( input_val->type()->str()); auto input_node = match_vmap.at(vmap.at("r"))->node()->inputs()[0]->node(); - auto input_scale = getScaleFromInput(input_node); - auto q_bias = at::quantize_per_tensor( - original_bias, weight_scale * input_scale, 0, at::kQInt32); + at::Tensor q_bias; - std::vector bias_values; - bias_values.reserve(q_bias.numel()); - auto bias_data = (int32_t*)q_bias.data_ptr(); - for (const auto i : c10::irange(q_bias.numel())) { - bias_values.push_back(bias_data[i]); + if (caffe2) { + auto input_scale = getScaleFromInput(input_node); + q_bias = at::quantize_per_tensor(original_bias, weight_scale * input_scale, 0, at::kQInt32); + std::vector bias_values; + bias_values.reserve(q_bias.numel()); + auto bias_data = (int32_t*)q_bias.data_ptr(); + for (const auto i : c10::irange(q_bias.numel())) { + bias_values.push_back(bias_data[i]); + } + Node* c2_bias = CreateQuantizedBiasCaffe2( + bias_values, + graph, + q_bias.sizes().vec(), + q_bias.q_scale(), + q_bias.q_zero_point()); + c2_bias->insertBefore(qlinear_node); + qlinear_node->insertInput(2, c2_bias->output()); + } else { + std::vector bias_values(original_bias.numel()); + auto bias_data = original_bias.data_ptr(); + for (const auto i : c10::irange(original_bias.numel())) { + bias_values[i] = bias_data[i]; + } + Node* bias = CreateQuantizedBias( + bias_values, + graph, + original_bias.sizes().vec()); + bias->insertBefore(qlinear_node); + // For quantized_linear inputs, the order is input, weight, bias, .... + // We unpack weight into 3 values. then it is + // input, weight_value, weight_scale, weight_zero_point, bias, ... + // Therefore bias is at location 4. + qlinear_node->insertInput(4, bias->output()); } - Node* c2_bias = CreateQuantizedBias( - bias_values, - graph, - q_bias.sizes().vec(), - q_bias.q_scale(), - q_bias.q_zero_point()); - c2_bias->insertBefore(qlinear_node); - qlinear_node->insertInput(2, c2_bias->output()); // add conv arguments: stride, padding, dilation, groups if (stride.has_value() && padding.has_value() && dilation.has_value() && @@ -444,9 +520,50 @@ void unpackQuantizedWeightsHelper( eraseUnusedValuesFromMap(valsToParamsMap); } } + +static std::unordered_map qTypeToValType = { + {c10::ScalarType::QInt8, c10::ScalarType::Char}, + {c10::ScalarType::QUInt8, c10::ScalarType::Byte}, + {c10::ScalarType::QInt32, c10::ScalarType::Int}, + {c10::ScalarType::QUInt4x2, c10::ScalarType::Byte}, +}; + +// Unpack quantized tensor inputs into {value, scale, zero_point}, +// Then create a prim::TupleConstruct node based on these three values. 
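
The `{value, scale, zero_point}` decomposition used here matches what a per-tensor quantized tensor carries on the Python side; a small eager-mode sketch (the scale and zero-point values are arbitrary):

```python
import torch

x = torch.randn(2, 3)
q = torch.quantize_per_tensor(x, scale=0.1, zero_point=128, dtype=torch.quint8)

value = q.int_repr()                                   # uint8 payload
scale = torch.tensor(q.q_scale(), dtype=torch.double)  # per-tensor scale
zero_point = torch.tensor(q.q_zero_point(), dtype=torch.int64)

# dequantize() reconstructs (value - zero_point) * scale
recon = (value.double() - zero_point) * scale
print(torch.allclose(recon.float(), q.dequantize()))   # True
```
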
+void UnpackQuantizedTensorInputs(std::shared_ptr& graph) { + for (size_t index = 0; index < graph->inputs().size();) { + auto g_input = graph->inputs()[index]; + TensorTypePtr shape_type = g_input->type()->cast(); + if (!shape_type || !shape_type->scalarType().has_value()) { + index++; + continue; + } + auto scalar_type = shape_type->scalarType().value(); + if (qTypeToValType.find(scalar_type) == qTypeToValType.end()) { + index++; + continue; + } + std::string input_name = g_input->debugName(); + auto input_value = graph->insertInput(index, input_name + "_value")->setType(shape_type->withScalarType(qTypeToValType[scalar_type])); + // scale and zero_point type can be found at torch/include/ATen/Operators.h + auto input_scale = graph->insertInput(index + 1, input_name + "_scale")->setType(TensorType::create( + at::kDouble, at::kCPU, 0, /*requires_grad=*/c10::nullopt)); + auto input_zero_point = graph->insertInput(index + 2, input_name + "_zero_point")->setType(TensorType::create( + at::kLong, at::kCPU, 0, /*requires_grad=*/c10::nullopt)); + std::vector converted{input_value, input_scale, input_zero_point}; + auto input_tuple = graph->prependNode(graph->createTuple(converted))->output(); + g_input->replaceAllUsesWith(input_tuple); + // Erase the original quantized tensor input. + graph->eraseInput(index + converted.size()); + index += 3; + } +} + +// https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter#quantized-model-export void UnpackQuantizedWeights( std::shared_ptr& graph, - std::map& paramsDict) { + std::map& paramsDict, + bool caffe2) { std::string qlinear = R"( graph(%input, %packed_weight, %w_scale, %w_zero_point): %r = quantized::linear(%input, %packed_weight, %w_scale, %w_zero_point) @@ -472,31 +589,36 @@ void UnpackQuantizedWeights( paramsDict, qlinear, "quantized::linear_unpack", - QuantizedParamsType::LINEAR); - unpackQuantizedWeightsHelper( - graph, - paramsDict, - qconv2d, - "quantized::conv2d_unpack", - QuantizedParamsType::CONV); - unpackQuantizedWeightsHelper( - graph, - paramsDict, - qconv2d_relu, - "quantized::conv2d_unpack", - QuantizedParamsType::CONV); - unpackQuantizedWeightsHelper( - graph, - paramsDict, - qconv3d, - "quantized::conv3d_unpack", - QuantizedParamsType::CONV); - unpackQuantizedWeightsHelper( - graph, - paramsDict, - qconv3d_relu, - "quantized::conv3d_unpack", - QuantizedParamsType::CONV); + QuantizedParamsType::LINEAR, + caffe2); + if (caffe2) { + unpackQuantizedWeightsHelper( + graph, + paramsDict, + qconv2d, + "quantized::conv2d_unpack", + QuantizedParamsType::CONV); + unpackQuantizedWeightsHelper( + graph, + paramsDict, + qconv2d_relu, + "quantized::conv2d_unpack", + QuantizedParamsType::CONV); + unpackQuantizedWeightsHelper( + graph, + paramsDict, + qconv3d, + "quantized::conv3d_unpack", + QuantizedParamsType::CONV); + unpackQuantizedWeightsHelper( + graph, + paramsDict, + qconv3d_relu, + "quantized::conv3d_unpack", + QuantizedParamsType::CONV); + } else { + UnpackQuantizedTensorInputs(graph); + } GRAPH_DUMP("After UnpackQuantizedWeights: ", graph); } diff --git a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.h b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.h index 6a38c02181d..d9fd6f5eba7 100644 --- a/torch/csrc/jit/passes/onnx/unpack_quantized_weights.h +++ b/torch/csrc/jit/passes/onnx/unpack_quantized_weights.h @@ -2,6 +2,7 @@ #include #include +#include #include @@ -10,7 +11,8 @@ namespace jit { TORCH_API void UnpackQuantizedWeights( std::shared_ptr& graph, - std::map& paramsDict); + std::map& paramsDict, + bool caffe2); 
TORCH_API void insertPermutes( std::shared_ptr& graph, std::map& paramsDict); diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index c7f33f20d26..fce1dd29afa 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -119,8 +119,9 @@ void initONNXBindings(PyObject* module) { .def( "_jit_pass_onnx_unpack_quantized_weights", [](std::shared_ptr& graph, - std::map& paramsDict) { - UnpackQuantizedWeights(graph, paramsDict); + std::map& paramsDict, + bool caffe2) { + UnpackQuantizedWeights(graph, paramsDict, caffe2); return paramsDict; }, pybind11::return_value_policy::move) diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index b20b1929ec3..9adc8397aae 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -132,6 +132,10 @@ def _unpack_list(list_value): assert list_node.kind() == "prim::ListConstruct" return list(list_node.inputs()) +def _unpack_tuple(tuple_value): + tuple_node = tuple_value.node() + assert tuple_node.kind() == "prim::TupleConstruct" + return list(tuple_node.inputs()) # Check if list_value is output from prim::ListConstruct # This is usually called before _unpack_list to ensure the list can be unpacked. diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py index 82709d4e946..0c15875eb58 100644 --- a/torch/onnx/symbolic_opset10.py +++ b/torch/onnx/symbolic_opset10.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import torch from torch.nn.modules.utils import _single, _pair, _triple @@ -9,6 +10,7 @@ import torch.onnx.utils import torch.onnx.symbolic_helper as sym_help from torch.onnx.symbolic_helper import parse_args, _unimplemented import torch.onnx.symbolic_opset9 +from torch.onnx.symbolic_opset9 import linear from sys import maxsize @@ -318,3 +320,41 @@ def isfinite(g, input): inf_node = isinf(g, input) nan_node = isnan(g, input) return __not_(g, __or_(g, inf_node, nan_node)) + + +# https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter#quantized-model-export +class Quantized: + domain = "quantized" + + # DequantizeLinear was added in opset version 10. 
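
The symbolic below lowers `quantized::linear` into a dequantize / `Linear` / quantize pattern. The same decomposition can be checked in eager mode; this is only a sketch (the accessors and input values are illustrative, and the two paths agree only up to quantization rounding):

```python
import torch
import torch.nn.functional as F

m = torch.nn.quantized.Linear(4, 3)
x = torch.quantize_per_tensor(torch.randn(2, 4), scale=0.05, zero_point=128, dtype=torch.quint8)

y_fused = m(x)  # fused quantized kernel

# Reference path: dequantize -> float linear -> requantize with the module's output params.
y_float = F.linear(x.dequantize(), m.weight().dequantize(), m.bias())
y_ref = torch.quantize_per_tensor(y_float, m.scale, m.zero_point, torch.quint8)

print((y_fused.dequantize() - y_ref.dequantize()).abs().max())  # at most ~one output quantization step
```
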
+ @staticmethod + def linear(g, input_original, weight, weight_scale, weight_zero_point, bias, op_scale, op_zero_point): + input_value, input_scale, input_zero_point = sym_help._unpack_tuple(input_original) + # From https://pytorch.org/docs/master/generated/torch.nn.quantized.functional.linear.html + # input (Tensor) – Quantized input of type torch.quint8 + input_type_dq = torch.onnx.TensorProtoDataType.UINT8 + input_value = g.op("Cast", input_value, to_i=input_type_dq) + input_scale = g.op("Cast", input_scale, to_i=torch.onnx.TensorProtoDataType.FLOAT) + input_zero_point = g.op("Cast", input_zero_point, to_i=input_type_dq) + input = g.op("DequantizeLinear", input_value, input_scale, input_zero_point) + # weight (Tensor) – Quantized weight of type torch.qint8 + weight_type_dq = torch.onnx.TensorProtoDataType.INT8 + weight = g.op("Cast", weight, to_i=weight_type_dq) + weight_scale = g.op("Cast", weight_scale, to_i=torch.onnx.TensorProtoDataType.FLOAT) + weight_zero_point = g.op("Cast", weight_zero_point, to_i=weight_type_dq) + weight = g.op("DequantizeLinear", weight, weight_scale, weight_zero_point) + # bias (Tensor) – None or fp32 bias of type torch.float + bias = g.op("Cast", bias, to_i=torch.onnx.TensorProtoDataType.FLOAT) + output = linear(g, input, weight, bias) + + if op_scale is None: + op_scale = input_scale + elif op_scale.type().scalarType() != "Float": + op_scale = g.op("Cast", op_scale, to_i=sym_help.cast_pytorch_to_onnx["Float"]) + + if op_zero_point is None: + op_zero_point = input_zero_point + elif op_zero_point.type().scalarType() != "Byte": + op_zero_point = g.op("Cast", op_zero_point, to_i=sym_help.cast_pytorch_to_onnx["Byte"]) + output = g.op("QuantizeLinear", output, op_scale, op_zero_point) + return g.op("prim::TupleConstruct", output, op_scale, op_zero_point) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index b73ee60b87e..42fdcf1961e 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + import torch from torch._C import ListType, OptionalType from torch.nn.modules.utils import _single, _pair, _triple @@ -3366,6 +3368,10 @@ class Prim: def ListUnpack(g, *inputs, **kwargs): return None + @staticmethod + def TupleConstruct(g, *inputs, **kwargs): + return None + @staticmethod def Uninitialized(g, *inputs, **kwargs): return None diff --git a/torch/onnx/symbolic_registry.py b/torch/onnx/symbolic_registry.py index 09fcab1ba81..2e577222a89 100644 --- a/torch/onnx/symbolic_registry.py +++ b/torch/onnx/symbolic_registry.py @@ -130,7 +130,7 @@ def get_registered_op(opname, domain, version): class UnsupportedOperatorError(RuntimeError): def __init__(self, domain, opname, version): supported_version = get_op_supported_version(opname, domain, version) - if domain in ["", "aten", "prim"]: + if domain in ["", "aten", "prim", "quantized"]: msg = "Exporting the operator " + opname + " to ONNX opset version " + str(version) + " is not supported. 
" if supported_version is not None: msg += "Support for this operator was added in version " + str(supported_version) + \ diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index cb5aa150a0a..b2f605c93cb 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -203,10 +203,10 @@ def _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop=Fa # Caffe2-specific optimization is_caffe2_aten_fallback = (operator_export_type == OperatorExportTypes.ONNX_ATEN_FALLBACK and torch.onnx._CAFFE2_ATEN_FALLBACK) + torch.onnx.symbolic_helper._quantized_ops.clear() + # Unpack quantized weights for conv and linear ops and insert into graph. + torch._C._jit_pass_onnx_unpack_quantized_weights(graph, params_dict, is_caffe2_aten_fallback) if is_caffe2_aten_fallback: - torch.onnx.symbolic_helper._quantized_ops.clear() - # Unpack quantized weights for conv and linear ops and insert into graph. - torch._C._jit_pass_onnx_unpack_quantized_weights(graph, params_dict) # Insert permutes before and after each conv op to ensure correct order. torch._C._jit_pass_onnx_quantization_insert_permutes(graph, params_dict) @@ -471,6 +471,21 @@ def _get_example_outputs(model, args): return example_outputs +_qtype_vtype_map = {torch.quint8: torch.uint8, torch.qint8: torch.int8, torch.qint32: torch.int32, torch.quint4x2: torch.int8} + + +def unpack_quantized_tensor(value): + if isinstance(value, torch.Tensor) and value.dtype in _qtype_vtype_map: + q_value_dequantize = value.dequantize() + q_scale = torch.tensor(value.q_scale(), dtype=torch.double) + q_zero_point = torch.tensor(value.q_zero_point(), dtype=torch.int64) + q_value = q_value_dequantize / q_scale + q_zero_point + q_value = q_value.to(dtype=_qtype_vtype_map[value.dtype]) + return q_value, q_scale, q_zero_point + else: + return (value,) + + def _model_to_graph(model, args, verbose=False, input_names=None, output_names=None, operator_export_type=OperatorExportTypes.ONNX, @@ -505,7 +520,10 @@ def _model_to_graph(model, args, verbose=False, from torch.onnx.symbolic_helper import _onnx_shape_inference if isinstance(model, torch.jit.ScriptModule) or isinstance(model, torch.jit.ScriptFunction): example_outputs = _get_example_outputs(model, args) - out_vars, desc = torch.jit._flatten(tuple(example_outputs)) + example_outputs_final = () + for example_output in example_outputs: + example_outputs_final += unpack_quantized_tensor(example_output) + out_vars, desc = torch.jit._flatten(example_outputs_final) torch._C._jit_pass_onnx_assign_output_shape(graph, out_vars, desc, _onnx_shape_inference) else: flatten_args, _ = torch._C._jit_flatten(args) @@ -619,7 +637,7 @@ def unconvertible_ops(model, args, training=TrainingMode.EVAL, opset_version=Non # as-is rather than cause a failure. 
operator_export_type=OperatorExportTypes.ONNX_FALLTHROUGH) unsupported_ops = list() - supported_namespaces = ("onnx", "prim") + supported_namespaces = ("onnx", "prim", "quantized") for node in graph.nodes(): if node.kind().split(":")[0] not in supported_namespaces: unsupported_ops.append(node.kind()) @@ -1051,11 +1069,8 @@ def _run_symbolic_function(g, block, n, inputs, env, operator_export_type=Operat domain = ns if ns == "aten": domain = "" - if ns == "quantized": - domain = "" - # Caffe2-specific quantized op - if is_caffe2_aten_fallback: - domain = "caffe2" + elif ns == "quantized" and is_caffe2_aten_fallback: + domain = "caffe2" if sym_registry.is_registered_op(op_name, domain, opset_version): symbolic_fn = _find_symbolic_in_registry(domain, op_name, opset_version, operator_export_type) From 5843fea94dde059db913e26b8f345e41976edf87 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Thu, 17 Feb 2022 10:45:23 -0800 Subject: [PATCH 164/199] [ONNX] Add export support for linalg norm (#66575) * Add matrix_norm * Add vector norm * Fixe flake * Fixe flake * nit fixes * Nit fixes * Restructure and add comments Pull Request resolved: https://github.com/pytorch/pytorch/pull/72987 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 122 +++++++++++++++++++++ torch/onnx/symbolic_opset11.py | 12 ++ torch/onnx/symbolic_opset9.py | 87 +++++++++++++++ 3 files changed, 221 insertions(+) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index b912aa78ae6..cfca86679b1 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -6932,6 +6932,128 @@ class TestONNXRuntime(unittest.TestCase): x = torch.randn(2, 3, 5, 5) self.run_test(Det(), x) + def test_linalg_norm(self): + class LinalgSingleDimModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgSingleDimModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord, dim=1) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgSingleDimModel(None), x) + self.run_test(LinalgSingleDimModel(2), x) + self.run_test(LinalgSingleDimModel(float('inf')), x) + self.run_test(LinalgSingleDimModel(-float('inf')), x) + self.run_test(LinalgSingleDimModel(-4), x) + self.run_test(LinalgSingleDimModel(1.5), x) + + class LinalgMultiDimModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgMultiDimModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord, dim=(0, 2)) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgMultiDimModel('fro'), x) + self.run_test(LinalgMultiDimModel(float('inf')), x) + self.run_test(LinalgMultiDimModel(-float('inf')), x) + self.run_test(LinalgMultiDimModel(1), x) + self.run_test(LinalgMultiDimModel(-1), x) + + class LinalgNoDimNoOrdModel(torch.nn.Module): + def forward(self, x): + return torch.linalg.norm(x) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgNoDimNoOrdModel(), x) + y = torch.randn(2, 3) + self.run_test(LinalgNoDimNoOrdModel(), y) + z = torch.randn(2) + self.run_test(LinalgNoDimNoOrdModel(), z) + + class LinalgNoDim1DModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgNoDim1DModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord) + + x = torch.randn(2) + self.run_test(LinalgNoDim1DModel(None), x) + self.run_test(LinalgNoDim1DModel(2), x) + self.run_test(LinalgNoDim1DModel(float('inf')), x) + 
self.run_test(LinalgNoDim1DModel(-float('inf')), x) + self.run_test(LinalgNoDim1DModel(-4), x) + self.run_test(LinalgNoDim1DModel(1.5), x) + + class LinalgNoDim2DModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgNoDim2DModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.norm(x, ord=self.ord) + + x = torch.randn(2, 3) + self.run_test(LinalgNoDim2DModel('fro'), x) + self.run_test(LinalgNoDim2DModel(float('inf')), x) + self.run_test(LinalgNoDim2DModel(-float('inf')), x) + self.run_test(LinalgNoDim2DModel(1), x) + self.run_test(LinalgNoDim2DModel(-1), x) + + @skipIfUnsupportedMinOpsetVersion(11) + def test_linalg_vector_norm_zero(self): + class LinalgVectorNormModel(torch.nn.Module): + def __init__(self, ord_val): + super(LinalgVectorNormModel, self).__init__() + self.ord = ord_val + + def forward(self, x): + return torch.linalg.vector_norm(x, ord=self.ord) + + x = torch.randn(2, 3, 5, 5) + self.run_test(LinalgVectorNormModel(0), x) + + def test_linalg_vector_norm(self): + class LinalgVectorNormModel(torch.nn.Module): + def __init__(self, ord_val, dim_info): + super(LinalgVectorNormModel, self).__init__() + self.ord = ord_val + self.dim, self.keepdim = dim_info + + def forward(self, x): + return torch.linalg.vector_norm(x, ord=self.ord, dim=self.dim, keepdim=self.keepdim) + + x = torch.randn(2, 3, 5, 5) + ord_options = [2, float('inf'), -float('inf'), -4, 1.5] + dim_options = [(None, False), (1, False), ((1, 2), False), ((1, 2), True)] + for ord_val in ord_options: + for dim_info in dim_options: + self.run_test(LinalgVectorNormModel(ord_val, dim_info), x) + + def test_linalg_matrix_norm(self): + class LinalgMatrixNormModel(torch.nn.Module): + def __init__(self, ord_val, dim_val=(-2, -1), keepdim_val=False): + super(LinalgMatrixNormModel, self).__init__() + self.ord = ord_val + self.dim = dim_val + self.keepdim = keepdim_val + + def forward(self, x): + return torch.linalg.matrix_norm(x, ord=self.ord, dim=self.dim, keepdim=self.keepdim) + + x = torch.randn(2, 3, 5, 5) + ord_options = ['fro', float('inf'), -float('inf'), 1, -1] + for ord_val in ord_options: + self.run_test(LinalgMatrixNormModel(ord_val), x) + self.run_test(LinalgMatrixNormModel(ord_val, (0, 2)), x) + self.run_test(LinalgMatrixNormModel(ord_val, (0, 2), True), x) + # This test checks output scalar type in the ONNX graph should not be null # https://github.com/pytorch/pytorch/issues/28607 @skipIfUnsupportedMinOpsetVersion(10) diff --git a/torch/onnx/symbolic_opset11.py b/torch/onnx/symbolic_opset11.py index 6856c65445c..1d85090cec7 100644 --- a/torch/onnx/symbolic_opset11.py +++ b/torch/onnx/symbolic_opset11.py @@ -9,6 +9,7 @@ import warnings from torch.onnx.symbolic_helper import parse_args, _unimplemented, _is_tensor_list, ScalarType from torch.onnx.symbolic_opset9 import expand, unused, mul +from torch.onnx.symbolic_opset9 import linalg_vector_norm as lvn from torch.nn.modules.utils import _single, _pair, _triple from torch.onnx.utils import _add_block, _add_input_to_block, _add_output_to_block @@ -811,6 +812,17 @@ def flatten(g, input, start_dim, end_dim): return sym_help._flatten_helper(g, input, start_dim, end_dim, dim) +@parse_args("v", "f", "is", "i", "v") +def linalg_vector_norm(g, self, ord, dim, keepdim, dtype): + if ord == 0: + if dim is None: + self = sym_help._reshape_helper(g, self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64))) + keepdim = None + cond_op = g.op("Not", g.op("Equal", self, g.op("Constant", value_t=torch.LongTensor([0])))) + 
cond_op = g.op("Cast", cond_op, to_i=sym_help.cast_pytorch_to_onnx["Long"]) + return sym_help._reducesum_helper(g, cond_op, axes_i=dim, keepdims_i=keepdim) + else: + return lvn(g, self, ord, dim, keepdim, dtype) @parse_args("v", "v", "v", "i", "i", "i", "v", "i", "i") def embedding_bag(g, diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 42fdcf1961e..34c41ccf109 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2911,6 +2911,93 @@ def index(g, self, index): return sym_help._reshape_helper(g, self, final_shape) +@parse_args("v", "v", "is", "i", "v") +def linalg_norm(g, self, ord, dim, keepdim, dtype): + # Conditions based on https://pytorch.org/docs/stable/generated/torch.linalg.norm.html + ord_value = None + if dim is None: + if sym_help._is_none(ord): + self = sym_help._reshape_helper(g, self, [-1]) + ord = g.op("Constant", value_t=torch.LongTensor([2])) + self_dim = sym_help._get_tensor_rank(self) + if self_dim is None: + return _unimplemented("dim", + "Input rank must be known at export time.") + if self_dim == 1: + ord_value = sym_help._parse_arg(ord, "f") + else: + dim = [0, 1] + else: + if len(dim) == 1: + if sym_help._is_none(ord): + ord = g.op("Constant", value_t=torch.LongTensor([2])) + ord_value = sym_help._parse_arg(ord, "f") + if ord_value: + return linalg_vector_norm(g, self, ord_value, dim, keepdim, dtype) + return linalg_matrix_norm(g, self, ord, dim, keepdim, dtype) + + +@parse_args("v", "f", "is", "i", "v") +def linalg_vector_norm(g, self, ord, dim, keepdim, dtype): + # Conditions based on https://pytorch.org/docs/stable/generated/torch.linalg.vector_norm.html + if dim is None: + self = sym_help._reshape_helper(g, self, [-1]) + keepdim = None + + if ord == math.inf: + result = g.op("ReduceMax", g.op("Abs", self), axes_i=dim, keepdims_i=keepdim) + elif ord == -math.inf: + result = g.op("ReduceMin", g.op("Abs", self), axes_i=dim, keepdims_i=keepdim) + elif ord == 0: + return sym_help._onnx_opset_unsupported_detailed("linalg_vector_norm", 9, 11, "ord=0 not supported") + else: + ord_op = g.op("Constant", value_t=torch.FloatTensor([ord])) + result = sym_help._reducesum_helper(g, g.op("Pow", g.op("Abs", self), ord_op), + axes_i=dim, keepdims_i=keepdim) + result = g.op("Pow", result, g.op("Div", g.op("Constant", value_t=torch.FloatTensor([1])), ord_op)) + return result + + +@parse_args("v", "v", "is", "i", "v") +def linalg_matrix_norm(g, self, ord, dim, keepdim, dtype): + # Conditions based on https://pytorch.org/docs/stable/generated/torch.linalg.matrix_norm.html + ord_value = sym_help._parse_arg(ord, "s") + if ord_value == 'fro': + return frobenius_norm(g, self, dim, keepdim) + elif ord_value == 'nuc': + return _unimplemented("linalg.matrix_norm", "ord==nuc") + else: + ord_value = sym_help._parse_arg(ord, "f") + if ord_value is None: + return frobenius_norm(g, self, dim, keepdim) + if ord_value == 2 or ord_value == -2: + # ord = 2/-2 unimplemented due to lack of operators + # used to calculate singular values + return _unimplemented("linalg.matrix_norm", "ord==2") + # Wrap the dim vector to handle neagtive dim values + self_dim = sym_help._get_tensor_rank(self) + if self_dim is None: + return _unimplemented("linalg.matrix_norm", + "Input rank must be known at export time.") + # Common implementation for cases with + # ord = 1/-1 and ord = inf/-inf + if dim[0] < 0: + dim[0] += self_dim + if dim[1] < 0: + dim[1] += self_dim + + if ord_value == math.inf or ord_value == -math.inf: + dim[0], dim[1] = dim[1], dim[0] + if 
dim[1] > dim[0] and not keepdim: + dim[1] -= 1 + sum = sym_help._reducesum_helper(g, g.op("Abs", self), axes_i=[dim[0]], keepdims_i=keepdim) + if ord_value > 0: + result, indices = max(g, sum, dim_or_y=g.op("Constant", value_t=torch.LongTensor([dim[1]])), keepdim=keepdim) + else: + result, indices = min(g, sum, dim_or_y=g.op("Constant", value_t=torch.LongTensor([dim[1]])), keepdim=keepdim) + return result + + @parse_args("v", "is", "i") def frobenius_norm(g, self, dim=None, keepdim=False): sqr = g.op("Mul", self, self) From 87f882b056a3b36b9b9b8db7b1e6339b541c4b61 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 18 Feb 2022 10:03:39 -0800 Subject: [PATCH 165/199] Move magma utils to its own header (#73058) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73058 And keep it in cuda/linalg folder to make sure all MAGMA and CUSolver usage in codebase is restricted to linalg Test Plan: Imported from OSS Reviewed By: suo Differential Revision: D34327978 Pulled By: malfet fbshipit-source-id: dd4539a2a76bce68cced94fba943bf8a1155db1e (cherry picked from commit 15d8c9b5dd0955b8c1dd60df7714778f809db8ac) --- aten/src/ATen/native/cuda/MiscUtils.h | 80 ----------------- .../native/cuda/linalg/BatchLinearAlgebra.cpp | 1 + aten/src/ATen/native/cuda/linalg/MagmaUtils.h | 88 +++++++++++++++++++ 3 files changed, 89 insertions(+), 80 deletions(-) create mode 100644 aten/src/ATen/native/cuda/linalg/MagmaUtils.h diff --git a/aten/src/ATen/native/cuda/MiscUtils.h b/aten/src/ATen/native/cuda/MiscUtils.h index 39305f41e64..e616a7d1fcf 100644 --- a/aten/src/ATen/native/cuda/MiscUtils.h +++ b/aten/src/ATen/native/cuda/MiscUtils.h @@ -4,89 +4,9 @@ #include #include -#if AT_MAGMA_ENABLED() -#include -#include -#endif - namespace at { namespace native { -#if AT_MAGMA_ENABLED() - -// RAII for a MAGMA Queue -struct MAGMAQueue { - - // Default constructor without a device will cause - // destroying a queue which has not been initialized. - MAGMAQueue() = delete; - - // Constructor - explicit MAGMAQueue(int64_t device_id) { - cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - // Magma operations is numerically sensitive, so TF32 should be off - // regardless of the global flag. 
- TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode)); - TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); -#endif - magma_queue_create_from_cuda( - device_id, - at::cuda::getCurrentCUDAStream(), - handle, - at::cuda::getCurrentCUDASparseHandle(), - &magma_queue_); - } - - // Getter - magma_queue_t get_queue() const { return magma_queue_; } - - // Destructor - ~MAGMAQueue() { -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - // We've manually set the math mode to CUBLAS_DEFAULT_MATH, now we - // should restore the original math mode back - cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_); - cublasSetMathMode(handle, original_math_mode); -#endif - magma_queue_destroy(magma_queue_); - } - - private: - magma_queue_t magma_queue_; -#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 - cublasMath_t original_math_mode; -#endif -}; - -static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { - auto result = static_cast(value); - if (static_cast(result) != value) { - AT_ERROR("magma: The value of ", varname, "(", (long long)value, - ") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)"); - } - return result; -} - -// MAGMA functions that don't take a magma_queue_t aren't stream safe -// Work around this by synchronizing with the default stream -struct MagmaStreamSyncGuard { - MagmaStreamSyncGuard() { - auto stream = at::cuda::getCurrentCUDAStream(); - if (stream != at::cuda::getDefaultCUDAStream()) { - at::cuda::stream_synchronize(stream); - } - } - - ~MagmaStreamSyncGuard() noexcept(false) { - auto default_stream = at::cuda::getDefaultCUDAStream(); - if (at::cuda::getCurrentCUDAStream() != default_stream) { - at::cuda::stream_synchronize(default_stream); - } - } -}; -#endif - static inline int cuda_int_cast(int64_t value, const char* varname) { auto result = static_cast(value); TORCH_CHECK(static_cast(result) == value, diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index 4c9df8c6196..655892a52f9 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #if AT_MAGMA_ENABLED() diff --git a/aten/src/ATen/native/cuda/linalg/MagmaUtils.h b/aten/src/ATen/native/cuda/linalg/MagmaUtils.h new file mode 100644 index 00000000000..a58cfd9bef9 --- /dev/null +++ b/aten/src/ATen/native/cuda/linalg/MagmaUtils.h @@ -0,0 +1,88 @@ +#pragma once +#include + +#if AT_MAGMA_ENABLED() +#include +#include +#endif + +namespace at { +namespace native { + +#if AT_MAGMA_ENABLED() + +// RAII for a MAGMA Queue +struct MAGMAQueue { + + // Default constructor without a device will cause + // destroying a queue which has not been initialized. + MAGMAQueue() = delete; + + // Constructor + explicit MAGMAQueue(int64_t device_id) { + cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + // Magma operations is numerically sensitive, so TF32 should be off + // regardless of the global flag. 
+ TORCH_CUDABLAS_CHECK(cublasGetMathMode(handle, &original_math_mode)); + TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); +#endif + magma_queue_create_from_cuda( + device_id, + at::cuda::getCurrentCUDAStream(), + handle, + at::cuda::getCurrentCUDASparseHandle(), + &magma_queue_); + } + + // Getter + magma_queue_t get_queue() const { return magma_queue_; } + + // Destructor + ~MAGMAQueue() { +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + // We've manually set the math mode to CUBLAS_DEFAULT_MATH, now we + // should restore the original math mode back + cublasHandle_t handle = magma_queue_get_cublas_handle(magma_queue_); + cublasSetMathMode(handle, original_math_mode); +#endif + magma_queue_destroy(magma_queue_); + } + + private: + magma_queue_t magma_queue_; +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 + cublasMath_t original_math_mode; +#endif +}; + +static inline magma_int_t magma_int_cast(int64_t value, const char* varname) { + auto result = static_cast(value); + if (static_cast(result) != value) { + AT_ERROR("magma: The value of ", varname, "(", (long long)value, + ") is too large to fit into a magma_int_t (", sizeof(magma_int_t), " bytes)"); + } + return result; +} + +// MAGMA functions that don't take a magma_queue_t aren't stream safe +// Work around this by synchronizing with the default stream +struct MagmaStreamSyncGuard { + MagmaStreamSyncGuard() { + auto stream = at::cuda::getCurrentCUDAStream(); + if (stream != at::cuda::getDefaultCUDAStream()) { + at::cuda::stream_synchronize(stream); + } + } + + ~MagmaStreamSyncGuard() noexcept(false) { + auto default_stream = at::cuda::getDefaultCUDAStream(); + if (at::cuda::getCurrentCUDAStream() != default_stream) { + at::cuda::stream_synchronize(default_stream); + } + } +}; +#endif + +} // namespace native +} // namespace at From 02afdd54b9c4afb5808fb54854f3f15edd4a990e Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Fri, 18 Feb 2022 10:15:48 -0800 Subject: [PATCH 166/199] [Static Runtime] Handle fallback graphs that are generated as part of the TE Fuser (#72945) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72945 ghstack-source-id: 149429754 Test Plan: ``` buck run mode/opt //caffe2/benchmarks/static_runtime:static_runtime_cpptest — --gtest_filter=CpuFusion.FallbackGraph ``` Reviewed By: mikeiovine Differential Revision: D34283840 fbshipit-source-id: 868bd340a50fe691797164524f2400d07998d304 (cherry picked from commit 80f60f2cc098e0132ececb321a35a1d3132fe676) --- benchmarks/static_runtime/test_cpu_fusion.cc | 83 ++++++++++++++++++++ torch/csrc/jit/runtime/static/fusion.cpp | 13 +++ 2 files changed, 96 insertions(+) create mode 100644 benchmarks/static_runtime/test_cpu_fusion.cc diff --git a/benchmarks/static_runtime/test_cpu_fusion.cc b/benchmarks/static_runtime/test_cpu_fusion.cc new file mode 100644 index 00000000000..f482b87957c --- /dev/null +++ b/benchmarks/static_runtime/test_cpu_fusion.cc @@ -0,0 +1,83 @@ +#include +#include +#include + +#include "test_utils.h" + +using namespace torch; +using namespace torch::jit; +using namespace torch::jit::test; + +TEST(CpuFusion, Simple) { + const auto simple_script = R"JIT( + def forward(self, a, b): + return (a + b).relu().tanh() + )JIT"; + + Module m("module"); + m.define(simple_script); + + StaticModuleOptions opts; // start with the defaults. 
+ opts.enable_tensorexpr_fusion = true; + + auto input1 = at::randn({2, 3}); + auto input2 = at::ones({2, 3}); + + auto smodule = StaticModule(m, /* is_frozen */ false, opts, {input1, input2}); + StaticRuntime runtime(smodule); + + // Test with sample inputs + { + auto actual = runtime({input1, input2}, {}); + auto expect = at::tanh(at::relu(input1 + input2)); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } + + // Test with different inputs + { + auto new_input1 = at::randn({5, 14}); + auto new_input2 = at::randn({5, 14}); + auto actual = runtime({new_input1, new_input2}, {}); + auto expect = at::tanh(at::relu(new_input1 + new_input2)); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } +} + +TEST(CpuFusion, FallbackGraph) { + const auto simple_script = R"JIT( + def forward(self, a, b): + return (a + b).relu().tanh() + )JIT"; + + Module m("module"); + m.define(simple_script); + + StaticModuleOptions opts; // start with the defaults. + opts.enable_tensorexpr_fusion = true; + + auto sample_input1 = at::randn({2, 3}); + auto sample_input2 = at::ones({2, 3}); + auto smodule = StaticModule( + m, /* is_frozen */ false, opts, {sample_input1, sample_input2}); + + StaticRuntime runtime(smodule); + + // The sample inputs above were contiguous. Now, use a strided input + // to trigger running the fallback graph. + { + auto input1 = at::narrow(at::randn({2, 6}), 1, 0, 3); + auto input2 = at::ones({2, 3}); + auto expect = at::tanh(at::relu(input1 + input2)); + auto actual = runtime({input1, input2}, {}); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } + + // Test with strided inputs of different size. + { + auto input1 = at::narrow(at::randn({10, 30}), 1, 0, 25); + auto input2 = at::randn({10, 25}); + auto expect = at::tanh(at::relu(input1 + input2)); + auto actual = runtime({input1, input2}, {}); + EXPECT_TRUE(at::allclose(expect, actual.toTensor())); + } +} diff --git a/torch/csrc/jit/runtime/static/fusion.cpp b/torch/csrc/jit/runtime/static/fusion.cpp index 556d1bc0b91..038e03c6f2e 100644 --- a/torch/csrc/jit/runtime/static/fusion.cpp +++ b/torch/csrc/jit/runtime/static/fusion.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -322,6 +323,17 @@ void createFusionGroups(Block* block, AliasDb* aliasDb, size_t min_size) { inlineSmallFusionGroups(block, min_size); } +void inlineFallbackGraphs(std::shared_ptr graph) { + DepthFirstGraphNodeIterator it(graph); + + Node* n = nullptr; + while ((n = it.next()) != nullptr) { + if (n->kind() == prim::FallbackGraph) { + SubgraphUtils::unmergeSubgraph(n); + } + } +} + void performTensorExprFusion( std::shared_ptr graph, std::vector sample_inputs) { @@ -335,6 +347,7 @@ void performTensorExprFusion( /*min_group_size*/ 2, /*add_composed_op*/ false, /*fuse_to_dynamic_shapes*/ true); + inlineFallbackGraphs(traced_graph); graph->block()->clear(); graph->block()->cloneFrom(traced_graph->block(), nullptr); GRAPH_DUMP("Graph after fusion: ", graph); From 2724e4c0394fb0a6c3dcd142cf34ef38b2e47875 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Fri, 18 Feb 2022 10:15:48 -0800 Subject: [PATCH 167/199] [Static Runtime] Do not replace with copy variants if TE fuser is enabled (#72946) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72946 The passes to replace with copy variants are run after TensorExpr fusion. Due to this the resulting graph does not conform to the assumptions made in the fuser. 
So, even if these flags `use_copy_variants`, `use_maybe_copy_variants` are turned on, the corresponding passes will not be executed if TensorExpr fusion is enabled. ghstack-source-id: 149429753 Test Plan: Tested locally. Reviewed By: mikeiovine Differential Revision: D34283842 fbshipit-source-id: 74edea517a00c85dff0319f9c8b3ac8befe09018 (cherry picked from commit 3798af7f1b8c9b3c072862f58ebf16af6294db14) --- torch/csrc/jit/runtime/static/impl.cpp | 4 ++-- torch/csrc/jit/runtime/static/impl.h | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp index 9c06bb8c02c..49b7282891f 100644 --- a/torch/csrc/jit/runtime/static/impl.cpp +++ b/torch/csrc/jit/runtime/static/impl.cpp @@ -157,10 +157,10 @@ void OptimizeGraph( // TODO: we can avoid this guard by moving operations // to exposed folders. #ifdef FBCODE_CAFFE2 - if (opts.use_copy_variants) { + if (opts.use_copy_variants && !opts.enable_tensorexpr_fusion) { ReplaceWithCopy(graph); } - if (opts.use_maybe_copy_variants) { + if (opts.use_maybe_copy_variants && !opts.enable_tensorexpr_fusion) { ReplaceWithMaybeCopy(graph); } FuseListUnpack(graph); diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h index 6f3b0d9018a..de9cd367d51 100644 --- a/torch/csrc/jit/runtime/static/impl.h +++ b/torch/csrc/jit/runtime/static/impl.h @@ -166,11 +166,18 @@ struct TORCH_API StaticModuleOptions { bool manage_output_tensors{false}; // Gates the ReplaceWithCopy pass, which replaces ops that // sometimes alias their outputs with out variants that - // always copy (so the output may participate in memory planning) + // always copy (so the output may participate in memory planning). + // Since replacing with copies is done after TensorExpr fusion, the + // resulting graph does not conform to the assumptions made in the fuser. + // So, even if this flag is turned on, the ReplaceWithCopy pass will not + // be executed if TensorExpr fusion is enabled. bool use_copy_variants{true}; // Gates the ReplaceWithMaybeCopy pass, which replaces ops that // sometimes alias their outputs with subgraphs that include an out // variant. + // For the same reason as `use_copy_variants`, the ReplaceWithMaybeCopy pass + // will not be executed if TensorExpr fusion is enabled, even if this flag + // is turned on. bool use_maybe_copy_variants{true}; // enable TensorExpr fusion of ops at model loading time bool enable_tensorexpr_fusion{false}; From 2791725a8431222ba5012a24f96b5106eddab9a3 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Thu, 17 Feb 2022 10:45:23 -0800 Subject: [PATCH 168/199] Integrate full ONNX check into ONNX export API (#71125) Pull Request resolved: https://github.com/pytorch/pytorch/pull/72988 --- torch/_C/__init__.pyi.in | 2 +- torch/csrc/jit/serialization/export.cpp | 8 +++++++- torch/csrc/jit/serialization/export.h | 2 +- torch/csrc/onnx/init.cpp | 5 +++-- torch/onnx/utils.py | 3 ++- 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index 60ebbde3030..a7d80b10dd4 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -410,7 +410,7 @@ def _import_ir_module_from_package( ) -> ScriptModule: ... def _assign_output_shapes(graph: Graph, inputs: List[Tensor]) -> Graph: ... -def _check_onnx_proto(proto: str) -> None: ... +def _check_onnx_proto(proto: str, full_check: _bool = False) -> None: ... 
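
On the Python side no new argument is needed; the full check runs inside `torch.onnx.export`, and failures still surface as `CheckerError`. A minimal sketch of catching it (the module here is a placeholder that is expected to export cleanly):

```python
import io

import torch


class M(torch.nn.Module):
    def forward(self, x):
        return x.relu()


f = io.BytesIO()
try:
    torch.onnx.export(M(), (torch.randn(2, 3),), f, opset_version=11)
except torch.onnx.CheckerError as e:
    # With the full check enabled this also covers ONNX shape-inference failures.
    print("exported proto rejected by the ONNX checker:", e)
```
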
def _propagate_and_assign_input_shapes( graph: Graph, inputs: Tuple[Tensor, ...], diff --git a/torch/csrc/jit/serialization/export.cpp b/torch/csrc/jit/serialization/export.cpp index 065b9336b2e..2c8bd9565ba 100644 --- a/torch/csrc/jit/serialization/export.cpp +++ b/torch/csrc/jit/serialization/export.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -1248,13 +1249,18 @@ std::string serialize_model_proto_to_string( return model_proto->SerializeAsString(); } -void check_onnx_proto(const std::string& proto_string) { +void check_onnx_proto(const std::string& proto_string, bool full_check) { onnx::ModelProto model; if (!ParseProtoFromBytes(&model, proto_string.c_str(), proto_string.size())) { throw std::runtime_error("Invalid ONNX proto string."); return; } onnx::checker::check_model(model); + + if (full_check) { + onnx::shape_inference::InferShapes(model); + } + } } // namespace jit diff --git a/torch/csrc/jit/serialization/export.h b/torch/csrc/jit/serialization/export.h index 17996a8ec05..b76fa941dfa 100644 --- a/torch/csrc/jit/serialization/export.h +++ b/torch/csrc/jit/serialization/export.h @@ -61,7 +61,7 @@ export_onnx( TORCH_API std::string serialize_model_proto_to_string( const std::shared_ptr<::ONNX_NAMESPACE::ModelProto>& model_proto); -TORCH_API void check_onnx_proto(const std::string& proto_string); +TORCH_API void check_onnx_proto(const std::string& proto_string, bool full_check=false); // Serializer for both oldsyle and unified format TorchScript serialization class TORCH_API ScriptModuleSerializer { diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp index fce1dd29afa..74f9c875d65 100644 --- a/torch/csrc/onnx/init.cpp +++ b/torch/csrc/onnx/init.cpp @@ -152,8 +152,9 @@ void initONNXBindings(PyObject* module) { m.def( "_check_onnx_proto", - [](const std::string& proto_string) { check_onnx_proto(proto_string); }, - py::arg("proto_string")); + [](const std::string& proto_string, bool full_check) { check_onnx_proto(proto_string, full_check); }, + py::arg("proto_string"), + py::arg("full_check") = false); auto onnx = m.def_submodule("_onnx"); py::enum_<::ONNX_NAMESPACE::TensorProto_DataType>(onnx, "TensorProtoDataType") diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index b2f605c93cb..d8d5fa8a6d2 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -799,13 +799,14 @@ def _export(model, args, f, export_params=True, verbose=False, training=None, # string in memory. if (operator_export_type is OperatorExportTypes.ONNX) and (not val_use_external_data_format): try: - _check_onnx_proto(proto) + _check_onnx_proto(proto, full_check=True) except RuntimeError as e: raise CheckerError(e) finally: assert __IN_ONNX_EXPORT __IN_ONNX_EXPORT = False _reset_trace_module_map() + return torch_out From 98f9ff90268ae62ab6d794cce0786121bf17edc9 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Thu, 17 Feb 2022 10:45:24 -0800 Subject: [PATCH 169/199] [ONNX] Fix an assertion failure involving Slice (#71965) Before this change, exporting a model to ONNX involving Slice crashes at `axes[i]` in line 153 if C++ assertions are enabled: ``` /usr/include/c++/11.1.0/bits/stl_vector.h:1045: std::vector<_Tp, _Alloc>::reference std::vector<_Tp, _Alloc>::operator[](std::vector<_Tp, _Alloc>::size_type) [with _Tp = long int; _Alloc = std::allocator; std::vector<_Tp, _Alloc>::reference = long int&; std::vector<_Tp, _Alloc>::size_type = long unsigned int]: Assertion '__n < this->size()' failed. 
``` The relevant check is https://github.com/gcc-mirror/gcc/blob/releases/gcc-11.1.0/libstdc++-v3/include/bits/stl_vector.h#L1045, which checks the vector index. The issue can be reproduced by exporting Mask R-CNN or similar ones. For example, ```Python import io import torch import torchvision as tv model = tv.models.detection.maskrcnn_resnet50_fpn(pretrained=False) x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)] with io.BytesIO() as f: torch.onnx.export(model, x, f, opset_version=11) ``` (extracted from [onnxoptimizer tests](https://github.com/onnx/optimizer/blob/master/onnxoptimizer/test/optimizer_test.py)) Tested environment: Arch Linux x86_64 with pytorch and torchvisoin installed from [the official repo](https://github.com/archlinux/svntogit-community/blob/packages/python-pytorch/trunk/PKGBUILD) and [AUR](https://aur.archlinux.org/cgit/aur.git/tree/PKGBUILD?h=python-torchvision), respectively. Pull Request resolved: https://github.com/pytorch/pytorch/pull/72989 --- torch/csrc/jit/passes/onnx/constant_fold.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/passes/onnx/constant_fold.cpp b/torch/csrc/jit/passes/onnx/constant_fold.cpp index 2901a9b8043..e52d77d04c7 100644 --- a/torch/csrc/jit/passes/onnx/constant_fold.cpp +++ b/torch/csrc/jit/passes/onnx/constant_fold.cpp @@ -147,7 +147,7 @@ c10::optional runTorchSlice_opset10( return c10::nullopt; } auto axes_a = inputTensorValues[3].accessor(); - axes.reserve(inputTensorValues[3].sizes()[0]); + axes.resize(inputTensorValues[3].sizes()[0]); // ONNX slice accepts negative axis, fix this for aten op for (const auto i : c10::irange(inputTensorValues[3].sizes()[0])) { axes[i] = axes_a[i] < 0 ? axes_a[i] + inputTensorValues[0].sizes().size() From 956bafef8bcec1a42eca7ad4d359f7df2d8e2650 Mon Sep 17 00:00:00 2001 From: BowenBao Date: Thu, 17 Feb 2022 10:45:24 -0800 Subject: [PATCH 170/199] [onnx export] Add broadcast to matmul shape inference (#70534) Reuse the same broadcast code from the function `ProcessBroadcastNode`. 
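
The inferred shapes follow the same batch-broadcasting rule as `torch.matmul` (and `numpy.matmul`). For reference, an eager-mode check whose shapes mirror the new shape-inference tests added below:

```python
import torch

a = torch.ones(5, 1, 2)
b = torch.ones(3, 1, 2, 1)
print(torch.matmul(a, b).shape)                          # torch.Size([3, 5, 1, 1])

# Rank-1 operands get a dimension inserted and then removed again.
print(torch.matmul(torch.ones(2), b).shape)              # torch.Size([3, 1, 1])
print(torch.matmul(a, torch.ones(2)).shape)              # torch.Size([5, 1])
print(torch.matmul(torch.ones(2), torch.ones(2)).shape)  # torch.Size([])
```
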
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72990 --- .../onnx/test_pytorch_onnx_shape_inference.py | 28 ++++ .../jit/passes/onnx/shape_type_inference.cpp | 122 +++++++++--------- 2 files changed, 90 insertions(+), 60 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_shape_inference.py b/test/onnx/test_pytorch_onnx_shape_inference.py index 3808de1ec25..ecd3641c8fd 100644 --- a/test/onnx/test_pytorch_onnx_shape_inference.py +++ b/test/onnx/test_pytorch_onnx_shape_inference.py @@ -114,6 +114,34 @@ class TestONNXShapeInference(unittest.TestCase): slice = g.op("Slice", input, start_input, end, axis, step) self.run_test(g, slice.node(), expect_tensor(None, shape=(None, None))) + def test_broadcast_matmul(self): + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 5, 1, 1))) + + # test when first input is of rank 1 + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(3, 1, 2, 1)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=(3, 1, 1))) + + # test when second input is of rank 1 + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(5, 1, 2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(2)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=(5, 1))) + + # test when both inputs are of rank 1 + g = self.create_empty_graph() + constant = self.insert_tensor_constant(g, torch.ones(2)) + constant_2 = self.insert_tensor_constant(g, torch.ones(2)) + shape = g.op("MatMul", constant, constant_2) + self.run_test(g, shape.node(), expect_tensor("Float", shape=())) + def test_expand(self): g = self.create_empty_graph() input = g.addInput() diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index 167e401adfb..2f98ffd8a17 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -702,54 +702,59 @@ void SetShapeValueFromListConstructNode(Node* lc_node) { } } +std::vector<::c10::ShapeSymbol> Broadcast(const std::vector<::c10::ShapeSymbol> &input_shape_value_0, + const std::vector<::c10::ShapeSymbol> &input_shape_value_1) { + size_t rank_0 = input_shape_value_0.size(); + size_t rank_1 = input_shape_value_1.size(); + size_t rank_max = std::max(rank_0, rank_1); + size_t rank_min = std::min(rank_0, rank_1); + std::vector<::c10::ShapeSymbol> final_shape; + final_shape.reserve(rank_max); + for (auto idx = 0; idx < rank_max; idx++) { + final_shape.emplace_back(::c10::ShapeSymbol::newSymbol()); + } + for (auto idx = 0; idx < rank_min; idx++) { + const c10::ShapeSymbol& ss_shape_0 = + input_shape_value_0[rank_0 - 1 - idx]; + const c10::ShapeSymbol& ss_shape_1 = + input_shape_value_1[rank_1 - 1 - idx]; + bool is_static_0 = ss_shape_0.is_static(); + bool is_static_1 = ss_shape_1.is_static(); + if (is_static_0 && is_static_1) { + int64_t static_0_sz = ss_shape_0.static_size(); + int64_t static_1_sz = ss_shape_1.static_size(); + final_shape[rank_max - 1 - idx] = ::c10::ShapeSymbol::fromStaticSize( + std::max(static_0_sz, static_1_sz)); + } else if (!is_static_0 && !is_static_1) { + if 
(ss_shape_0.value() == ss_shape_1.value()) { + final_shape[rank_max - 1 - idx] = ss_shape_0; + } + } + } + + if (rank_0 < rank_1) { + for (size_t idx = rank_min; idx < rank_max; idx++) { + size_t shape_idx = rank_max - 1 - idx; + final_shape[shape_idx] = input_shape_value_1[shape_idx]; + } + } else { + for (size_t idx = rank_min; idx < rank_max; idx++) { + size_t shape_idx = rank_max - 1 - idx; + final_shape[shape_idx] = input_shape_value_0[shape_idx]; + } + } + return final_shape; +} + void ProcessBroadcastNode(Node* n) { TORCH_INTERNAL_ASSERT(n->inputs().size() == 2); if (ConstantValueMap::HasShape(n->input(0)->debugName()) && ConstantValueMap::HasShape(n->input(1)->debugName())) { auto input_shape_0 = ConstantValueMap::GetShape(n->input(0)->debugName()); - auto input_shape_value_0 = input_shape_0.value().sizes(); + auto input_shape_value_0 = input_shape_0.value().sizes().value(); auto input_shape_1 = ConstantValueMap::GetShape(n->input(1)->debugName()); - auto input_shape_value_1 = input_shape_1.value().sizes(); - size_t rank_0 = input_shape_value_0.value().size(); - size_t rank_1 = input_shape_value_1.value().size(); - size_t rank_max = std::max(rank_0, rank_1); - size_t rank_min = std::min(rank_0, rank_1); - std::vector<::c10::ShapeSymbol> final_shape; - final_shape.reserve(rank_max); - for (auto idx = 0; idx < rank_max; idx++) { - final_shape.emplace_back(::c10::ShapeSymbol::newSymbol()); - } - for (auto idx = 0; idx < rank_min; idx++) { - const c10::ShapeSymbol& ss_shape_0 = - input_shape_value_0.value()[rank_0 - 1 - idx]; - const c10::ShapeSymbol& ss_shape_1 = - input_shape_value_1.value()[rank_1 - 1 - idx]; - bool is_static_0 = ss_shape_0.is_static(); - bool is_static_1 = ss_shape_1.is_static(); - if (is_static_0 && is_static_1) { - int64_t static_0_sz = ss_shape_0.static_size(); - int64_t static_1_sz = ss_shape_1.static_size(); - final_shape[rank_max - 1 - idx] = ::c10::ShapeSymbol::fromStaticSize( - std::max(static_0_sz, static_1_sz)); - } else if (!is_static_0 && !is_static_1) { - if (ss_shape_0.value() == ss_shape_1.value()) { - final_shape[rank_max - 1 - idx] = ss_shape_0; - } - } - } - - if (rank_0 < rank_1) { - for (auto idx = rank_min; idx < rank_max; idx++) { - auto shape_idx = rank_max - 1 - idx; - final_shape[shape_idx] = input_shape_value_1.value()[shape_idx]; - } - } else { - for (auto idx = rank_min; idx < rank_max; idx++) { - auto shape_idx = rank_max - 1 - idx; - final_shape[shape_idx] = input_shape_value_0.value()[shape_idx]; - } - } - + auto input_shape_value_1 = input_shape_1.value().sizes().value(); + auto final_shape = Broadcast(input_shape_value_0, input_shape_value_1); UpdateShape(n->output(0), c10::SymbolicShape(final_shape)); } } @@ -857,6 +862,8 @@ void ProcessMatMulNode(Node* n) { auto input_shape_value_1 = input_shape_1.sizes().value(); size_t rank_0 = input_shape_value_0.size(); size_t rank_1 = input_shape_value_1.size(); + // Handle inputs of rank 1 just like numpy.matmul: + // https://numpy.org/doc/stable/reference/generated/numpy.matmul.html auto is_rank_0_1 = false; if (rank_0 == 1) { input_shape_value_0.insert( @@ -870,25 +877,20 @@ void ProcessMatMulNode(Node* n) { rank_1 = 2; is_rank_1_1 = true; } - size_t rank = std::max(rank_0, rank_1); - std::vector<::c10::ShapeSymbol> final_shape; - final_shape.reserve(rank); - if (rank_0 >= rank_1) { - for (auto idx = 0; idx < rank_0 - 2; idx++) { - final_shape.emplace_back(input_shape_value_0[idx]); - } - } else { - for (auto idx = 0; idx < rank_1 - 2; idx++) { - 
final_shape.emplace_back(input_shape_value_1[idx]); - } + // Per https://pytorch.org/docs/stable/generated/torch.matmul.html + // the broadcasting logic only applies to the batch dimensions, and not the matrix dimensions + // so we remove the matrix dimensions which are the last 2 dimensions before broadcasting + auto final_shape = Broadcast( + std::vector<::c10::ShapeSymbol>(input_shape_value_0.begin(), input_shape_value_0.end() - 2), + std::vector<::c10::ShapeSymbol>(input_shape_value_1.begin(), input_shape_value_1.end() - 2) + ); + // add the last 2 dimensions back, unless they do not exist in the first place and inserted by this function + // Then apply [n,k]X[k,m]=[n,m], where n=input_shape_value_0[rank_0 - 2], m=input_shape_value_1[rank_1 - 1] + if (!is_rank_0_1) { + final_shape.emplace_back(input_shape_value_0[rank_0 - 2]); } - final_shape.emplace_back(input_shape_value_0[rank_0 - 2]); - final_shape.emplace_back(input_shape_value_1[rank_1 - 1]); - if (is_rank_0_1) { - final_shape.erase(final_shape.begin()); - } - if (is_rank_1_1) { - final_shape.pop_back(); + if (!is_rank_1_1) { + final_shape.emplace_back(input_shape_value_1[rank_1 - 1]); } UpdateShape(n->output(0), c10::SymbolicShape(final_shape)); } From 9c8fb2ee2db3f685b43e4d053fd93624d767236d Mon Sep 17 00:00:00 2001 From: Eli Uriegas Date: Fri, 18 Feb 2022 10:08:00 -0800 Subject: [PATCH 171/199] .github: Consolidate binary checkout logic Consolidates binary checkout logic to use the standard common logic we have in our common templates. Also fixes issues related to pytorch/builder trying to checkout the head commit for pytorch/pytorch instead of checking out the builder commit we actually want Signed-off-by: Eli Uriegas Pull Request resolved: https://github.com/pytorch/pytorch/pull/73092 Signed-off-by: Eli Uriegas --- .github/templates/common.yml.j2 | 11 +- .../linux_binary_build_workflow.yml.j2 | 14 +- .../macos_binary_build_workflow.yml.j2 | 2 +- .../windows_binary_build_workflow.yml.j2 | 24 +- .../generated-linux-binary-conda.yml | 480 ++++++++--- ...erated-linux-binary-libtorch-cxx11-abi.yml | 480 ++++++++--- ...erated-linux-binary-libtorch-pre-cxx11.yml | 480 ++++++++--- .../generated-linux-binary-manywheel.yml | 672 ++++++++++++--- .../generated-macos-arm64-binary-conda.yml | 6 +- .../generated-macos-arm64-binary-wheel.yml | 8 +- .../generated-macos-binary-conda.yml | 8 +- ...erated-macos-binary-libtorch-cxx11-abi.yml | 8 +- ...erated-macos-binary-libtorch-pre-cxx11.yml | 8 +- .../generated-macos-binary-wheel.yml | 8 +- ...ated-windows-binary-libtorch-cxx11-abi.yml | 800 +++++++++++++----- ...ated-windows-binary-libtorch-pre-cxx11.yml | 800 +++++++++++++----- .../generated-windows-binary-wheel.yml | 800 +++++++++++++----- 17 files changed, 3534 insertions(+), 1075 deletions(-) diff --git a/.github/templates/common.yml.j2 b/.github/templates/common.yml.j2 index 182a93de990..1d5fa7267e8 100644 --- a/.github/templates/common.yml.j2 +++ b/.github/templates/common.yml.j2 @@ -6,6 +6,10 @@ {%- set squid_no_proxy = "localhost,127.0.0.1,github.com,amazonaws.com,s3.amazonaws.com,169.254.169.254,169.254.170.2,/var/run/docker.sock" -%} {%- set timeout_minutes = 240 -%} +# NOTE: If testing pytorch/builder changes you can change this variable to change what pytorch/builder reference +# the binary builds will check out +{%- set builder_branch = "main" -%} + {%- macro concurrency(build_environment) -%} concurrency: group: !{{ build_environment }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 
'workflow_dispatch' }} @@ -191,7 +195,9 @@ concurrency: - name: Checkout !{{ 'PyTorch' if repository == "pytorch/pytorch" else repository }} uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - {%- if checkout_pr_head %} + {%- if branch %} + ref: !{{ branch }} + {%- elif checkout_pr_head %} ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} {%- endif %} {%- if deep_clone %} @@ -202,9 +208,6 @@ concurrency: {%- if repository != "pytorch/pytorch" %} repository: !{{ repository }} {%- endif %} - {%- if branch %} - ref: !{{ branch }} - {%- endif %} {%- if directory %} path: !{{ directory }} {%- endif %} diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index ec8b56a8c98..b74699919a9 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -53,7 +53,7 @@ jobs: steps: !{{ common.setup_ec2_linux() }} !{{ common.checkout(deep_clone=False, directory="pytorch") }} - !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", checkout_pr_head=False) }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} {%- if config["gpu_arch_type"] == 'cuda' and config["gpu_arch_version"].startswith('11') %} - name: Set BUILD_SPLIT_CUDA run: | @@ -119,16 +119,8 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: pytorch - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: builder + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} {%- if config["gpu_arch_type"] == "cuda" %} - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 926b7e37740..10b0f6310d2 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -80,7 +80,7 @@ jobs: /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" !{{ common.checkout(deep_clone=False, directory="pytorch") }} - !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Install sccache (only for non-forked PRs, and pushes to trunk) if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} run: | diff --git a/.github/templates/windows_binary_build_workflow.yml.j2 b/.github/templates/windows_binary_build_workflow.yml.j2 index 5f491767c06..df018fc4391 100644 --- a/.github/templates/windows_binary_build_workflow.yml.j2 +++ b/.github/templates/windows_binary_build_workflow.yml.j2 @@ -60,16 +60,8 @@ jobs: steps: !{{ common.setup_ec2_windows() }} !{{ set_runner_specific_vars() }} - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: 
actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Populate binary env shell: bash run: | @@ -104,16 +96,8 @@ jobs: with: name: !{{ config["build_name"] }} path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 - with: - path: ${{ env.PYTORCH_ROOT }} - submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 - with: - repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + !{{ common.checkout(deep_clone=False, directory="pytorch") }} + !{{ common.checkout(deep_clone=False, directory="builder", repository="pytorch/builder", branch=common.builder_branch) }} - name: Populate binary env shell: bash run: | diff --git a/.github/workflows/generated-linux-binary-conda.yml b/.github/workflows/generated-linux-binary-conda.yml index 30d56f1ad6a..1cc737f0897 100644 --- a/.github/workflows/generated-linux-binary-conda.yml +++ b/.github/workflows/generated-linux-binary-conda.yml @@ -111,6 +111,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -248,16 +249,29 @@ jobs: with: name: conda-py3_7-cpu path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -502,6 +516,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -640,16 +655,29 @@ jobs: with: name: conda-py3_7-cuda10_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + 
working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -900,6 +928,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1041,16 +1070,29 @@ jobs: with: name: conda-py3_7-cuda11_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -1301,6 +1343,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1442,16 +1485,29 @@ jobs: with: name: conda-py3_7-cuda11_3 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -1702,6 +1758,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1843,16 +1900,29 @@ jobs: with: name: conda-py3_7-cuda11_5 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -2102,6 +2172,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -2239,16 +2310,29 @@ jobs: with: name: conda-py3_8-cpu path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -2493,6 +2577,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -2631,16 +2716,29 @@ jobs: with: name: conda-py3_8-cuda10_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -2891,6 +2989,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3032,16 +3131,29 @@ jobs: with: name: conda-py3_8-cuda11_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + 
path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -3292,6 +3404,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3433,16 +3546,29 @@ jobs: with: name: conda-py3_8-cuda11_3 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -3693,6 +3819,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3834,16 +3961,29 @@ jobs: with: name: conda-py3_8-cuda11_5 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4093,6 +4233,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -4230,16 +4371,29 @@ jobs: with: name: conda-py3_9-cpu path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -4484,6 +4638,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -4622,16 +4777,29 @@ jobs: with: name: conda-py3_9-cuda10_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4882,6 +5050,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5023,16 +5192,29 @@ jobs: with: name: conda-py3_9-cuda11_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -5283,6 +5465,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: 
builder @@ -5424,16 +5607,29 @@ jobs: with: name: conda-py3_9-cuda11_3 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -5684,6 +5880,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5825,16 +6022,29 @@ jobs: with: name: conda-py3_9-cuda11_5 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6084,6 +6294,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -6221,16 +6432,29 @@ jobs: with: name: conda-py3_10-cpu path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ 
-6475,6 +6699,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -6613,16 +6838,29 @@ jobs: with: name: conda-py3_10-cuda10_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6873,6 +7111,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7014,16 +7253,29 @@ jobs: with: name: conda-py3_10-cuda11_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -7274,6 +7526,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7415,16 +7668,29 @@ jobs: with: name: conda-py3_10-cuda11_3 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder 
path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -7675,6 +7941,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7816,16 +8083,29 @@ jobs: with: name: conda-py3_10-cuda11_5 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml index 24caf3c7f45..80a42c69acd 100644 --- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi.yml @@ -112,6 +112,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -250,16 +251,29 @@ jobs: with: name: libtorch-cpu-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -505,6 +519,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -643,16 +658,29 @@ jobs: with: name: libtorch-cpu-shared-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -898,6 +926,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1036,16 +1065,29 @@ jobs: with: name: libtorch-cpu-static-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -1291,6 +1333,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1429,16 +1472,29 @@ jobs: with: name: libtorch-cpu-static-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -1685,6 +1741,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1824,16 +1881,29 @@ jobs: with: name: 
libtorch-cuda10_2-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -2086,6 +2156,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -2225,16 +2296,29 @@ jobs: with: name: libtorch-cuda10_2-shared-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -2487,6 +2571,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -2626,16 +2711,29 @@ jobs: with: name: libtorch-cuda10_2-static-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install 
nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -2888,6 +2986,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3027,16 +3126,29 @@ jobs: with: name: libtorch-cuda10_2-static-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -3289,6 +3401,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3431,16 +3544,29 @@ jobs: with: name: libtorch-cuda11_1-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -3693,6 +3819,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3835,16 +3962,29 @@ jobs: with: name: libtorch-cuda11_1-shared-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: 
Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4097,6 +4237,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -4239,16 +4380,29 @@ jobs: with: name: libtorch-cuda11_1-static-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4501,6 +4655,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -4643,16 +4798,29 @@ jobs: with: name: libtorch-cuda11_1-static-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4905,6 +5073,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5047,16 +5216,29 @@ jobs: with: name: libtorch-cuda11_3-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ 
github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -5309,6 +5491,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5451,16 +5634,29 @@ jobs: with: name: libtorch-cuda11_3-shared-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -5713,6 +5909,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5855,16 +6052,29 @@ jobs: with: name: libtorch-cuda11_3-static-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6117,6 +6327,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: 
pytorch/builder path: builder @@ -6259,16 +6470,29 @@ jobs: with: name: libtorch-cuda11_3-static-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6521,6 +6745,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -6663,16 +6888,29 @@ jobs: with: name: libtorch-cuda11_5-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6925,6 +7163,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7067,16 +7306,29 @@ jobs: with: name: libtorch-cuda11_5-shared-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous 
checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -7329,6 +7581,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7471,16 +7724,29 @@ jobs: with: name: libtorch-cuda11_5-static-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -7733,6 +7999,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7875,16 +8142,29 @@ jobs: with: name: libtorch-cuda11_5-static-without-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml index 42c8401ed32..cba6c3c4a84 100644 --- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11.yml @@ -112,6 +112,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -250,16 +251,29 @@ jobs: with: name: libtorch-cpu-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -505,6 +519,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -643,16 +658,29 @@ jobs: with: name: libtorch-cpu-shared-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -898,6 +926,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1036,16 +1065,29 @@ jobs: with: name: libtorch-cpu-static-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -1291,6 +1333,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1429,16 +1472,29 @@ jobs: with: name: 
libtorch-cpu-static-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -1685,6 +1741,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1824,16 +1881,29 @@ jobs: with: name: libtorch-cuda10_2-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -2086,6 +2156,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -2225,16 +2296,29 @@ jobs: with: name: libtorch-cuda10_2-shared-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG 
working-directory: pytorch/ run: | @@ -2487,6 +2571,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -2626,16 +2711,29 @@ jobs: with: name: libtorch-cuda10_2-static-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -2888,6 +2986,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3027,16 +3126,29 @@ jobs: with: name: libtorch-cuda10_2-static-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -3289,6 +3401,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3431,16 +3544,29 @@ jobs: with: name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -3693,6 +3819,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3835,16 +3962,29 @@ jobs: with: name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4097,6 +4237,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -4239,16 +4380,29 @@ jobs: with: name: libtorch-cuda11_1-static-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4501,6 +4655,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -4643,16 +4798,29 @@ jobs: with: name: libtorch-cuda11_1-static-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 
'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4905,6 +5073,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5047,16 +5216,29 @@ jobs: with: name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -5309,6 +5491,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5451,16 +5634,29 @@ jobs: with: name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -5713,6 +5909,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder 
@@ -5855,16 +6052,29 @@ jobs: with: name: libtorch-cuda11_3-static-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6117,6 +6327,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -6259,16 +6470,29 @@ jobs: with: name: libtorch-cuda11_3-static-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6521,6 +6745,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -6663,16 +6888,29 @@ jobs: with: name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + 
working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6925,6 +7163,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7067,16 +7306,29 @@ jobs: with: name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -7329,6 +7581,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7471,16 +7724,29 @@ jobs: with: name: libtorch-cuda11_5-static-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -7733,6 +7999,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7875,16 +8142,29 @@ jobs: with: name: libtorch-cuda11_5-static-without-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd 
+ working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | diff --git a/.github/workflows/generated-linux-binary-manywheel.yml b/.github/workflows/generated-linux-binary-manywheel.yml index 5e65a54b54e..128da8fdfe5 100644 --- a/.github/workflows/generated-linux-binary-manywheel.yml +++ b/.github/workflows/generated-linux-binary-manywheel.yml @@ -111,6 +111,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -248,16 +249,29 @@ jobs: with: name: manywheel-py3_7-cpu path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -502,6 +516,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -640,16 +655,29 @@ jobs: with: name: manywheel-py3_7-cuda10_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -900,6 +928,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1041,16 +1070,29 @@ jobs: with: name: manywheel-py3_7-cuda11_1 path: "${{ runner.temp }}/artifacts/" - - 
name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -1301,6 +1343,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1442,16 +1485,29 @@ jobs: with: name: manywheel-py3_7-cuda11_3 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -1702,6 +1758,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -1843,16 +1900,29 @@ jobs: with: name: manywheel-py3_7-cuda11_5 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -2103,6 +2173,7 @@ jobs: - name: Checkout 
pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -2241,16 +2312,29 @@ jobs: with: name: manywheel-py3_7-rocm4_3_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -2496,6 +2580,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -2634,16 +2719,29 @@ jobs: with: name: manywheel-py3_7-rocm4_5_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -2888,6 +2986,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3025,16 +3124,29 @@ jobs: with: name: manywheel-py3_8-cpu path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: 
builder - name: Pull Docker image run: | retry () { @@ -3279,6 +3391,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3417,16 +3530,29 @@ jobs: with: name: manywheel-py3_8-cuda10_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -3677,6 +3803,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -3818,16 +3945,29 @@ jobs: with: name: manywheel-py3_8-cuda11_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4078,6 +4218,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -4219,16 +4360,29 @@ jobs: with: name: manywheel-py3_8-cuda11_3 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + 
ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4479,6 +4633,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -4620,16 +4775,29 @@ jobs: with: name: manywheel-py3_8-cuda11_5 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -4880,6 +5048,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5018,16 +5187,29 @@ jobs: with: name: manywheel-py3_8-rocm4_3_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -5273,6 +5455,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5411,16 +5594,29 @@ jobs: with: name: manywheel-py3_8-rocm4_5_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + 
run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -5665,6 +5861,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -5802,16 +5999,29 @@ jobs: with: name: manywheel-py3_9-cpu path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -6056,6 +6266,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -6194,16 +6405,29 @@ jobs: with: name: manywheel-py3_9-cuda10_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6454,6 +6678,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -6595,16 +6820,29 @@ jobs: with: name: manywheel-py3_9-cuda11_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -6855,6 +7093,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -6996,16 +7235,29 @@ jobs: with: name: manywheel-py3_9-cuda11_3 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -7256,6 +7508,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7397,16 +7650,29 @@ jobs: with: name: manywheel-py3_9-cuda11_5 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -7657,6 +7923,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -7795,16 +8062,29 @@ jobs: with: name: 
manywheel-py3_9-rocm4_3_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -8050,6 +8330,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -8188,16 +8469,29 @@ jobs: with: name: manywheel-py3_9-rocm4_5_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -8442,6 +8736,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -8579,16 +8874,29 @@ jobs: with: name: manywheel-py3_10-cpu path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -8833,6 +9141,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main 
submodules: recursive repository: pytorch/builder path: builder @@ -8971,16 +9280,29 @@ jobs: with: name: manywheel-py3_10-cuda10_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -9231,6 +9553,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -9372,16 +9695,29 @@ jobs: with: name: manywheel-py3_10-cuda11_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -9632,6 +9968,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -9773,16 +10110,29 @@ jobs: with: name: manywheel-py3_10-cuda11_3 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd 
+ working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -10033,6 +10383,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -10174,16 +10525,29 @@ jobs: with: name: manywheel-py3_10-cuda11_5 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG working-directory: pytorch/ run: | @@ -10434,6 +10798,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -10572,16 +10937,29 @@ jobs: with: name: manywheel-py3_10-rocm4_3_1 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { @@ -10827,6 +11205,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -10965,16 +11344,29 @@ jobs: with: name: manywheel-py3_10-rocm4_5_2 path: "${{ runner.temp }}/artifacts/" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: pytorch + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Pull Docker image run: | retry () { diff --git a/.github/workflows/generated-macos-arm64-binary-conda.yml b/.github/workflows/generated-macos-arm64-binary-conda.yml index 303c3607585..dfdbc0ca288 100644 --- a/.github/workflows/generated-macos-arm64-binary-conda.yml +++ b/.github/workflows/generated-macos-arm64-binary-conda.yml @@ -87,7 +87,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -284,7 +284,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -481,7 +481,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-arm64-binary-wheel.yml b/.github/workflows/generated-macos-arm64-binary-wheel.yml index 52156e1e34b..a994f025637 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel.yml @@ -87,7 +87,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -284,7 +284,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -481,7 +481,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -678,7 +678,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-binary-conda.yml b/.github/workflows/generated-macos-binary-conda.yml index 05450b6ec2f..e24744a113b 100644 --- a/.github/workflows/generated-macos-binary-conda.yml +++ b/.github/workflows/generated-macos-binary-conda.yml @@ -85,7 +85,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -282,7 +282,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -479,7 +479,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -676,7 +676,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml index 4403d9309d1..3843f73484a 100644 --- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml +++ b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi.yml @@ -90,7 +90,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -293,7 +293,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -496,7 +496,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -699,7 +699,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml index 1c8692e7632..b92e881e4ec 100644 --- a/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml +++ b/.github/workflows/generated-macos-binary-libtorch-pre-cxx11.yml @@ -90,7 +90,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -293,7 +293,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder 
path: builder @@ -496,7 +496,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -699,7 +699,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-macos-binary-wheel.yml b/.github/workflows/generated-macos-binary-wheel.yml index 8b36a0e1db0..5790d6c547c 100644 --- a/.github/workflows/generated-macos-binary-wheel.yml +++ b/.github/workflows/generated-macos-binary-wheel.yml @@ -85,7 +85,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -282,7 +282,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -479,7 +479,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder @@ -676,7 +676,7 @@ jobs: - name: Checkout pytorch/builder uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + ref: main submodules: recursive repository: pytorch/builder path: builder diff --git a/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml b/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml index f1ff574a1f7..da135465034 100644 --- a/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml +++ b/.github/workflows/generated-windows-binary-libtorch-cxx11-abi.yml @@ -80,16 +80,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git 
clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -169,16 +182,29 @@ jobs: with: name: libtorch-cpu-shared-with-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -364,16 +390,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -453,16 +492,29 @@ jobs: with: name: libtorch-cpu-shared-without-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -648,16 +700,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo 
"PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -737,16 +802,29 @@ jobs: with: name: libtorch-cpu-static-with-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -932,16 +1010,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1021,16 +1112,29 @@ jobs: with: name: libtorch-cpu-static-without-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1217,16 +1321,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1307,16 +1424,29 @@ jobs: with: name: libtorch-cuda11_1-shared-with-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1504,16 +1634,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 
'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1594,16 +1737,29 @@ jobs: with: name: libtorch-cuda11_1-shared-without-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1791,16 +1947,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1881,16 +2050,29 @@ jobs: with: name: libtorch-cuda11_1-static-with-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + 
# Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2078,16 +2260,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2168,16 +2363,29 @@ jobs: with: name: libtorch-cuda11_1-static-without-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2365,16 +2573,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder 
+ uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2455,16 +2676,29 @@ jobs: with: name: libtorch-cuda11_3-shared-with-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2652,16 +2886,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2742,16 +2989,29 @@ jobs: with: name: libtorch-cuda11_3-shared-without-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean 
pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2939,16 +3199,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3029,16 +3302,29 @@ jobs: with: name: libtorch-cuda11_3-static-with-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3226,16 +3512,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - 
name: Populate binary env shell: bash run: | @@ -3316,16 +3615,29 @@ jobs: with: name: libtorch-cuda11_3-static-without-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3513,16 +3825,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3603,16 +3928,29 @@ jobs: with: name: libtorch-cuda11_5-shared-with-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3800,16 +4138,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> 
"${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3890,16 +4241,29 @@ jobs: with: name: libtorch-cuda11_5-shared-without-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4087,16 +4451,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4177,16 +4554,29 @@ jobs: with: name: libtorch-cuda11_5-static-with-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4374,16 +4764,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4464,16 +4867,29 @@ jobs: with: name: libtorch-cuda11_5-static-without-deps-cxx11-abi path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | diff --git a/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml b/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml index e09c0f8052c..a5d95ec1d73 100644 --- a/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml +++ b/.github/workflows/generated-windows-binary-libtorch-pre-cxx11.yml @@ -80,16 +80,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo 
"PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -169,16 +182,29 @@ jobs: with: name: libtorch-cpu-shared-with-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -364,16 +390,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -453,16 +492,29 @@ jobs: with: name: libtorch-cpu-shared-without-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -648,16 +700,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -737,16 +802,29 @@ jobs: with: name: libtorch-cpu-static-with-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -932,16 +1010,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 
'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1021,16 +1112,29 @@ jobs: with: name: libtorch-cpu-static-without-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1217,16 +1321,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1307,16 +1424,29 @@ jobs: with: name: libtorch-cuda11_1-shared-with-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # 
Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1504,16 +1634,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1594,16 +1737,29 @@ jobs: with: name: libtorch-cuda11_1-shared-without-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1791,16 +1947,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + 
uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1881,16 +2050,29 @@ jobs: with: name: libtorch-cuda11_1-static-with-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2078,16 +2260,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2168,16 +2363,29 @@ jobs: with: name: libtorch-cuda11_1-static-without-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean 
pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2365,16 +2573,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2455,16 +2676,29 @@ jobs: with: name: libtorch-cuda11_3-shared-with-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2652,16 +2886,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - 
name: Populate binary env shell: bash run: | @@ -2742,16 +2989,29 @@ jobs: with: name: libtorch-cuda11_3-shared-without-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2939,16 +3199,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3029,16 +3302,29 @@ jobs: with: name: libtorch-cuda11_3-static-with-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3226,16 +3512,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> 
"${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3316,16 +3615,29 @@ jobs: with: name: libtorch-cuda11_3-static-without-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3513,16 +3825,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3603,16 +3928,29 @@ jobs: with: name: libtorch-cuda11_5-shared-with-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3800,16 +4138,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3890,16 +4241,29 @@ jobs: with: name: libtorch-cuda11_5-shared-without-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4087,16 +4451,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 
'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4177,16 +4554,29 @@ jobs: with: name: libtorch-cuda11_5-static-with-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4374,16 +4764,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4464,16 +4867,29 @@ jobs: with: name: libtorch-cuda11_5-static-without-deps-pre-cxx11 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + 
# Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | diff --git a/.github/workflows/generated-windows-binary-wheel.yml b/.github/workflows/generated-windows-binary-wheel.yml index afce9a010bb..c8e57303c4f 100644 --- a/.github/workflows/generated-windows-binary-wheel.yml +++ b/.github/workflows/generated-windows-binary-wheel.yml @@ -76,16 +76,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -161,16 +174,29 @@ jobs: with: name: wheel-py3_7-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -349,16 +375,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone 
pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -435,16 +474,29 @@ jobs: with: name: wheel-py3_7-cuda11_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -624,16 +676,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -710,16 +775,29 @@ jobs: with: name: wheel-py3_7-cuda11_3 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: 
zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -899,16 +977,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -985,16 +1076,29 @@ jobs: with: name: wheel-py3_7-cuda11_5 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1173,16 +1277,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT 
}} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1258,16 +1375,29 @@ jobs: with: name: wheel-py3_8-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1446,16 +1576,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1532,16 +1675,29 @@ jobs: with: name: wheel-py3_8-cuda11_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1721,16 +1877,29 @@ jobs: echo 
"BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1807,16 +1976,29 @@ jobs: with: name: wheel-py3_8-cuda11_3 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -1996,16 +2178,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2082,16 +2277,29 @@ jobs: with: name: wheel-py3_8-cuda11_5 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch 
+ uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2270,16 +2478,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2355,16 +2576,29 @@ jobs: with: name: wheel-py3_9-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2543,16 +2777,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && 
github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2629,16 +2876,29 @@ jobs: with: name: wheel-py3_9-cuda11_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2818,16 +3078,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -2904,16 +3177,29 @@ jobs: with: name: wheel-py3_9-cuda11_3 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd 
+ working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3093,16 +3379,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3179,16 +3478,29 @@ jobs: with: name: wheel-py3_9-cuda11_5 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3367,16 +3679,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + 
submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3452,16 +3777,29 @@ jobs: with: name: wheel-py3_10-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3640,16 +3978,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -3726,16 +4077,29 @@ jobs: with: name: wheel-py3_10-cuda11_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate 
binary env shell: bash run: | @@ -3915,16 +4279,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4001,16 +4378,29 @@ jobs: with: name: wheel-py3_10-cuda11_3 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4190,16 +4580,29 @@ jobs: echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" - - name: Clone pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | @@ -4276,16 +4679,29 @@ jobs: with: name: wheel-py3_10-cuda11_5 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - - name: Clone 
pytorch/pytorch - uses: actions/checkout@v2 + - name: Checkout PyTorch + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: - path: ${{ env.PYTORCH_ROOT }} + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} submodules: recursive - - name: Clone pytorch/builder - uses: actions/checkout@v2 + path: pytorch + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: zhouzhuojie/checkout@05b13c9a0d21f08f6d5e64a1d5042246d13619d9 with: + ref: main + submodules: recursive repository: pytorch/builder - path: ${{ env.BUILDER_ROOT }} + path: builder + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder - name: Populate binary env shell: bash run: | From 7fe3f334fb4d2c4faf9658505bcaf2ce37a9cfca Mon Sep 17 00:00:00 2001 From: albanD Date: Fri, 18 Feb 2022 10:39:03 -0800 Subject: [PATCH 172/199] Remove call into python API without GIL being held in c10d (#72928) Summary: Fix https://github.com/pytorch/pytorch/issues/26475 Pull Request resolved: https://github.com/pytorch/pytorch/pull/72928 Reviewed By: mikaylagawarecki Differential Revision: D34317697 Pulled By: albanD fbshipit-source-id: e13efb98e8c6bf4cbc05181c028d68871a844bf7 (cherry picked from commit c0e0397688a6c56801c8efabad6401ae3c247e4f) --- torch/csrc/distributed/c10d/init.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 2bfb0dc12ef..3b1eb0a9866 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -650,11 +650,13 @@ Example:: .def( "get", [](::c10d::Store& store, const std::string& key) -> py::bytes { - auto value = store.get(key); + auto value = [&]() { + py::gil_scoped_release guard; + return store.get(key); + }(); return py::bytes( reinterpret_cast(value.data()), value.size()); }, - py::call_guard(), R"( Retrieves the value associated with the given ``key`` in the store. If ``key`` is not present in the store, the function will wait for ``timeout``, which is defined From 1646a0033dda77eca980c40870524996b4ad1f76 Mon Sep 17 00:00:00 2001 From: Stephen Oakley Date: Fri, 18 Feb 2022 11:23:12 -0800 Subject: [PATCH 173/199] Use irange in PyTorch (#72836) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72836 Replacing increment iterator loops with ranged loops. It allows loops such as for(int i=0;i<10;i++) to be expressed as for(const auto i : c10::irange(10)). This auto-types the loops and adds const-safety to the iteration variable. 
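
To make the pattern above concrete, here is a minimal sketch of the before/after shape of such a loop. It is illustrative only: the helper function and values are hypothetical and not taken from the hunks below, and it assumes the c10 headers (c10/util/irange.h) are available as they are inside the PyTorch tree.

    #include <c10/util/irange.h>
    #include <cstdint>
    #include <vector>

    // Hypothetical helper, used only to show the loop rewrite.
    int64_t sum_of_squares(const std::vector<int64_t>& values) {
      int64_t total = 0;
      // Old style:  for (size_t i = 0; i < values.size(); i++) { ... }
      // New style: the index type is deduced from the bound, and the
      // iteration variable is const, so the loop body cannot mutate it.
      for (const auto i : c10::irange(values.size())) {
        total += values[i] * values[i];
      }
      return total;
    }

The hunks below apply this same rewrite mechanically to existing loops.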
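
A note on the previous patch in this series (172/199, #72928, the c10d Store "get" binding above): the fix scopes the GIL release to just the blocking native call by wrapping it in an immediately-invoked lambda, and drops the py::call_guard argument (a call guard would wrap the entire bound call). That way the GIL is held again by the time the py::bytes result is constructed, which calls back into CPython. A minimal pybind11 sketch of that pattern follows; BlockingStore is a hypothetical stand-in, not the real c10d class.

    #include <pybind11/pybind11.h>
    #include <cstdint>
    #include <string>
    #include <vector>

    namespace py = pybind11;

    // Hypothetical store whose get() may block (e.g. on the network).
    struct BlockingStore {
      std::vector<uint8_t> get(const std::string& /*key*/) {
        // Imagine a long, blocking wait here; the GIL must not be held.
        return {1, 2, 3};
      }
    };

    PYBIND11_MODULE(gil_example, m) {
      py::class_<BlockingStore>(m, "BlockingStore")
          .def(py::init<>())
          .def("get", [](BlockingStore& store, const std::string& key) -> py::bytes {
            // Release the GIL only around the blocking call ...
            auto value = [&]() {
              py::gil_scoped_release guard;
              return store.get(key);
            }();
            // ... so it is held again here, where a Python object is built.
            return py::bytes(
                reinterpret_cast<const char*>(value.data()), value.size());
          });
    }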
Reviewed By: albanD Differential Revision: D34136539 fbshipit-source-id: 760a70ad43ce6f05630ba8fea261d4dbb699e62e (cherry picked from commit 0428408d883ef81f3a548ab2b26dd58ce634bcb1) --- torch/csrc/autograd/profiler_kineto.cpp | 2 +- torch/csrc/cuda/nccl.cpp | 6 ++---- torch/csrc/deploy/remove_dt_needed.cpp | 3 ++- torch/csrc/lazy/core/tensor_impl.cpp | 3 ++- torch/csrc/lazy/core/view_ops/squeeze.cpp | 3 ++- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index b8bba50c406..206bd52f6d7 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -406,7 +406,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { py_event_indices_{ { nullptr, std::string("null") }}; - for (size_t i = 0; i < py_events.size(); i++) { + for (const auto i : c10::irange(py_events.size())) { py_event_indices_.insert({py_events[i].get(), std::to_string(i)}); } diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index ae61392ab54..a67ca21ab35 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -833,8 +833,7 @@ void gather( if (cur_rank == root) { - for (int r = 0; r < numranks; r++) - { + for (const auto r : c10::irange(numranks)) { if (r != root) { auto* recvbuff = reinterpret_cast(outputs[r].data_ptr()); NCCL_CHECK(ncclRecv(recvbuff, count, type, r, comm, stream)); @@ -874,8 +873,7 @@ void scatter( NCCL_CHECK(ncclGroupStart()); if (cur_rank == root) { - for (int r = 0; r < numranks; r++) - { + for (const auto r : c10::irange(numranks)) { if (r != root) { size_t send_count = inputs[r].numel(); auto send_type = to_nccl_data_type(inputs[r]); diff --git a/torch/csrc/deploy/remove_dt_needed.cpp b/torch/csrc/deploy/remove_dt_needed.cpp index 5f4bb28c7c2..8b1cad53581 100644 --- a/torch/csrc/deploy/remove_dt_needed.cpp +++ b/torch/csrc/deploy/remove_dt_needed.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #define ERROR(msg_fmt, ...) \ @@ -47,7 +48,7 @@ int main(int argc, const char** argv) { auto program_headers = (Elf64_Phdr*)(data + header->e_phoff); auto n_program_headers = header->e_phnum; Elf64_Dyn* dynamic = nullptr; - for (size_t i = 0; i < n_program_headers; ++i) { + for (const auto i : c10::irange(n_program_headers)) { const Elf64_Phdr* phdr = &program_headers[i]; if (phdr->p_type == PT_DYNAMIC) { dynamic = reinterpret_cast(data + phdr->p_offset); diff --git a/torch/csrc/lazy/core/tensor_impl.cpp b/torch/csrc/lazy/core/tensor_impl.cpp index f0f09256da4..b6d148a568f 100644 --- a/torch/csrc/lazy/core/tensor_impl.cpp +++ b/torch/csrc/lazy/core/tensor_impl.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace torch { @@ -144,7 +145,7 @@ void LTCTensorImpl::setup_size_properties() { // We can't call empty_tensor_restride(c10::MemoryFormat::Contiguous) given we override sizes() too. 
std::vector updated_strides; updated_strides = ComputeArrayStrides(shape.Get().sizes()); - for (int i = 0; i < updated_strides.size(); i++) { + for (const auto i : c10::irange(updated_strides.size())) { sizes_and_strides_.stride_at_unchecked(i) = updated_strides[i]; } generation_ = generation; diff --git a/torch/csrc/lazy/core/view_ops/squeeze.cpp b/torch/csrc/lazy/core/view_ops/squeeze.cpp index 3f020a2137d..a37524462d2 100644 --- a/torch/csrc/lazy/core/view_ops/squeeze.cpp +++ b/torch/csrc/lazy/core/view_ops/squeeze.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -9,7 +10,7 @@ namespace lazy { std::vector BuildSqueezedDimensions(c10::ArrayRef dimensions, int64_t squeeze_dim) { std::vector output_dimensions; - for (int64_t i = 0; i < dimensions.size(); ++i) { + for (const auto i : c10::irange(dimensions.size())) { int64_t dim = dimensions[i]; if (dim != 1 || (i != squeeze_dim && squeeze_dim >= 0)) { output_dimensions.push_back(dim); From 79a216ce576521aebb913d7c5e72e1f36fb04a86 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 18 Feb 2022 12:37:56 -0800 Subject: [PATCH 174/199] Move native MHA code out of PyTorch core (#72944) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72944 Doesn't make sense to develop it in core right now. ghstack-source-id: 149456040 Test Plan: CI run MHA benchmark in benchmark_transformers.py to make sure it doesn't crash Reviewed By: zrphercule Differential Revision: D34283104 fbshipit-source-id: 4f0c7a6bc066f938ceac891320d4cf4c3f8a9cd6 (cherry picked from commit b9df65e97cc357f2f7c3f3c114266fae33d2332a) --- aten/src/ATen/native/attention.cpp | 339 ----------------- aten/src/ATen/native/cuda/attention.cu | 342 ------------------ aten/src/ATen/native/native_functions.yaml | 10 - .../check_forward_backward_compatibility.py | 2 - test/test_nn.py | 38 -- tools/build_variables.bzl | 1 - torch/overrides.py | 1 - 7 files changed, 733 deletions(-) delete mode 100644 aten/src/ATen/native/attention.cpp delete mode 100644 aten/src/ATen/native/cuda/attention.cu diff --git a/aten/src/ATen/native/attention.cpp b/aten/src/ATen/native/attention.cpp deleted file mode 100644 index 5db27b67d4e..00000000000 --- a/aten/src/ATen/native/attention.cpp +++ /dev/null @@ -1,339 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include - -namespace at { - -namespace native { - -namespace { - -Tensor gemm_nt(const Tensor& a, const Tensor& b) { - return at::native::matmul(a, b.t()); -} - -template -void transform_bias_rescale_qkv_inner_loop( - int64_t B, - int64_t T, - int64_t _3D, - int64_t D, - int64_t num_head, - int64_t dim_per_head, - scalar_t* qkv_data, - scalar_t* qkv_bias_data, - scalar_t* q_k_v_data, - scalar_t sqrt_dim_per_head, - int64_t begin, - int64_t end) { - for (auto i : c10::irange(begin, end)) { - auto t = i % T; - i /= T; - auto nh = i % num_head; - i /= num_head; - auto b = i; - using Vec = vec::Vectorized; - auto V = vec::Vectorized::size(); - auto dh = 0; - auto d = nh * dim_per_head; - for (; dh + V <= dim_per_head; dh += V, d += V) { - // load - auto q_bias_data = Vec::loadu(&qkv_bias_data[d + 0 * D]); - auto k_bias_data = Vec::loadu(&qkv_bias_data[d + 1 * D]); - auto v_bias_data = Vec::loadu(&qkv_bias_data[d + 2 * D]); - - auto q_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 0 * D]) + - q_bias_data; - auto k_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 1 * D]) + - k_bias_data; - auto v_data = - Vec::loadu(&qkv_data[b * _3D * T + t * _3D + d + 2 * D]) + - v_bias_data; - - q_data = q_data 
/ Vec(sqrt_dim_per_head); - - q_data.store(&q_k_v_data - [0 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh]); - k_data.store(&q_k_v_data - [1 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh]); - v_data.store(&q_k_v_data - [2 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh]); - } - for (; dh < dim_per_head; dh++) { - auto d = nh * dim_per_head + dh; - auto q_bias = qkv_bias_data[d + 0 * D]; - auto k_bias = qkv_bias_data[d + 1 * D]; - auto v_bias = qkv_bias_data[d + 2 * D]; - auto q_data = qkv_data[b * _3D * T + t * _3D + d + 0 * D] + q_bias; - auto k_data = qkv_data[b * _3D * T + t * _3D + d + 1 * D] + k_bias; - auto v_data = qkv_data[b * _3D * T + t * _3D + d + 2 * D] + v_bias; - q_data = q_data / sqrt_dim_per_head; - q_k_v_data[0 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh] = q_data; - q_k_v_data[1 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh] = k_data; - q_k_v_data[2 * B * num_head * T * dim_per_head + - b * num_head * T * dim_per_head + - nh * T * dim_per_head + - t * dim_per_head + dh] = v_data; - } - } -} - -// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias -std::tuple transform_bias_rescale_qkv( - const Tensor& qkv, - const Tensor& qkv_bias, - const int64_t num_head) { - auto B = qkv.size(0); - auto T = qkv.size(1); - auto _3D = qkv.size(2); - auto D = _3D / 3; - TORCH_CHECK(D % num_head == 0); - TORCH_CHECK(_3D % 3 == 0); - const auto dim_per_head = D / num_head; - auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options()); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v.is_contiguous()); - - const auto qkv_contig = qkv.expect_contiguous(); - const auto qkv_bias_contig = qkv_bias.expect_contiguous(); - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - qkv.scalar_type(), - "transform_bias_rescale_qkv", - [&] { - scalar_t* qkv_data = qkv_contig->data_ptr(); - scalar_t* qkv_bias_data = qkv_bias_contig->data_ptr(); - scalar_t* q_k_v_data = q_k_v.data_ptr(); - const scalar_t sqrt_dim_per_head = std::sqrt(static_cast(dim_per_head)); - - int64_t grain_size = - std::max(internal::GRAIN_SIZE / (3 * dim_per_head), (int64_t)1); - parallel_for( - 0, B * num_head * T, grain_size, [&](int64_t begin, int64_t end) { - transform_bias_rescale_qkv_inner_loop(B, T, _3D, D, num_head, dim_per_head, qkv_data, qkv_bias_data, q_k_v_data, sqrt_dim_per_head, begin, end); - }); - }); - auto q_k_v_s = - at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(q_k_v_s.size() == 3); - return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); -} - -Tensor bmm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - auto bt_ = b_.transpose(2, 1); - // TODO: are these a single call to cublas batched matmul? 
- auto c_ = at::matmul(a_, bt_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)}); -} - -void masked_softmax_dropout( - Tensor& attn_scores, - const c10::optional& attn_mask) { - auto B = attn_scores.size(0); - auto num_heads = attn_scores.size(1); - auto T = attn_scores.size(2); - if (attn_mask) { - TORCH_CHECK(attn_mask->is_contiguous()); - } else { - at::_softmax_out(attn_scores, attn_scores, 3, false); - return; - } - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - attn_scores.scalar_type(), - "masked_softmax_dropout", - [&] { - using accscalar_t = acc_type; - // TODO: proper implementation with masking. - scalar_t* attn_scores_data = attn_scores.data_ptr(); - int64_t grain_size = std::min(internal::GRAIN_SIZE / T, (int64_t)1); - parallel_for( - 0, B * num_heads * T, grain_size, [&](int64_t begin, int64_t end) { - for (const auto i : c10::irange(begin, end)) { - using Vec = vec::Vectorized; - auto V = vec::Vectorized::size(); - - scalar_t* input_data = attn_scores_data + i; - auto max_input = Vec(std::numeric_limits::lowest()); - // TODO: handle epilogue - TORCH_CHECK(T % V == 0, "epilogue not implemented yet"); - for (auto t = 0; t < T; t += V) { - auto v = Vec::loadu(&input_data[t]); - max_input = vec::maximum(max_input, v); - } - - auto hmax = std::numeric_limits::lowest(); - for (auto i = 0; i < V; ++i) { - hmax = std::max(max_input[i], hmax); - } - accscalar_t hsum = 0; - TORCH_CHECK(T % V == 0, "epilogue not implemented yet"); - for (auto t = 0; t < T; t += V) { - auto v = Vec::loadu(&input_data[t]); - // TODO: vectorize in accscalar_t? - for (auto i = 0; i < V; ++i) { - hsum += std::exp(static_cast(v[i]) - hmax); - } - } - auto inv_denominator = 1.0 / hsum; - TORCH_CHECK(T % V == 0, "epilogue not implemented yet"); - for (auto t = 0; t < T; t += V) { - Vec v = Vec::loadu(&input_data[t]); - - // TODO: vectorize in accscalar_t? - // TODO this faster solution does not work on Android build - /* - for (auto i = 0; i < V; ++i) { - v[i] = static_cast(std::exp(static_cast(v[i]) - hmax) * inv_denominator); - } - v.store(&input_data[t]); - */ - for (auto i = 0; i < V; ++i) { - input_data[t + i] = static_cast(std::exp(static_cast(v[i]) - hmax) * inv_denominator); - } - } - } - }); - }); -} - -Tensor bmm_nn(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - // TODO: are these a single call to cublas batched matmul? - auto c_ = at::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)}); -} - -Tensor transform_0213(const Tensor& a) { - // TODO: check perf vs dedicated kernel. 
- TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(1)); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(a.size(3)); - return a.permute({0, 2, 1, 3}) - .contiguous() - .view({a.size(0), a.size(2), a.size(1) * a.size(3)}); -} - -Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto r_ = at::native::linear(a_, b, c); - return r_.view({a.size(0), a.size(1), r_.size(1)}); -} - -void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim()); - for (auto idx : c10::irange(shape.size())) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]); - } -} - -} // namespace - -std::tuple transform_bias_rescale_qkv_op_cpu( - const Tensor& qkv, - const Tensor& qkv_bias, - const int64_t num_head) { - auto result = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - return std::make_tuple(std::get<0>(result).clone(), std::get<1>(result).clone(), std::get<2>(result).clone()); -} - -Tensor multi_head_self_attention_cpu( - const Tensor& query, - const Tensor& qkv_weight, - const Tensor& qkv_bias, - const Tensor& proj_weight, - const Tensor& proj_bias, - const int64_t num_head, - const c10::optional& mask) { - // query shape: [B, T, D] - // qkv_weight shape: [3 * D, D] - - const auto D = query.sizes()[2]; - - TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor"); - TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor"); - TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query"); - TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal"); - TORCH_CHECK(qkv_bias.dim() == 1, "expected 2-dimensional qkv_bias, got ", qkv_bias.dim(), "-D tensor"); - TORCH_CHECK(qkv_bias.sizes()[0] == 3 * D, "expected qkv_bias first dim and first dim of query to be equal"); - TORCH_CHECK(D % num_head == 0, "D must divide evenly by num_head"); - -#ifndef NDEBUG - const auto B = query.sizes()[0]; - const auto T = query.sizes()[1]; - const auto dim_per_head = D / num_head; -#endif - - // shape: [B, T, 3 x D] - auto qkv = gemm_nt(query, qkv_weight); -#ifndef NDEBUG - debug_assert_shape(qkv, {B, T, 3 * D}); -#endif - - // shape: 3 x [B, num_head, T, dim_per_head] - auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - const auto& q = std::get<0>(q_k_v); - const auto& k = std::get<1>(q_k_v); - const auto& v = std::get<2>(q_k_v); -#ifndef NDEBUG - debug_assert_shape(q, {B, num_head, T, dim_per_head}); - debug_assert_shape(k, {B, num_head, T, dim_per_head}); - debug_assert_shape(v, {B, num_head, T, dim_per_head}); -#endif - - // shape: [B, num_head, T, T] - auto qkt = bmm_nt(q, k); -#ifndef NDEBUG - debug_assert_shape(qkt, {B, num_head, T, T}); -#endif - - // shape: [B, num_head, T, T] - masked_softmax_dropout(qkt, mask); - - // shape: [B, num_head, T, dim_per_head] - auto attn_ctx = bmm_nn(qkt, v); -#ifndef NDEBUG - debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head}); -#endif - - // shape: [B, T, D] - auto attn = transform_0213(attn_ctx); -#ifndef NDEBUG - debug_assert_shape(attn, {B, T, D}); -#endif - - // shape: [B, T, D] - auto proj = gemm_nt_bias(attn, proj_weight, proj_bias); -#ifndef NDEBUG - debug_assert_shape(proj, {B, T, D}); -#endif - return proj; 
-} - -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/cuda/attention.cu b/aten/src/ATen/native/cuda/attention.cu deleted file mode 100644 index 5b09731c610..00000000000 --- a/aten/src/ATen/native/cuda/attention.cu +++ /dev/null @@ -1,342 +0,0 @@ -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace at { - -namespace native { - -namespace { - -Tensor gemm_nt(const Tensor& a, const Tensor& b) { - return at::native::matmul(a, b.t()); -} - -static constexpr int TRANSFORM_BIAS_RESCALE_VEC = 4; - -template -__global__ void transform_bias_rescale_qkv_kernel( - // [B, T, 3 * D] - const PackedTensorAccessor64 qkv, - // [3 * D] - const PackedTensorAccessor64 qkv_bias, - // [3, B, NH, T, DH] - PackedTensorAccessor64 q_k_v) { - // warp per DH. - // so launch B * NH * T warps. - auto NH = q_k_v.size(2); - auto T = q_k_v.size(3); - auto DH = q_k_v.size(4); - - auto t = blockIdx.x % T; - auto b = blockIdx.x / T; - - auto D = NH * DH; - const scalar_t sqrt_dim_per_head = std::sqrt(static_cast(DH)); - - if (assume_aligned) { - constexpr int VEC = TRANSFORM_BIAS_RESCALE_VEC; - using LoadT = memory::aligned_vector; - for (int32_t d_v = threadIdx.x; d_v < D / VEC; d_v += blockDim.x) { - auto d = d_v * VEC; - auto nh = d / DH; - auto dh = d % DH; - scalar_t qkv_bias_q[VEC]; - scalar_t qkv_bias_k[VEC]; - scalar_t qkv_bias_v[VEC]; - scalar_t qkv_q[VEC]; - scalar_t qkv_k[VEC]; - scalar_t qkv_v[VEC]; - - // Here we require D % VEC == 0 for these vectorized loads. - *reinterpret_cast(&qkv_bias_q) = - *reinterpret_cast(&qkv_bias[d + 0 * D]); - *reinterpret_cast(&qkv_bias_k) = - *reinterpret_cast(&qkv_bias[d + 1 * D]); - *reinterpret_cast(&qkv_bias_v) = - *reinterpret_cast(&qkv_bias[d + 2 * D]); - - *reinterpret_cast(&qkv_q) = - *reinterpret_cast(&qkv[b][t][d + 0 * D]); - *reinterpret_cast(&qkv_k) = - *reinterpret_cast(&qkv[b][t][d + 1 * D]); - *reinterpret_cast(&qkv_v) = - *reinterpret_cast(&qkv[b][t][d + 2 * D]); - -#pragma unroll - // TODO: specialize for float2half2/half2float2? - for (auto ii = 0; ii < VEC; ++ii) { - qkv_q[ii] = static_cast( - (static_cast(qkv_q[ii]) + - static_cast(qkv_bias_q[ii])) / - static_cast(sqrt_dim_per_head)); - qkv_k[ii] = static_cast( - (static_cast(qkv_k[ii]) + - static_cast(qkv_bias_k[ii]))); - qkv_v[ii] = static_cast( - (static_cast(qkv_v[ii]) + - static_cast(qkv_bias_v[ii]))); - } - - // Here we require DH % VEC == 0 for these vectorized stores. - *reinterpret_cast(&q_k_v[0][b][nh][t][dh]) = - *reinterpret_cast(&qkv_q); - *reinterpret_cast(&q_k_v[1][b][nh][t][dh]) = - *reinterpret_cast(&qkv_k); - *reinterpret_cast(&q_k_v[2][b][nh][t][dh]) = - *reinterpret_cast(&qkv_v); - } - } else { - // Same as above, but we can't vectorize memory access. 
- for (int32_t d = threadIdx.x; d < D; d += blockDim.x) { - auto nh = d / DH; - auto dh = d % DH; - scalar_t qkv_bias_q = qkv_bias[d + 0 * D]; - scalar_t qkv_bias_k = qkv_bias[d + 1 * D]; - scalar_t qkv_bias_v = qkv_bias[d + 2 * D]; - scalar_t qkv_q = qkv[b][t][d + 0 * D]; - scalar_t qkv_k = qkv[b][t][d + 1 * D]; - scalar_t qkv_v = qkv[b][t][d + 2 * D]; - qkv_q = static_cast( - (static_cast(qkv_q) + - static_cast(qkv_bias_q)) / - static_cast(sqrt_dim_per_head)); - qkv_k = static_cast( - (static_cast(qkv_k) + - static_cast(qkv_bias_k))); - qkv_v = static_cast( - (static_cast(qkv_v) + - static_cast(qkv_bias_v))); - - q_k_v[0][b][nh][t][dh] = qkv_q; - q_k_v[1][b][nh][t][dh] = qkv_k; - q_k_v[2][b][nh][t][dh] = qkv_v; - } - } -} - -// compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias -std::tuple transform_bias_rescale_qkv( - const Tensor& qkv, - const Tensor& qkv_bias, - const int64_t num_head) { - auto B = qkv.size(0); - auto T = qkv.size(1); - auto _3D = qkv.size(2); - auto D = _3D / 3; - TORCH_CHECK(D % num_head == 0); - const auto dim_per_head = D / num_head; - auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv.options()); -#define CALL_KERNEL(assume_aligned) \ - transform_bias_rescale_qkv_kernel \ - <<>>( \ - qkv.packed_accessor64(), \ - qkv_bias.packed_accessor64(), \ - q_k_v.packed_accessor64()) - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - qkv.scalar_type(), - "transform_bias_rescale_qkv", - [&] { - using accscalar_t = acc_type; - auto threads = std::max(std::min(1024, D / TRANSFORM_BIAS_RESCALE_VEC), 1); - auto blocks = B * T; - if (dim_per_head % TRANSFORM_BIAS_RESCALE_VEC == 0) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - D % TRANSFORM_BIAS_RESCALE_VEC == 0, - "D = num_heads * dim_per_head, so we should have dim_per_head % " - "TRANSFORM_BIAS_RESCALE_VEC == 0 => " - "D % TRANSFORM_BIAS_RESCALE_VEC == 0"); - CALL_KERNEL(true); - } else { - CALL_KERNEL(false); - } - C10_CUDA_KERNEL_LAUNCH_CHECK(); - }); -#undef CALL_KERNEL - auto q_k_v_s = - at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); - return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); -} - -Tensor bmm_nt(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - auto bt_ = b_.transpose(2, 1); - // TODO: are these a single call to cublas batched matmul? - auto c_ = at::matmul(a_, bt_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(2)}); -} - -template -__inline__ __device__ T WarpReduceMax(T val) { -#pragma unroll - for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { - val = std::max(val, WARP_SHFL_DOWN(val, offset)); - } - return val; -} - -template -__inline__ __device__ T WarpReduceSum(T val) { -#pragma unroll - for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { - val += WARP_SHFL_DOWN(val, offset); - } - return val; -} - -void masked_softmax_dropout( - const Tensor& attn_scores, - const c10::optional& attn_mask) { - auto B = attn_scores.size(0); - auto num_heads = attn_scores.size(1); - auto T = attn_scores.size(2); - if (attn_mask) { - TORCH_CHECK(attn_mask->is_contiguous()); - } - AT_DISPATCH_FLOATING_TYPES_AND2( - ScalarType::Half, - ScalarType::BFloat16, - attn_scores.scalar_type(), - "masked_softmax_dropout", - [&] { - using accscalar_t = acc_type; - // TODO: proper implementation with masking. 
- dispatch_softmax_forward( - attn_scores.data_ptr(), - attn_scores.data_ptr(), - T, - T, - B * num_heads * T - ); - }); -} - -Tensor bmm_nn(const Tensor& a, const Tensor& b) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2), a.size(3)}); - auto b_ = b.view({b.size(0) * b.size(1), b.size(2), b.size(3)}); - // TODO: are these a single call to cublas batched matmul? - auto c_ = at::matmul(a_, b_); - return c_.view({a.size(0), a.size(1), a.size(2), b.size(3)}); -} - -Tensor transform_0213(const Tensor& a) { - // TODO: check perf vs dedicated kernel. - return a.permute({0, 2, 1, 3}) - .contiguous() - .view({a.size(0), a.size(2), a.size(1) * a.size(3)}); -} - -Tensor gemm_nt_bias(const Tensor& a, const Tensor& b, const Tensor& c) { - auto a_ = a.view({a.size(0) * a.size(1), a.size(2)}); - auto r_ = at::native::linear(a_, b, c); - return r_.view({a.size(0), a.size(1), r_.size(1)}); -} - -void debug_assert_shape(const Tensor& t, c10::IntArrayRef shape) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY((size_t)t.dim() == shape.size(), "expected ", shape.size(), "-D tensor but got ", t.dim()); - for (auto idx : c10::irange(shape.size())) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t.sizes()[idx] == shape[idx], "expected dim ", idx, " to be ", shape[idx], " but got ", t.sizes()[idx]); - } -} - - -} // namespace -std::tuple transform_bias_rescale_qkv_op_cuda( - const Tensor& qkv, - const Tensor& qkv_bias, - const int64_t num_head) { - auto result = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - return std::make_tuple(std::get<0>(result).clone(), std::get<1>(result).clone(), std::get<2>(result).clone()); -} - -Tensor multi_head_self_attention_cuda( - const Tensor& query, - const Tensor& qkv_weight, - const Tensor& qkv_bias, - const Tensor& proj_weight, - const Tensor& proj_bias, - const int64_t num_head, - const c10::optional& mask) { - // query shape: [B, T, D] - // qkv_weight shape: [3 * D, D] - - const auto D = query.sizes()[2]; - - TORCH_CHECK(query.dim() == 3, "expected 3-dimensional query, got ", query.dim(), "-D tensor"); - TORCH_CHECK(qkv_weight.dim() == 2, "expected 2-dimensional qkv_weight, got ", qkv_weight.dim(), "-D tensor"); - TORCH_CHECK(D * 3 == qkv_weight.sizes()[0], "expected qkv_weight first dim to be 3x last dim of query"); - TORCH_CHECK(D == qkv_weight.sizes()[1], "expected qkv_weight second dim and last dim of query to be equal"); - TORCH_CHECK(D % num_head == 0, "D must divide evenly by num_head"); - -#ifndef NDEBUG - const auto B = query.sizes()[0]; - const auto T = query.sizes()[1]; - const auto dim_per_head = D / num_head; -#endif - - // shape: [B, T, 3 x D] - auto qkv = gemm_nt(query, qkv_weight); -#ifndef NDEBUG - debug_assert_shape(qkv, {B, T, 3 * D}); -#endif - - // shape: 3 x [B, num_head, T, dim_per_head] - auto q_k_v = transform_bias_rescale_qkv(qkv, qkv_bias, num_head); - const auto& q = std::get<0>(q_k_v); - const auto& k = std::get<1>(q_k_v); - const auto& v = std::get<2>(q_k_v); -#ifndef NDEBUG - debug_assert_shape(q, {B, num_head, T, dim_per_head}); - debug_assert_shape(k, {B, num_head, T, dim_per_head}); - debug_assert_shape(v, {B, num_head, T, dim_per_head}); -#endif - - // shape: [B, num_head, T, T] - auto qkt = bmm_nt(q, k); -#ifndef NDEBUG - debug_assert_shape(qkt, {B, num_head, T, T}); -#endif - - // shape: [B, num_head, T, T] - masked_softmax_dropout(qkt, mask); - - // shape: [B, num_head, T, dim_per_head] - auto attn_ctx = bmm_nn(qkt, v); -#ifndef NDEBUG - debug_assert_shape(attn_ctx, {B, num_head, T, dim_per_head}); -#endif - - // shape: [B, T, D] - auto attn = 
transform_0213(attn_ctx); -#ifndef NDEBUG - debug_assert_shape(attn, {B, T, D}); -#endif - - // shape: [B, T, D] - auto proj = gemm_nt_bias(attn, proj_weight, proj_bias); -#ifndef NDEBUG - debug_assert_shape(proj, {B, T, D}); -#endif - - return proj; -} - -} // namespace native -} // namespace at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8f06ed8ae95..6821df7171f 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -2549,16 +2549,6 @@ CUDA: layer_norm_cuda CompositeImplicitAutograd: math_native_layer_norm -- func: _native_multi_head_self_attention(Tensor query, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, int num_head, Tensor? mask=None) -> Tensor - dispatch: - CPU: multi_head_self_attention_cpu - CUDA: multi_head_self_attention_cuda - -- func: _transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_head) -> (Tensor, Tensor, Tensor) - dispatch: - CPU: transform_bias_rescale_qkv_op_cpu - CUDA: transform_bias_rescale_qkv_op_cuda - - func: native_layer_norm_backward(Tensor grad_out, Tensor input, int[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor) dispatch: CPU: layer_norm_backward_cpu diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index 2297dec5c2f..c4997ce463a 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -104,8 +104,6 @@ ALLOW_LIST = [ ("aten::nanquantile", datetime.date(2022, 9, 30)), ("aten::_convolution_double_backward", datetime.date(2022, 3, 31)), ("aten::_scatter_reduce", datetime.date(2022, 1, 31)), - ("aten::native_multi_head_self_attention", datetime.date(9999, 1, 1)), - ("aten::_native_multi_head_self_attention", datetime.date(9999, 1, 1)), ("aten::scatter_reduce.two", datetime.date(2022, 3, 15)), ] diff --git a/test/test_nn.py b/test/test_nn.py index abea2592aee..8c74d3386e2 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -17540,44 +17540,6 @@ class TestNNDeviceType(NNTestCase): self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) self._test_EmbeddingBag(device, 'mean', True, wdtype=torch.bfloat16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) - @dtypesIfCUDA(torch.float) - @dtypes(torch.float) - def test_transform_bias_rescale_qkv(self, device, dtype): - # TODO: debug CPU test failure with settings (48, 4, 16, 8) and add that mode - tests = [ - (64, 4, 16, 8), - # dim_per_head = 12 does not divide evenly by CPU vectorization length of 8 - (24, 2, 4, 2), - # Make sure CUDA can handle small input sizes - (2, 2, 2, 2), - # dim_per_head = 6 does not divide evenly by CUDA vectorization length of 4, causes alignment issues - (24, 4, 4, 2) - ] - for (embed_dim, num_heads, sl, bs) in tests: - x = torch.randn(sl, bs, embed_dim, device=device, dtype=dtype) * 10 - qkv = torch.nn.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype) - - with torch.no_grad(): - (q, k, v) = torch._transform_bias_rescale_qkv(x @ qkv.weight.t(), qkv.bias, num_head=num_heads) - - def simple_transform_bias_rescale_qkv(qkv, bias): - (q, k, v) = torch.split(qkv, embed_dim, dim=-1) - (q_bias, k_bias, v_bias) = torch.split(bias, embed_dim, dim=-1) - 
return tuple( - x.reshape((sl, bs, num_heads, embed_dim // num_heads)).transpose(2, 1) - for x in ( - (q + q_bias) / math.sqrt(embed_dim // num_heads), - (k + k_bias), - (v + v_bias) - ) - ) - correct_q, correct_k, correct_v = simple_transform_bias_rescale_qkv(x @ qkv.weight.t(), qkv.bias) - - self.assertEqual(q.size(), correct_q.size()) - self.assertTrue(torch.allclose(q, correct_q)) - self.assertTrue(torch.allclose(k, correct_k)) - self.assertTrue(torch.allclose(v, correct_v)) - @onlyCUDA @dtypes(torch.half, torch.float, torch.double) def test_multihead_attention_dtype(self, device, dtype): diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 67f2def297c..bff8492268b 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -1176,7 +1176,6 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/quantized/library.cpp", "aten/src/ATen/quantized/QTensorImpl.cpp", "aten/src/ATen/quantized/Quantizer.cpp", - "aten/src/ATen/native/attention.cpp", "aten/src/ATen/native/Activation.cpp", "aten/src/ATen/native/AdaptiveAveragePooling.cpp", "aten/src/ATen/native/AdaptiveAveragePooling3d.cpp", diff --git a/torch/overrides.py b/torch/overrides.py index c8ef49e7b9d..3b0b437fa16 100644 --- a/torch/overrides.py +++ b/torch/overrides.py @@ -669,7 +669,6 @@ def get_testing_overrides() -> Dict[Callable, Callable]: torch.native_batch_norm: lambda input, weight, bias, running_mean, running_var, training, momentum, eps: -1, torch.native_dropout: lambda input, p, train: -1, torch.native_layer_norm: lambda input, normalized_shape, weight=None, bias=None, eps=1e-05: -1, - torch._native_multi_head_self_attention: lambda query, qkv_weight, qkv_bias, proj_weight, proj_bias, mask=None: -1, torch.native_group_norm: lambda input, weight, bias, N, C, HxW, group, eps: -1, torch.native_norm: lambda input, p=2: -1, torch.native_norm: lambda input, p=2: -1, From 374de3365557d724d50ef43048b58ca30f1c73ff Mon Sep 17 00:00:00 2001 From: Steven Troxler Date: Fri, 18 Feb 2022 14:16:43 -0800 Subject: [PATCH 175/199] [codemod][type-comments] Convert type comments in workspace_test.py (#73086) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73086 I'm wrapping up the conversion of type comments to type annotations in caffe2. The last remaining "bulk" codemod has test failures that are hard for me to understand, so I'm going to submit PRs for each module individually which makes it easier to see what's causing problems. All the codemods were produced via LibCST and then manually cleaned up. 
Test Plan: Wait for github CI Reviewed By: shannonzhu Differential Revision: D34344202 fbshipit-source-id: 8342267cd27a90ad91a65db858bfbd3675281c9a (cherry picked from commit 3d0658d8cfefde13b4707b8fa54c11a459e8d605) --- caffe2/python/workspace_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 1bf7b607e1b..2e2d284f92e 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -1,13 +1,16 @@ +import errno import os import shutil import tempfile import unittest from collections import namedtuple +from typing import List import caffe2.python.hypothesis_test_util as htu import hypothesis.strategies as st import numpy as np import torch +from torch import Tensor from caffe2.proto import caffe2_pb2 from caffe2.python import core, test_util, workspace, model_helper, brew from hypothesis import given, settings @@ -783,8 +786,7 @@ class MyModule(torch.jit.ScriptModule): return x + y + z @torch.jit.script_method - def multi_input_tensor_list(self, tensor_list): # pyre-ignore: PT type annotations - # type: (List[Tensor]) -> Tensor + def multi_input_tensor_list(self, tensor_list: List[Tensor]) -> Tensor: return tensor_list[0] + tensor_list[1] + tensor_list[2] @torch.jit.script_method From 0d667489480fe3a1ee4bf555c55cc7f0613940f1 Mon Sep 17 00:00:00 2001 From: Raghavan Raman Date: Fri, 18 Feb 2022 15:15:50 -0800 Subject: [PATCH 176/199] [jit] Add tests for JIT with dynamic shape fusion (#72201) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72201 Reviewed By: mikaylagawarecki Differential Revision: D34067211 Pulled By: navahgar fbshipit-source-id: 2c13bb43c76c7fed720ad37892d2177c3dc0b924 (cherry picked from commit eed2d8cea4a4037bf7784e6ca09e69f63faad6d6) --- test/cpp/tensorexpr/test_te_fuser_pass.cpp | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/test/cpp/tensorexpr/test_te_fuser_pass.cpp b/test/cpp/tensorexpr/test_te_fuser_pass.cpp index d3e91784fb5..56535de914e 100644 --- a/test/cpp/tensorexpr/test_te_fuser_pass.cpp +++ b/test/cpp/tensorexpr/test_te_fuser_pass.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -350,5 +351,52 @@ TEST(TEFuserPass, FuserPass_WhereList) { testing::FileCheck().check_not("prim::TensorExprGroup")->run(*g); } +TEST(TEFuserPass, DynamicShapeFusion) { + WithCPUFuser cf; + const auto graph_string = R"IR( + graph(%0 : Float(10, 5, strides=[5, 1], device=cpu), + %1 : Float(10, 5, strides=[5, 1], device=cpu)): + %2 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%0, %1) + %3 : Float(10, 5, strides=[5, 1], device=cpu) = aten::mul(%2, %1) + return (%3))IR"; + auto g = std::make_shared(); + torch::jit::parseIR(graph_string, g.get()); + + g->lint(); + FuseTensorExprs( + g, + /* min_group_size = */ 2, + /* add_composed_op = */ true, + /* fuse_to_dynamic_shapes = */ true); + Code code(g, ""); + + testing::FileCheck() + .check("prim::TensorExprDynamicGroup_") + ->check("prim::TensorExprDynamicGuard") + ->check("prim::TensorExprGroup_") + ->run(*g); + + auto run_and_compare = [&](const std::vector& inputs) { + TORCH_INTERNAL_ASSERT(inputs.size() == 2); + + auto ref = at::mul(at::mul(inputs[0], inputs[1]), inputs[1]); + + InterpreterState interp(code); + Stack stack(inputs.begin(), inputs.end()); + interp.run(stack); + at::Tensor out = pop(stack).toTensor(); + ASSERT_TRUE(at::allclose(out, ref)); + }; + + std::vector inputs = {at::rand({10, 5}), at::rand({10, 5})}; + 
run_and_compare(inputs); + + std::vector inputs2 = {at::rand({20, 5}), at::rand({20, 5})}; + run_and_compare(inputs2); + + std::vector inputs3 = {at::rand({25, 60}), at::rand({25, 60})}; + run_and_compare(inputs3); +} + } // namespace jit } // namespace torch From 671c8a459abf07fa8beb87b757010cf2de5233ac Mon Sep 17 00:00:00 2001 From: shubhambhokare1 Date: Sat, 19 Feb 2022 00:15:16 +0000 Subject: [PATCH 177/199] [ONNX] Add pixel_unshuffle support in opset 9 Current we are unable to utilize ONNX's SpaceToDepth operator due to the lack of the mode_s attribute, hence we add an alternative symbolic in opset 9 to support pixel_unshuffle - Adds support for pixel_unshuffle in opset9 - Adds support for dynamic input shapes for pixel_shuffle and pixel_unshuffle Pull Request resolved: https://github.com/pytorch/pytorch/pull/72449 --- test/onnx/test_pytorch_onnx_onnxruntime.py | 17 +++++ torch/onnx/symbolic_opset9.py | 77 ++++++++++++++++++---- 2 files changed, 80 insertions(+), 14 deletions(-) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index cfca86679b1..64b707c95d5 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -5903,7 +5903,24 @@ class TestONNXRuntime(unittest.TestCase): return torch.pixel_shuffle(x, upscale_factor=2) x = torch.randn(2, 16, 4, 3, requires_grad=True) + y = torch.randn(4, 32, 8, 4, requires_grad=True) self.run_test(PixelShuffle(), x) + self.run_test(PixelShuffle(), x, input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + test_with_inputs=[y]) + + @skipIfUnsupportedMinOpsetVersion(9) + def test_pixel_unshuffle(self): + class PixelUnshuffle(torch.nn.Module): + def forward(self, x): + return torch.pixel_unshuffle(x, downscale_factor=2) + + x = torch.randn(2, 16, 4, 6, requires_grad=True) + y = torch.randn(4, 32, 8, 4, requires_grad=True) + self.run_test(PixelUnshuffle(), x) + self.run_test(PixelUnshuffle(), x, input_names=["x"], + dynamic_axes={"x": [0, 1, 2, 3]}, + test_with_inputs=[y]) @skipIfUnsupportedMinOpsetVersion(9) def test_reciprocal(self): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 34c41ccf109..8c710269699 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -2157,20 +2157,69 @@ def pixel_shuffle(g, self, upscale_factor): dims = sym_help._get_tensor_sizes(self) if len(dims) != 4: return _unimplemented("pixel_shuffle", "only support 4d input") - if any([i is None for i in dims[1:]]): - return _unimplemented("pixel_shuffle", "only support static input shape, except for batch size") - output_channel = dims[1] // upscale_factor // upscale_factor - after_view = sym_help._reshape_helper(g, self, - g.op("Constant", value_t=torch.tensor([-1, output_channel, - upscale_factor, upscale_factor, - dims[2], dims[3]])), - allowzero=0) - after_transpose = g.op("Transpose", after_view, perm_i=[0, 1, 4, 2, 5, 3]) - return sym_help._reshape_helper(g, after_transpose, - g.op("Constant", value_t=torch.tensor([-1, output_channel, - dims[2] * upscale_factor, - dims[3] * upscale_factor])), - allowzero=0) + if any(i is None for i in dims[1:]): + after_view = sym_help._reshape_helper(g, sym_help._unsqueeze_helper(g, self, [2, 3]), + g.op("Constant", value_t=torch.tensor([0, -1, + upscale_factor, upscale_factor, + 0, 0])), + allowzero=0) + after_transpose = g.op("Transpose", after_view, perm_i=[0, 1, 4, 2, 5, 3]) + # For dynamic input shapes, two reshapes are performed + reshape_h = sym_help._reshape_helper(g, 
after_transpose, + g.op("Constant", value_t=torch.tensor([0, 0, -1, 1, 0, 0])), + allowzero=0) + reshape_w = sym_help._reshape_helper(g, reshape_h, + g.op("Constant", value_t=torch.tensor([0, 0, 0, 0, -1, 1])), + allowzero=0) + return sym_help._squeeze_helper(g, reshape_w, [3, 5]) + else: + output_channel = dims[1] // upscale_factor // upscale_factor + after_view = sym_help._reshape_helper(g, self, + g.op("Constant", value_t=torch.tensor([-1, output_channel, + upscale_factor, upscale_factor, + dims[2], dims[3]])), + allowzero=0) + after_transpose = g.op("Transpose", after_view, perm_i=[0, 1, 4, 2, 5, 3]) + return sym_help._reshape_helper(g, after_transpose, + g.op("Constant", value_t=torch.tensor([-1, output_channel, + dims[2] * upscale_factor, + dims[3] * upscale_factor])), + allowzero=0) + + +@parse_args("v", "i") +def pixel_unshuffle(g, self, downscale_factor): + dims = sym_help._get_tensor_sizes(self) + if len(dims) != 4: + return _unimplemented("pixel_shuffle", "only support 4d input") + if any(i is None for i in dims[1:]): + # For dynamic input shapes, two reshapes are performed + reshape_h = sym_help._reshape_helper(g, sym_help._unsqueeze_helper(g, self, [3]), + g.op("Constant", value_t=torch.tensor([0, 0, -1, downscale_factor, 0])), + allowzero=0) + reshape_w = sym_help._reshape_helper(g, reshape_h, + g.op("Constant", value_t=torch.tensor([0, 0, 0, 0, -1, downscale_factor])), + allowzero=0) + after_transpose = g.op("Transpose", reshape_w, perm_i=[0, 1, 3, 5, 2, 4]) + final_reshape = sym_help._reshape_helper(g, after_transpose, + g.op("Constant", value_t=torch.tensor([0, -1, 1, 1, 0, 0])), + allowzero=0) + return sym_help._squeeze_helper(g, final_reshape, [2, 3]) + else: + output_channel = dims[1] * downscale_factor * downscale_factor + after_view = sym_help._reshape_helper(g, self, + g.op("Constant", value_t=torch.tensor([-1, dims[1], + dims[2] // downscale_factor, + downscale_factor, + dims[3] // downscale_factor, + downscale_factor])), + allowzero=0) + after_transpose = g.op("Transpose", after_view, perm_i=[0, 1, 3, 5, 2, 4]) + return sym_help._reshape_helper(g, after_transpose, + g.op("Constant", value_t=torch.tensor([-1, output_channel, + dims[2] // downscale_factor, + dims[3] // downscale_factor])), + allowzero=0) def _generic_rnn(g, variant, input, initial_states, all_weights, has_biases, From 906d26fb9b557f529b90419bea72bb8d9cd29519 Mon Sep 17 00:00:00 2001 From: Steven Troxler Date: Fri, 18 Feb 2022 15:27:58 -0800 Subject: [PATCH 178/199] [codemod][type-comments] Convert type comments in api.py (#73084) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73084 I'm wrapping up the conversion of type comments to type annotations in caffe2. The last remaining "bulk" codemod has test failures that are hard for me to understand, so I'm going to submit PRs for each module individually which makes it easier to see what's causing problems. All the codemods were produced via LibCST and then manually cleaned up. 
Test Plan: Wait for github CI Reviewed By: H-Huang Differential Revision: D34344289 fbshipit-source-id: e8e3a13c3d95f6804829f1818fb7f0605e5ba137 (cherry picked from commit 92d47d9cd549e85d002dd22e5f377356a3a98348) --- torch/distributed/elastic/metrics/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/distributed/elastic/metrics/api.py b/torch/distributed/elastic/metrics/api.py index 75e23384591..35d5c78c6ef 100644 --- a/torch/distributed/elastic/metrics/api.py +++ b/torch/distributed/elastic/metrics/api.py @@ -60,7 +60,7 @@ class MetricStream: _metrics_map = {} -_default_metrics_handler = NullMetricHandler() # type: MetricHandler +_default_metrics_handler: MetricHandler = NullMetricHandler() # pyre-fixme[9]: group has type `str`; used as `None`. From 534d5cf91daf12a6c4962b1682763686c4c33f0d Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 18 Feb 2022 15:56:45 -0800 Subject: [PATCH 179/199] Fix backward compat (#73127) Summary: By adding "aten::_native_multi_head_self_attention" and "aten::_transform_bias_rescale_qkv" to ALLOW_LIST Pull Request resolved: https://github.com/pytorch/pytorch/pull/73127 Reviewed By: suo Differential Revision: D34355869 Pulled By: malfet fbshipit-source-id: efc9be8276b51aaafcfcdd0cc38d27372b79d63f (cherry picked from commit f52cac59da1bb40cca851043db65ae836c2ea4c8) --- .../check_forward_backward_compatibility.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index c4997ce463a..329e65b3281 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -105,6 +105,9 @@ ALLOW_LIST = [ ("aten::_convolution_double_backward", datetime.date(2022, 3, 31)), ("aten::_scatter_reduce", datetime.date(2022, 1, 31)), ("aten::scatter_reduce.two", datetime.date(2022, 3, 15)), + ("aten::native_multi_head_self_attention", datetime.date(9999, 1, 1)), + ("aten::_native_multi_head_self_attention", datetime.date(9999, 1, 1)), + ("aten::_transform_bias_rescale_qkv", datetime.date(9999, 1, 1)), ] ALLOW_LIST_COMPILED = [ From b1bd2268f86547dfe1620da64c6694f07ee5d5a2 Mon Sep 17 00:00:00 2001 From: Sangbaek Park Date: Fri, 18 Feb 2022 16:13:21 -0800 Subject: [PATCH 180/199] [Vulkan] Add performance test for GRU operator (#73126) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73126 Added a performance test for the Vulkan GRU operator: * Added a performance test `gru_op_perf` into `vulkan_perf_test.cpp` * `--benchmark_filter` flag can be used to run only GRU perf tests: ``` adb shell "/data/local/tmp/vulkan_perf_test" --benchmark_filter=gru* ``` Test Plan: Test command line: ``` cd ~/fbsource buck build -c ndk.custom_libcxx=false -c pt.enable_qpl=0 //xplat/caffe2:pt_vulkan_perf_test_binAndroid\#android-arm64 --show-output adb push buck-out/gen/xplat/caffe2/pt_vulkan_perf_test_binAndroid\#android-arm64 /data/local/tmp/vulkan_perf_test adb shell "/data/local/tmp/vulkan_perf_test" --benchmark_filter=gru* ``` Test result: ``` Running /data/local/tmp/vulkan_perf_test Run on (8 X 1804.8 MHz CPU s) ***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead. 
------------------------------------------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------------------------------------------ gru_op_perf/N:384/C:384/H:2/iterations:1000/threads:1 16.7 ms 14.7 ms 1000 ``` Reviewed By: SS-JIA Differential Revision: D34355119 fbshipit-source-id: 049dc4b47938a04e395923e761e59304e8fa1f7d (cherry picked from commit 39c8b7e4d7c408d867e0a08443199b0de1c5faf5) --- aten/src/ATen/test/vulkan_perf_test.cpp | 53 +++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/aten/src/ATen/test/vulkan_perf_test.cpp b/aten/src/ATen/test/vulkan_perf_test.cpp index fa6b303eead..f5f20442ac5 100644 --- a/aten/src/ATen/test/vulkan_perf_test.cpp +++ b/aten/src/ATen/test/vulkan_perf_test.cpp @@ -35,6 +35,58 @@ static void cat_op_channel_perf(benchmark::State& state) { } } +static void gru_op_perf(benchmark::State& state) { + // Guard + if (!at::is_vulkan_available()) { + return; + } + + // Arrange + const int H_in = static_cast(state.range(0)); // input_size + const int H_out = static_cast(state.range(1)); // hidden_size + const int num_layers = static_cast(state.range(2)); + const double gru_dropout = .0; + const bool has_biases = true; + const bool train = false; + const bool bidirectional = false; + const bool batch_first = true; + const auto in_cpu = at::rand({1, 1, H_in}, at::device(at::kCPU).dtype(at::kFloat)); + const auto h0_cpu = at::rand({num_layers, 1, H_out}, at::device(at::kCPU).dtype(at::kFloat)); + + c10::List weight_ih_l; // shape (3 * hidden_size, input_size) + c10::List weight_hh_l; // shape (3 * hidden_size, hidden_size) + c10::List bias_ih_l; // shape (3 * hidden_size) + c10::List bias_hh_l; // shape (3 * hidden_size) + for (int i = 0; i < num_layers; ++i) { + weight_ih_l.emplace_back(at::rand({3 * H_out, H_in}, at::device(at::kCPU).dtype(at::kFloat))); + weight_hh_l.emplace_back(at::rand({3 * H_out, H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_ih_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + bias_hh_l.emplace_back(at::rand({3 * H_out}, at::device(at::kCPU).dtype(at::kFloat))); + } + + // put this guard here to run inference inststead of training + // to avoid the following error: + // C++ exception with description "0INTERNAL ASSERT FAILED at "xplat/caffe2/aten/src/ATen/core/boxing/KernelFunction.cpp":31, please report a bug to PyTorch. aten::gru.input has kernels registered to both CompositeImplicitAutograd and a backend mapped to AutogradOther. This makes the backend kernel unreachable; the dispatcher will always prefer the CompositeImplicitAutograd lowering (see Note [Ambiguity in AutogradOther kernel]). If you want to override CompositeImplicitAutograd, please open an issue to request a dedicated Autograd dispatch key for the backend. + // If you only want to run inference instead of training, add `c10::InferenceMode mode;` before model.forward(). Note this guard is only available in C++ but not Python at present. + c10::InferenceMode mode; + + // Act + while (state.KeepRunning()) { + // weights/biases should be always on CPU. 
+ const auto out_vulkan = at::gru(in_cpu.vulkan(), h0_cpu.vulkan(), { weight_ih_l.get(0), weight_hh_l.get(0), bias_ih_l.get(0), bias_hh_l.get(0), + weight_ih_l.get(1), weight_hh_l.get(1), bias_ih_l.get(1), bias_hh_l.get(1) }, + has_biases, num_layers, gru_dropout, train, bidirectional, batch_first); + + auto vulkan_output = std::get<0>(out_vulkan); + auto vulkan_hidden = std::get<1>(out_vulkan); + + // to avoid out-of-memory issues, release resources by waiting and flushing all GPU operations + at::native::vulkan::api::context()->wait(vulkan_output); + at::native::vulkan::api::context()->wait(vulkan_hidden); + at::native::vulkan::api::context()->flush(); + } +} + static void CommonBenchmarkSettings(benchmark::internal::Benchmark* b) { b->Unit(benchmark::kMillisecond); b->ArgNames({"N", "C", "H", "W"}); @@ -48,6 +100,7 @@ BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iter BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 4, 221, 193}); // small multiple of 4 channels BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(5000)->Args({3, 3, 221, 193}); // small non-multiple of 4 channels BENCHMARK(cat_op_channel_perf)->Apply(CommonBenchmarkSettings)->Threads(3)->Iterations(1000)->Args({3, 40, 221, 193}); // big multiple of 4 channels (multi-thread) +BENCHMARK(gru_op_perf)->Apply(CommonBenchmarkSettings)->Threads(1)->Iterations(1000)->Args({384, 384, 2}); // McLaren Model inputs BENCHMARK_MAIN(); #endif /* USE_VULKAN_API */ From f41db99a5690bf0ae4d51d9cdb9aaff95703f6c6 Mon Sep 17 00:00:00 2001 From: Rui Zhu Date: Fri, 18 Feb 2022 16:15:00 -0800 Subject: [PATCH 181/199] Add simple correctness check for native MHA (#72941) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72941 Simple test for MHA, use cos similarity as metric since scaling generate mismatch. 
Cuda is validated, CPU fix a following (We can land this with onlyCuda flag, and remove it once CPU is also done) Test Plan: For cuda: buck build mode/opt -c fbcode.enable_gpu_sections=true caffe2/test:nn && buck-out/gen/caffe2/test/nn\#binary.par -r test_native_multihead_attention_cuda_float32 2>&1 | pastry Reviewed By: swolchok Differential Revision: D33906921 fbshipit-source-id: ad447401eb7002f22ed533d620a6b544524b3f58 (cherry picked from commit 45b778da27598c1d4763aa22843b48a88fa90373) --- test/test_nn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_nn.py b/test/test_nn.py index 8c74d3386e2..4b6fececf19 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -17533,7 +17533,6 @@ class TestNNDeviceType(NNTestCase): ) self.assertEqual(output_non_contig, output_contig) - @onlyCUDA @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long))) def test_embedding_bag_bfloat16(self, device, dtypes): From c6f56599bb4e8a9d9649e87ef97225f80522ad38 Mon Sep 17 00:00:00 2001 From: Jiayi Sun Date: Fri, 18 Feb 2022 16:34:12 -0800 Subject: [PATCH 182/199] =?UTF-8?q?add=20BFloat16=20sparse=20operators=20o?= =?UTF-8?q?n=20CPU:=20copy,=20coalesce,=20sparse=5Fmask,=20ad=E2=80=A6=20(?= =?UTF-8?q?#72846)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: …d_out, addmm Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/72846 Reviewed By: mikaylagawarecki Differential Revision: D34343016 Pulled By: cpuhrsch fbshipit-source-id: f1274125234a3bacbb7a38fc642fbf5c9786d435 (cherry picked from commit c819456abf1d27ee09ae7f243222dd7e89cc82b4) --- aten/src/ATen/native/sparse/SparseTensor.cpp | 2 +- .../ATen/native/sparse/SparseTensorMath.cpp | 4 ++-- test/test_sparse.py | 19 ++++++++++++++----- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 6de64bfbf2c..7d428966741 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -770,7 +770,7 @@ SparseTensor& sparse_mask_out_cpu( // TODO: Re-audit this; it used to be an indexSelect directly into r_values at::index_select_out(r_values, t_view, 0, indices); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX(r_values.scalar_type(), "sparse_mask", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, r_values.scalar_type(), "sparse_mask", [&] { sparse_mask_out_cpu_kernel( r_values, t, r_nnz, sparse_dim, mask_indices); }); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index c23486336a1..7db314d3a82 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -474,7 +474,7 @@ SparseTensor& add_out_sparse_contiguous(SparseTensor& r, const SparseTensor& t, auto r_indices_accessor = r_indices.accessor(); auto src_indices_accessor = src_indices.accessor(); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX( + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, commonDtype, "cadd_sparse", [&] { scalar_t* t_values_ptr = t_values.data_ptr(); scalar_t* s_values_ptr = s_values.data_ptr(); @@ -899,7 +899,7 @@ Tensor& s_addmm_out_sparse_dense_cpu( Tensor indices = sparse_._indices(); Tensor values = sparse_._values(); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX( + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, values.scalar_type(), "addmm_sparse_dense", [&] { s_addmm_out_sparse_dense_worker(nnz, dim_i, dim_j, dim_k, r, 
beta, t, alpha, indices, values, dense); } diff --git a/test/test_sparse.py b/test/test_sparse.py index 0ad1a91b56b..b0cbd03f15a 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -21,7 +21,7 @@ from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes from torch.testing._internal.common_cuda import \ (SM53OrLater, SM80OrLater, CUDA11OrLater) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, ops, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride, + (instantiate_device_type_tests, ops, dtypes, dtypesIfCPU, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride, deviceCountAtLeast, OpDTypes) from torch.testing._internal.common_methods_invocations import \ (sparse_unary_ufuncs) @@ -189,6 +189,7 @@ class TestSparse(TestCase): @coalescedonoff @dtypes(torch.double, torch.cdouble) + @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) def test_coalesce(self, device, dtype, coalesced): def _test_coalesce(t): @@ -663,6 +664,7 @@ class TestSparse(TestCase): @coalescedonoff @dtypes(torch.double, torch.cdouble) + @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) def test_Sparse_to_Sparse_copy_(self, device, dtype, coalesced): # This is for testing torch.copy_(SparseTensor, SparseTensor) sparse_dims = 3 @@ -1240,6 +1242,8 @@ class TestSparse(TestCase): @coalescedonoff @dtypes(torch.double, torch.cdouble) + @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) + @precisionOverride({torch.bfloat16: 1e-1}) def test_sparse_addmm(self, device, dtype, coalesced): def test_shape(m, n, p, nnz, broadcast, alpha_beta=None): if alpha_beta is None: @@ -1261,7 +1265,8 @@ class TestSparse(TestCase): def fn(S, D1, D2, beta=beta, alpha=alpha): return torch.sparse.addmm(D1, S, D2, beta=beta, alpha=alpha) - gradcheck(fn, (S, D1, D2), check_sparse_nnz=True) + if dtype == torch.double or dtype == torch.cdouble: + gradcheck(fn, (S, D1, D2), check_sparse_nnz=True) test_shape(7, 8, 9, 20, False, None) test_shape(7, 8, 9, 20, True, None) @@ -1401,15 +1406,17 @@ class TestSparse(TestCase): _test_spadd() _test_spadd_hybrid() - @onlyCUDA @coalescedonoff @dtypes(torch.double, torch.cdouble) def test_sparse_add_out_bfloat16(self, device, dtype, coalesced): # fp32 x, _, _ = self._gen_sparse(3, 5, 10, dtype, device, coalesced) y, _, _ = self._gen_sparse(3, 5, 10, dtype, device, coalesced) - x = x.float().cuda() - y = y.float().cuda() + x = x.float() + y = y.float() + if device == 'cuda': + x = x.cuda() + y = y.cuda() res_fp32 = torch.add(x, y) # bfloat16 @@ -1628,6 +1635,7 @@ class TestSparse(TestCase): _test_basic_ops_hybrid() @dtypes(torch.double, torch.cdouble) + @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) def test_add_dense_sparse_mismatch(self, device, dtype): def test_shape(dense_size, sparse_dims_shape, dense_dims_shape, sparse_size): x = torch.zeros(dense_size, dtype=dtype, device=device) @@ -1666,6 +1674,7 @@ class TestSparse(TestCase): @coalescedonoff @dtypes(torch.double, torch.cdouble) + @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) def test_sparse_mask(self, device, dtype, coalesced): def _test_sparse_mask_fixed(): i = self.index_tensor([ From 564f99226a0e8913c83fc66873838dfffd964311 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Fri, 18 Feb 2022 16:53:45 -0800 Subject: [PATCH 183/199] [vulkan] Clamp tanh activation op input to preserve numerical stability (#73107) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73107 It was observed that for large inputs (magnitude >= 30), the 
`tanh` shader would produce numerically unstable outputs when running with MoltenVK. The solution is to clamp the input with the range [-15, 15] before applying the `tanh` function. Test Plan: Imported from OSS Reviewed By: mikaylagawarecki Differential Revision: D34354838 Pulled By: SS-JIA fbshipit-source-id: ca2c5373987d9c086c3797a87ab08ffc189e2529 (cherry picked from commit 423bcbff64b2c3864f9d50314793ea70f5639af0) --- aten/src/ATen/native/vulkan/glsl/tanh.glsl | 6 +++++- aten/src/ATen/native/vulkan/glsl/tanh_.glsl | 6 +++++- aten/src/ATen/test/vulkan_api_test.cpp | 4 ++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/native/vulkan/glsl/tanh.glsl b/aten/src/ATen/native/vulkan/glsl/tanh.glsl index 8d611630cf7..70315def634 100644 --- a/aten/src/ATen/native/vulkan/glsl/tanh.glsl +++ b/aten/src/ATen/native/vulkan/glsl/tanh.glsl @@ -18,6 +18,10 @@ void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); if (all(lessThan(pos, uBlock.size.xyz))) { - imageStore(uOutput, pos, tanh(texelFetch(uInput, pos, 0))); + const vec4 intex = texelFetch(uInput, pos, 0); + imageStore( + uOutput, + pos, + tanh(clamp(intex, -15.0, 15.0))); } } diff --git a/aten/src/ATen/native/vulkan/glsl/tanh_.glsl b/aten/src/ATen/native/vulkan/glsl/tanh_.glsl index 59649da6518..ef8fd35fc58 100644 --- a/aten/src/ATen/native/vulkan/glsl/tanh_.glsl +++ b/aten/src/ATen/native/vulkan/glsl/tanh_.glsl @@ -17,6 +17,10 @@ void main() { const ivec3 pos = ivec3(gl_GlobalInvocationID); if (all(lessThan(pos, uBlock.size.xyz))) { - imageStore(uOutput, pos, tanh(imageLoad(uOutput, pos))); + const vec4 intex = imageLoad(uOutput, pos); + imageStore( + uOutput, + pos, + tanh(clamp(intex, -15.0, 15.0))); } } diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index acf15e94d4a..7001677d8dd 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp +++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -1551,7 +1551,7 @@ TEST(VulkanAPITest, tanh) { return; } - const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + const auto in_cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30; const auto in_vulkan = in_cpu.vulkan(); const auto out_cpu = at::tanh(in_cpu); @@ -1570,7 +1570,7 @@ TEST(VulkanAPITest, tanh_) { return; } - auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)); + auto cpu = at::rand({17, 197, 302, 5}, at::device(at::kCPU).dtype(at::kFloat)) * 30; auto vulkan = cpu.vulkan(); at::tanh_(cpu); From bdc8b3f3e828ca7202879baa379fda6df5b078d2 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Fri, 18 Feb 2022 16:53:45 -0800 Subject: [PATCH 184/199] [vulkan] Re-route arithmetic ops to scalar versions when second arg is zero-dim (#73108) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73108 When arithmetic ops are invoked from torchscript the scalar argument will sometimes be wrapped in a zero-dimensional tensor, which will cause the Vulkan implementation to complain as all input tensors are expected to have the same number of channels. The solution is to have the Tensor implementations of the op check if the second argument is zero-dimensional and re-route it to the Scalar implementation if that's the case. 
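For context, a minimal sketch of the call pattern that exercises this path (the module name and input shape are illustrative; a Vulkan-enabled build and the `vulkan()` conversion are assumed):

```python
import torch

class AddHalf(torch.nn.Module):
    def forward(self, x):
        # When scripted, the Python scalar 0.5 may be materialized as a
        # zero-dimensional tensor before aten::add is dispatched.
        return x + 0.5

scripted = torch.jit.script(AddHalf())
x = torch.rand(1, 4, 8, 8)
scripted(x)  # CPU path is unaffected either way.
# On a Vulkan-enabled build, scripted(x.vulkan()) previously tripped the
# same-number-of-channels check for the zero-dim operand; with this change
# the call is re-routed to the add_scalar kernel instead.
```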
Test Plan: Imported from OSS Reviewed By: mikaylagawarecki Differential Revision: D34354840 Pulled By: SS-JIA fbshipit-source-id: b24799bb3dd4336791a39bea9382c14243ad58e4 (cherry picked from commit c6dd8eb13b9be3800405c64a3a81e5c68da64355) --- .../src/ATen/native/vulkan/ops/Arithmetic.cpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp b/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp index 42e941f00a7..6eec67c4036 100644 --- a/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp +++ b/aten/src/ATen/native/vulkan/ops/Arithmetic.cpp @@ -322,6 +322,13 @@ Tensor add_tensor( const Tensor& self_arg, const Tensor& other_arg, const Scalar& alpha) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + other_arg.item(), + c10::optional(alpha.to()), + VK_KERNEL(add_scalar)); + } return arithmetic_tensor( self_arg, other_arg, c10::optional(alpha), VK_KERNEL(add)); } @@ -354,6 +361,13 @@ Tensor sub_tensor( const Tensor& self_arg, const Tensor& other_arg, const Scalar& alpha) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + other_arg.item(), + c10::optional(-1 * alpha.to()), + VK_KERNEL(add_scalar)); + } return arithmetic_tensor( self_arg, other_arg, c10::optional(alpha), VK_KERNEL(sub)); } @@ -374,6 +388,13 @@ Tensor& mul_scalar_(Tensor& self, const Scalar& other) { } Tensor mul_tensor(const Tensor& self_arg, const Tensor& other_arg) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + other_arg.item(), + c10::optional(), + VK_KERNEL(mul_scalar)); + } return arithmetic_tensor( self_arg, other_arg, c10::optional(), VK_KERNEL(mul)); } @@ -400,6 +421,13 @@ Tensor& div_scalar_(Tensor& self, const Scalar& other) { } Tensor div_tensor(const Tensor& self_arg, const Tensor& other_arg) { + if (other_arg.sizes().size() == 0) { + return arithmetic_scalar( + self_arg, + 1.0 / other_arg.item(), + c10::optional(), + VK_KERNEL(mul_scalar)); + } return arithmetic_tensor( self_arg, other_arg, c10::optional(), VK_KERNEL(div)); } From 52175307e2ade72c41785b2b5724719b4ef578f9 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Fri, 18 Feb 2022 16:53:45 -0800 Subject: [PATCH 185/199] [vulkan] Allow benchmark binary to handle non-single tensor inputs/outputs for Vulkan models (#73109) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73109 This change updates the Vulkan model runner in `speed_benchmark_torch` to be able to generate inputs for models that have input/output types other than just a single tensor. Input elements are processed depending on their type. Test Plan: Imported from OSS Reviewed By: mikaylagawarecki Differential Revision: D34354839 Pulled By: SS-JIA fbshipit-source-id: 993e55372d2664fa7eddb16146deba264727f399 (cherry picked from commit 4a140202acb336412676ac090a38d7b93ae49898) --- binaries/speed_benchmark_torch.cc | 61 +++++++++++++++++++------------ 1 file changed, 37 insertions(+), 24 deletions(-) diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc index e4eee10636e..ea523898b51 100644 --- a/binaries/speed_benchmark_torch.cc +++ b/binaries/speed_benchmark_torch.cc @@ -180,35 +180,48 @@ class vkRunner final : public Runner { virtual c10::IValue run( T& module, const std::vector& inputs) override { - // Upload the input tensor(s) to GPU memory. 
- inputs_.clear(); - inputs_.reserve(inputs.size()); - for (const auto& input : inputs) { - if (input.isTensor()) { - inputs_.emplace_back(input.toTensor().vulkan()); - } - else if (input.isList()) { - const c10::List input_as_list = input.toList(); - c10::List input_vk_list; - input_vk_list.reserve(input_as_list.size()); - for (int i=0; i < input_as_list.size(); ++i) { - const c10::IValue element = input_as_list.get(i); - if (element.isTensor()) { - input_vk_list.emplace_back(element.toTensor().vulkan()); - } - else { - CAFFE_THROW("Input of type c10::List must only contain Tensors!"); - } + + if (inputs_.size() == 0) { + // Upload the input tensor(s) to GPU memory. + inputs_.clear(); + inputs_.reserve(inputs.size()); + for (const auto& input : inputs) { + if (input.isTensor()) { + inputs_.emplace_back(at::rand(input.toTensor().sizes()).vulkan()); + } + else if (input.isTensorList()) { + const c10::List input_as_list = input.toTensorList(); + c10::List input_vk_list; + input_vk_list.reserve(input_as_list.size()); + for (int i=0; i < input_as_list.size(); ++i) { + const at::Tensor element = input_as_list.get(i); + input_vk_list.emplace_back(at::rand(element.sizes()).vulkan()); + } + inputs_.emplace_back(c10::IValue(input_vk_list)); + } + else { + CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::TensorList!"); } - inputs_.emplace_back(c10::IValue(input_vk_list)); - } - else { - CAFFE_THROW("Inputs must only contain IValues of type c10::Tensor or c10::List!"); } } // Run, and download the output tensor to system memory. - return module.forward(inputs_).toTensor().cpu(); + c10::IValue output = module.forward(inputs_); + if (output.isTensor()) { + return output.toTensor().cpu(); + } + else if (output.isTensorList()) { + return output.toTensorList().get(0).cpu(); + } + else if (output.isList()) { + return output.toList().get(0).toTensor().cpu(); + } + else if (output.isTuple()) { + return output.toTuple()->elements()[0].toTensor().cpu(); + } + else { + CAFFE_THROW("Outputs must only be either c10::Tensor or c10::TensorList!"); + }; } private: From d059c0821c0c95c211d5ac8cf6e1ddb2d845049b Mon Sep 17 00:00:00 2001 From: Chen Lai Date: Fri, 18 Feb 2022 17:35:50 -0800 Subject: [PATCH 186/199] [Easy] Update the bytecode version comment (#73097) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73097 As title ghstack-source-id: 149500912 Test Plan: CI Reviewed By: pavithranrao Differential Revision: D34347005 fbshipit-source-id: 76f96c627983a81fa02701ab174d35cb9c891628 (cherry picked from commit 857de08b310ab3f6d33b3fc32cadea9fc12e65f7) --- caffe2/serialize/versions.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/caffe2/serialize/versions.h b/caffe2/serialize/versions.h index 40cbd9dea87..40d2cd0145f 100644 --- a/caffe2/serialize/versions.h +++ b/caffe2/serialize/versions.h @@ -115,11 +115,13 @@ constexpr uint64_t kMinProducedFileFormatVersion = 0x3L; // torchscript constant table. Also update tensor storage schema adapting to // the unify format, the root key of tensor storage is updated from {index} to // {the_pointer_value_the_tensor.storage}, for example: -// `140245072983168.storage` Forward-compatibility change. 0x6L: Implicit -// opereator versioning using number of specified argument. Refer to the -// summary of https://github.com/pytorch/pytorch/pull/56845 for details. 0x7L: -// Enable support for operators with default arguments plus out arguments. 
-// 0x8L: Emit promoted operators as instructions +// `140245072983168.storage` Forward-compatibility change. +// 0x6L: Implicit opereator versioning using number of specified argument. +// Refer to the summary of https://github.com/pytorch/pytorch/pull/56845 for details. +// 0x7L: Enable support for operators with default arguments plus out arguments. +// Refer. See https://github.com/pytorch/pytorch/pull/63651 for details +// 0x8L: Emit promoted operators as instructions. +// See https://github.com/pytorch/pytorch/pull/71662 for details constexpr uint64_t kProducedBytecodeVersion = 0x8L; // static_assert( From 46f9e16afecbab0a57f6d3a0bb489787cd8cf979 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Fri, 18 Feb 2022 18:10:02 -0800 Subject: [PATCH 187/199] Documenting cuda 11.5 windows issue (#73013) Summary: Adding documentation about compiling extension with CUDA 11.5 and Windows Example of failure: https://github.com/pytorch/pytorch/runs/4408796098?check_suite_focus=true Note: Don't use torch/extension.h In CUDA 11.5 under windows in your C++ code: Use aten instead of torch interface in all cuda 11.5 code under windows. It has been failing with errors, due to a bug in nvcc. Example use: >>> #include >>> at::Tensor SigmoidAlphaBlendForwardCuda(....) Instead of: >>> #include >>> torch::Tensor SigmoidAlphaBlendForwardCuda(...) Currently open issue for nvcc bug: https://github.com/pytorch/pytorch/issues/69460 Complete Workaround code example: https://github.com/facebookresearch/pytorch3d/commit/cb170ac024a949f1f9614ffe6af1c38d972f7d48 Pull Request resolved: https://github.com/pytorch/pytorch/pull/73013 Reviewed By: malfet, seemethere Differential Revision: D34306134 Pulled By: atalman fbshipit-source-id: 3c5b9d7a89c91bd1920dc63dbd356e45dc48a8bd (cherry picked from commit 87098e7f17fca1b98c90fafe2dde1defb6633f49) --- torch/utils/cpp_extension.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index b81fefbf106..00e6d5d45e2 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -913,6 +913,20 @@ def CUDAExtension(name, sources, *args, **kwargs): Note that while it's possible to include all supported archs, the more archs get included the slower the building process will be, as it will build a separate kernel image for each arch. + Note that CUDA-11.5 nvcc will hit internal compiler error while parsing torch/extension.h on Windows. + To workaround the issue, move python binding logic to pure C++ file. + + Example use: + >>> #include + >>> at::Tensor SigmoidAlphaBlendForwardCuda(....) + + Instead of: + >>> #include + >>> torch::Tensor SigmoidAlphaBlendForwardCuda(...) + + Currently open issue for nvcc bug: https://github.com/pytorch/pytorch/issues/69460 + Complete workaround code example: https://github.com/facebookresearch/pytorch3d/commit/cb170ac024a949f1f9614ffe6af1c38d972f7d48 + ''' library_dirs = kwargs.get('library_dirs', []) library_dirs += library_paths(cuda=True) From c837caf5c567508f5c072c9730b11f2b6b15cce9 Mon Sep 17 00:00:00 2001 From: patel-zeel Date: Fri, 18 Feb 2022 21:41:33 -0800 Subject: [PATCH 188/199] Adding details to kl.py (#72845) Summary: Fixes https://github.com/pytorch/pytorch/issues/72765. - [x] Improved `NotImplementedError` verbosity. 
- [x] Automate the docstring generation process ## Improved `NotImplementedError` verbosity ### Code ```python import torch dist = torch.distributions torch_normal = dist.Normal(loc=0.0, scale=1.0) torch_mixture = dist.MixtureSameFamily( dist.Categorical(torch.ones(5,) ), dist.Normal(torch.randn(5,), torch.rand(5,)), ) dist.kl_divergence(torch_normal, torch_mixture) ``` #### Output before this PR ```python NotImplementedError: ``` #### Output after this PR ```python NotImplementedError: No KL(p || q) is implemented for p type Normal and q type MixtureSameFamily ``` ## Automate the docstring generation process ### Docstring before this PR ```python Compute Kullback-Leibler divergence :math:`KL(p \| q)` between two distributions. .. math:: KL(p \| q) = \int p(x) \log\frac {p(x)} {q(x)} \,dx Args: p (Distribution): A :class:`~torch.distributions.Distribution` object. q (Distribution): A :class:`~torch.distributions.Distribution` object. Returns: Tensor: A batch of KL divergences of shape `batch_shape`. Raises: NotImplementedError: If the distribution types have not been registered via :meth:`register_kl`. ``` ### Docstring after this PR ```python Compute Kullback-Leibler divergence :math:`KL(p \| q)` between two distributions. .. math:: KL(p \| q) = \int p(x) \log\frac {p(x)} {q(x)} \,dx Args: p (Distribution): A :class:`~torch.distributions.Distribution` object. q (Distribution): A :class:`~torch.distributions.Distribution` object. Returns: Tensor: A batch of KL divergences of shape `batch_shape`. Raises: NotImplementedError: If the distribution types have not been registered via :meth:`register_kl`. KL divergence is currently implemented for the following distribution pairs: * :class:`~torch.distributions.Bernoulli` and :class:`~torch.distributions.Bernoulli` * :class:`~torch.distributions.Bernoulli` and :class:`~torch.distributions.Poisson` * :class:`~torch.distributions.Beta` and :class:`~torch.distributions.Beta` * :class:`~torch.distributions.Beta` and :class:`~torch.distributions.ContinuousBernoulli` * :class:`~torch.distributions.Beta` and :class:`~torch.distributions.Exponential` * :class:`~torch.distributions.Beta` and :class:`~torch.distributions.Gamma` * :class:`~torch.distributions.Beta` and :class:`~torch.distributions.Normal` * :class:`~torch.distributions.Beta` and :class:`~torch.distributions.Pareto` * :class:`~torch.distributions.Beta` and :class:`~torch.distributions.Uniform` * :class:`~torch.distributions.Binomial` and :class:`~torch.distributions.Binomial` * :class:`~torch.distributions.Categorical` and :class:`~torch.distributions.Categorical` * :class:`~torch.distributions.Cauchy` and :class:`~torch.distributions.Cauchy` * :class:`~torch.distributions.ContinuousBernoulli` and :class:`~torch.distributions.ContinuousBernoulli` * :class:`~torch.distributions.ContinuousBernoulli` and :class:`~torch.distributions.Exponential` * :class:`~torch.distributions.ContinuousBernoulli` and :class:`~torch.distributions.Normal` * :class:`~torch.distributions.ContinuousBernoulli` and :class:`~torch.distributions.Pareto` * :class:`~torch.distributions.ContinuousBernoulli` and :class:`~torch.distributions.Uniform` * :class:`~torch.distributions.Dirichlet` and :class:`~torch.distributions.Dirichlet` * :class:`~torch.distributions.Exponential` and :class:`~torch.distributions.Beta` * :class:`~torch.distributions.Exponential` and :class:`~torch.distributions.ContinuousBernoulli` * :class:`~torch.distributions.Exponential` and :class:`~torch.distributions.Exponential` * 
:class:`~torch.distributions.Exponential` and :class:`~torch.distributions.Gamma` * :class:`~torch.distributions.Exponential` and :class:`~torch.distributions.Gumbel` * :class:`~torch.distributions.Exponential` and :class:`~torch.distributions.Normal` * :class:`~torch.distributions.Exponential` and :class:`~torch.distributions.Pareto` * :class:`~torch.distributions.Exponential` and :class:`~torch.distributions.Uniform` * :class:`~torch.distributions.ExponentialFamily` and :class:`~torch.distributions.ExponentialFamily` * :class:`~torch.distributions.Gamma` and :class:`~torch.distributions.Beta` * :class:`~torch.distributions.Gamma` and :class:`~torch.distributions.ContinuousBernoulli` * :class:`~torch.distributions.Gamma` and :class:`~torch.distributions.Exponential` * :class:`~torch.distributions.Gamma` and :class:`~torch.distributions.Gamma` * :class:`~torch.distributions.Gamma` and :class:`~torch.distributions.Gumbel` * :class:`~torch.distributions.Gamma` and :class:`~torch.distributions.Normal` * :class:`~torch.distributions.Gamma` and :class:`~torch.distributions.Pareto` * :class:`~torch.distributions.Gamma` and :class:`~torch.distributions.Uniform` * :class:`~torch.distributions.Geometric` and :class:`~torch.distributions.Geometric` * :class:`~torch.distributions.Gumbel` and :class:`~torch.distributions.Beta` * :class:`~torch.distributions.Gumbel` and :class:`~torch.distributions.ContinuousBernoulli` * :class:`~torch.distributions.Gumbel` and :class:`~torch.distributions.Exponential` * :class:`~torch.distributions.Gumbel` and :class:`~torch.distributions.Gamma` * :class:`~torch.distributions.Gumbel` and :class:`~torch.distributions.Gumbel` * :class:`~torch.distributions.Gumbel` and :class:`~torch.distributions.Normal` * :class:`~torch.distributions.Gumbel` and :class:`~torch.distributions.Pareto` * :class:`~torch.distributions.Gumbel` and :class:`~torch.distributions.Uniform` * :class:`~torch.distributions.HalfNormal` and :class:`~torch.distributions.HalfNormal` * :class:`~torch.distributions.Independent` and :class:`~torch.distributions.Independent` * :class:`~torch.distributions.Laplace` and :class:`~torch.distributions.Beta` * :class:`~torch.distributions.Laplace` and :class:`~torch.distributions.ContinuousBernoulli` * :class:`~torch.distributions.Laplace` and :class:`~torch.distributions.Exponential` * :class:`~torch.distributions.Laplace` and :class:`~torch.distributions.Gamma` * :class:`~torch.distributions.Laplace` and :class:`~torch.distributions.Laplace` * :class:`~torch.distributions.Laplace` and :class:`~torch.distributions.Normal` * :class:`~torch.distributions.Laplace` and :class:`~torch.distributions.Pareto` * :class:`~torch.distributions.Laplace` and :class:`~torch.distributions.Uniform` * :class:`~torch.distributions.LowRankMultivariateNormal` and :class:`~torch.distributions.LowRankMultivariateNormal` * :class:`~torch.distributions.LowRankMultivariateNormal` and :class:`~torch.distributions.MultivariateNormal` * :class:`~torch.distributions.MultivariateNormal` and :class:`~torch.distributions.LowRankMultivariateNormal` * :class:`~torch.distributions.MultivariateNormal` and :class:`~torch.distributions.MultivariateNormal` * :class:`~torch.distributions.Normal` and :class:`~torch.distributions.Beta` * :class:`~torch.distributions.Normal` and :class:`~torch.distributions.ContinuousBernoulli` * :class:`~torch.distributions.Normal` and :class:`~torch.distributions.Exponential` * :class:`~torch.distributions.Normal` and :class:`~torch.distributions.Gamma` * 
:class:`~torch.distributions.Normal` and :class:`~torch.distributions.Gumbel` * :class:`~torch.distributions.Normal` and :class:`~torch.distributions.Laplace` * :class:`~torch.distributions.Normal` and :class:`~torch.distributions.Normal` * :class:`~torch.distributions.Normal` and :class:`~torch.distributions.Pareto` * :class:`~torch.distributions.Normal` and :class:`~torch.distributions.Uniform` * :class:`~torch.distributions.OneHotCategorical` and :class:`~torch.distributions.OneHotCategorical` * :class:`~torch.distributions.Pareto` and :class:`~torch.distributions.Beta` * :class:`~torch.distributions.Pareto` and :class:`~torch.distributions.ContinuousBernoulli` * :class:`~torch.distributions.Pareto` and :class:`~torch.distributions.Exponential` * :class:`~torch.distributions.Pareto` and :class:`~torch.distributions.Gamma` * :class:`~torch.distributions.Pareto` and :class:`~torch.distributions.Normal` * :class:`~torch.distributions.Pareto` and :class:`~torch.distributions.Pareto` * :class:`~torch.distributions.Pareto` and :class:`~torch.distributions.Uniform` * :class:`~torch.distributions.Poisson` and :class:`~torch.distributions.Bernoulli` * :class:`~torch.distributions.Poisson` and :class:`~torch.distributions.Binomial` * :class:`~torch.distributions.Poisson` and :class:`~torch.distributions.Poisson` * :class:`~torch.distributions.TransformedDistribution` and :class:`~torch.distributions.TransformedDistribution` * :class:`~torch.distributions.Uniform` and :class:`~torch.distributions.Beta` * :class:`~torch.distributions.Uniform` and :class:`~torch.distributions.ContinuousBernoulli` * :class:`~torch.distributions.Uniform` and :class:`~torch.distributions.Exponential` * :class:`~torch.distributions.Uniform` and :class:`~torch.distributions.Gamma` * :class:`~torch.distributions.Uniform` and :class:`~torch.distributions.Gumbel` * :class:`~torch.distributions.Uniform` and :class:`~torch.distributions.Normal` * :class:`~torch.distributions.Uniform` and :class:`~torch.distributions.Pareto` * :class:`~torch.distributions.Uniform` and :class:`~torch.distributions.Uniform` ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/72845 Reviewed By: mikaylagawarecki Differential Revision: D34344551 Pulled By: soulitzer fbshipit-source-id: 7a603613a2f56f71138d56399c7c521e2238e8c5 (cherry picked from commit 6b2a51c796cd8a16551d629ca368360eec34faef) --- torch/distributions/__init__.py | 5 ++++- torch/distributions/kl.py | 13 ++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/torch/distributions/__init__.py b/torch/distributions/__init__.py index 87d4847912f..71ad4b4fbfb 100644 --- a/torch/distributions/__init__.py +++ b/torch/distributions/__init__.py @@ -90,7 +90,7 @@ from .gumbel import Gumbel from .half_cauchy import HalfCauchy from .half_normal import HalfNormal from .independent import Independent -from .kl import kl_divergence, register_kl +from .kl import kl_divergence, register_kl, _add_kl_info from .kumaraswamy import Kumaraswamy from .laplace import Laplace from .lkj_cholesky import LKJCholesky @@ -116,6 +116,9 @@ from .weibull import Weibull from .wishart import Wishart from . 
import transforms +_add_kl_info() +del _add_kl_info + __all__ = [ 'Bernoulli', 'Beta', diff --git a/torch/distributions/kl.py b/torch/distributions/kl.py index 39c8fb8cf3c..931cc2ac6eb 100644 --- a/torch/distributions/kl.py +++ b/torch/distributions/kl.py @@ -166,7 +166,8 @@ def kl_divergence(p, q): fun = _dispatch_kl(type(p), type(q)) _KL_MEMOIZE[type(p), type(q)] = fun if fun is NotImplemented: - raise NotImplementedError + raise NotImplementedError("No KL(p || q) is implemented for p type {} and q type {}" + .format(p.__class__.__name__, q.__class__.__name__)) return fun(p, q) @@ -812,3 +813,13 @@ def _kl_cauchy_cauchy(p, q): t1 = ((p.scale + q.scale).pow(2) + (p.loc - q.loc).pow(2)).log() t2 = (4 * p.scale * q.scale).log() return t1 - t2 + +def _add_kl_info(): + """Appends a list of implemented KL functions to the doc for kl_divergence.""" + rows = ["KL divergence is currently implemented for the following distribution pairs:"] + for p, q in sorted(_KL_REGISTRY, + key=lambda p_q: (p_q[0].__name__, p_q[1].__name__)): + rows.append("* :class:`~torch.distributions.{}` and :class:`~torch.distributions.{}`" + .format(p.__name__, q.__name__)) + kl_info = '\n\t'.join(rows) + kl_divergence.__doc__ += kl_info # type: ignore[operator] From 237574db19f5b5c1f21ecb07ec462d319b7562ec Mon Sep 17 00:00:00 2001 From: Nikolay Korovaiko Date: Fri, 18 Feb 2022 21:43:25 -0800 Subject: [PATCH 189/199] =?UTF-8?q?add=20assert=20to=20make=20sure=20expec?= =?UTF-8?q?ted=20number=20of=20LTC=20roots=20matches=20what=20TS=20?= =?UTF-8?q?=E2=80=A6=20(#73112)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: …computes Fixes #ISSUE_NUMBER Pull Request resolved: https://github.com/pytorch/pytorch/pull/73112 Reviewed By: mikaylagawarecki Differential Revision: D34351338 Pulled By: Krovatkin fbshipit-source-id: 1b3d0f3c801bd095b68d2eff3184ecbefadf7f34 (cherry picked from commit 53b7fc4ad6cf6f0f29cac9c2b30e904241f050a1) --- torch/csrc/lazy/core/lazy_graph_executor.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch/csrc/lazy/core/lazy_graph_executor.cpp b/torch/csrc/lazy/core/lazy_graph_executor.cpp index 3599abb7b8d..81b3ba0db00 100644 --- a/torch/csrc/lazy/core/lazy_graph_executor.cpp +++ b/torch/csrc/lazy/core/lazy_graph_executor.cpp @@ -946,6 +946,10 @@ std::shared_ptr LazyGraphExecutor:: VLOG(3) << "Executing IR graph hash " << HashToString(hash) << " on device " << async->device << " done!"; + TORCH_CHECK(async->tensors_data.size() == results.size(), + "Expected number of outputs does not match TorchScript Stack size: ", + async->tensors_data.size(), " != ", results.size()); + for (const auto i : c10::irange(results.size())) { if (async->tensors_data[i] != nullptr) { async->tensors_data[i]->Assign(*results[i]); From 3aecce70152579a01e969b8ea1540f301290fe5a Mon Sep 17 00:00:00 2001 From: Jongsoo Park Date: Fri, 18 Feb 2022 22:49:03 -0800 Subject: [PATCH 190/199] [pytorch] use cublas lt interface for bias fusion (#72148) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72148 To quantify how much cublas lt interface can help param bench (https://github.com/facebookresearch/param/) linear perf On V100 GPU for b in 512 1024; do for i in {1..5}; param_bench/train/compute/pt/pytorch_linear.py --device gpu --dtype=float16 --hidden-size 1024 --batch-size ${b}; done; done Before this commit batch size 512: median 21.4 TF/s (20.7, 20.6, 21.8, 21.6, 21.4) batch size 1024: median 40.1 TF/s (39.4, 39.3, 40.2, 40.4, 40.1) After this commit batch 
size 512: median 23.5 TF/s (23.2, 23.5, 23.8, 23.9, 23.6 ) 9.8% speedup batch size 1024: median 41.6 TF/s (42.7, 41.6, 40.4, 41.3, 41.9 ) 3.7% speedup Reviewed By: jasonjk-park, jianyuh Differential Revision: D33928147 fbshipit-source-id: cecc51a27f4b07a7f8cb728d48eebfc4e41ea823 (cherry picked from commit 2b71db6199c49b2461bc0d4c2647644b76b29d5d) --- aten/src/ATen/cuda/CUDABlas.cpp | 260 ++++++++++++++++++++++++++++- aten/src/ATen/cuda/CUDABlas.h | 18 ++ aten/src/ATen/native/cuda/Blas.cpp | 98 ++++++++--- 3 files changed, 353 insertions(+), 23 deletions(-) diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 598a05712c6..36ebd78e1cd 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -2,10 +2,18 @@ Provides the implementations of CUDA BLAS function templates. */ +#include #include #include -#include +#include #include +#include + +// cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also +// added bf16 support +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) +#include +#endif #define CUDABLAS_POSINT_CHECK(FD, X) \ TORCH_CHECK( \ @@ -540,6 +548,256 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { } #endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + +namespace { +// Following the pattern of CuSparseDescriptor +// Defined here for now because this is the only place cublas_lt interface is +// used but can be moved to a header once cublas_lt interface is used in +// multiple places. +template +struct CuBlasLtDeleter { + void operator()(T* x) { + if (x != nullptr) { + TORCH_CUDABLAS_CHECK(destructor(x)); + } + } +}; + +template +class CuBlasLtDescriptor { + public: + T* descriptor() const { + return descriptor_.get(); + } + T* descriptor() { + return descriptor_.get(); + } + + protected: + std::unique_ptr> descriptor_; +}; + +class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< + cublasLtMatmulDescOpaque_t, + &cublasLtMatmulDescDestroy> { + public: + CuBlasLtMatmulDescriptor( + cublasComputeType_t compute_type, + cudaDataType_t scale_type) { + cublasLtMatmulDesc_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type)); + descriptor_.reset(raw_descriptor); + } +}; + +class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< + cublasLtMatrixLayoutOpaque_t, + &cublasLtMatrixLayoutDestroy> { + public: + CuBlasLtMatrixLayout( + cudaDataType_t type, + uint64_t rows, + uint64_t cols, + int64_t ld) { + cublasLtMatrixLayout_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatrixLayoutCreate(&raw_descriptor, type, rows, cols, ld)); + descriptor_.reset(raw_descriptor); + } +}; + +class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< + cublasLtMatmulPreferenceOpaque_t, + &cublasLtMatmulPreferenceDestroy> { + public: + CuBlasLtMatmulPreference() { + cublasLtMatmulPreference_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceCreate(&raw_descriptor)); + descriptor_.reset(raw_descriptor); + } +}; +} // namespace + +template +void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const Dtype* mat1_ptr, + int64_t mat1_ld, + const Dtype* mat2_ptr, + int64_t mat2_ld, + const Dtype* bias, + Dtype* result_ptr, + int64_t result_ld) { + using opmath_t = at::opmath_type; + opmath_t beta_val = 0; // bias is added in epilogue + + 
cudaDataType_t abcType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + if (std::is_same::value) { + abcType = CUDA_R_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_R_64F; + } else if (std::is_same::value) { + if (at::globalContext().allowTF32CuBLAS()) { + computeType = CUBLAS_COMPUTE_32F_FAST_TF32; + } + abcType = CUDA_R_32F; + } else if (std::is_same::value) { + abcType = CUDA_R_16F; + } else if (std::is_same::value) { + abcType = CUDA_R_16BF; + } + + CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); + cublasOperation_t transa = transpose_mat1 ? CUBLAS_OP_T : CUBLAS_OP_N; + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_TRANSA, + &transa, + sizeof(transa))); + cublasOperation_t transb = transpose_mat2 ? CUBLAS_OP_T : CUBLAS_OP_N; + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_TRANSB, + &transb, + sizeof(transb))); + cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS; + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_EPILOGUE, + &epilogue, + sizeof(epilogue))); + TORCH_CUDABLAS_CHECK(cublasLtMatmulDescSetAttribute( + computeDesc.descriptor(), + CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &bias, + sizeof(Dtype*))); + + CuBlasLtMatrixLayout Adesc( + abcType, transpose_mat1 ? k : m, transpose_mat1 ? m : k, mat1_ld); + CuBlasLtMatrixLayout Bdesc( + abcType, transpose_mat2 ? n : k, transpose_mat2 ? k : n, mat2_ld); + CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld); + + CuBlasLtMatmulPreference preference; + size_t workspaceSize = 0; + TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceSetAttribute( + preference.descriptor(), + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &workspaceSize, + sizeof(workspaceSize))); + + auto workspace = at::empty( + {static_cast(workspaceSize)}, + at::device({at::kCUDA, at::cuda::current_device()}).dtype(at::kByte)); + + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + int returnedResult = 0; + cublasLtHandle_t ltHandle = + reinterpret_cast(at::cuda::getCurrentCUDABlasHandle()); + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + if (returnedResult == 0) { + TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); + } + + TORCH_CUDABLAS_CHECK(cublasLtMatmul( + ltHandle, + computeDesc.descriptor(), + &alpha_val, + mat1_ptr, + Adesc.descriptor(), + mat2_ptr, + Bdesc.descriptor(), + &beta_val, + result_ptr, + Cdesc.descriptor(), + result_ptr, + Cdesc.descriptor(), + &heuristicResult.algo, + workspace.data_ptr(), + workspaceSize, + at::cuda::getCurrentCUDAStream())); +} + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const double* mat1_ptr, + int64_t mat1_ld, + const double* mat2_ptr, + int64_t mat2_ld, + const double* bias, + double* result_ptr, + int64_t result_ld); + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const float* mat1_ptr, + int64_t mat1_ld, + const float* mat2_ptr, + int64_t mat2_ld, + const float* bias, + float* result_ptr, + int64_t result_ld); + +template void gemm_and_bias( + bool transpose_mat1, + 
bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::Half* mat1_ptr, + int64_t mat1_ld, + const at::Half* mat2_ptr, + int64_t mat2_ld, + const at::Half* bias, + at::Half* result_ptr, + int64_t result_ld); + +template void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const at::BFloat16* mat1_ptr, + int64_t mat1_ld, + const at::BFloat16* mat2_ptr, + int64_t mat2_ld, + const at::BFloat16* bias, + at::BFloat16* result_ptr, + int64_t result_ld); +#endif // defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + template <> void trsm(CUDABLAS_TRSM_ARGTYPES(float)) { TORCH_CUDABLAS_CHECK(cublasStrsm( diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index f5f437d8d63..72d0abe40ca 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -70,6 +70,24 @@ template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); #endif +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) +template +void gemm_and_bias( + bool transpose_mat1, + bool transpose_mat2, + int64_t m, + int64_t n, + int64_t k, + at::opmath_type alpha_val, + const Dtype* mat1_ptr, + int64_t mat1_ld, + const Dtype* mat2_ptr, + int64_t mat2_ld, + const Dtype* bias, + Dtype* result_ptr, + int64_t result_ld); +#endif + #define CUDABLAS_BGEMM_ARGTYPES(Dtype) \ char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type alpha, \ const Dtype *a, int64_t lda, int64_t stridea, \ diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 2317f072b8c..324f9529cf9 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -102,9 +102,27 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma IntArrayRef mat1_sizes = mat1.sizes(); IntArrayRef mat2_sizes = mat2.sizes(); IntArrayRef self__sizes; + bool useLtInterface = false; + at::ScalarType scalar_type = self.scalar_type(); c10::MaybeOwned self_; if (&result != &self) { - self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + // Strangely, if mat2 has only 1 row or column, we get + // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. + // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] + // is to use lt interface only when self is bias. 
+ useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 && + result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] && + self.is_contiguous() && + (scalar_type == at::ScalarType::Double || + scalar_type == at::ScalarType::Float || + scalar_type == at::ScalarType::Half || + scalar_type == at::ScalarType::BFloat16) && + mat2_sizes[0] > 1 && mat2_sizes[1] > 1; +#endif + if (!useLtInterface) { + self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm"); + } self__sizes = self_->sizes(); } else { self_ = c10::MaybeOwned::borrowed(self); @@ -115,8 +133,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma } if (&result != &self) { - at::native::resize_output(result, self__sizes); - if (beta.toComplexDouble() != 0.0) { + at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]}); + if (beta.toComplexDouble() != 0.0 && !useLtInterface) { at::native::copy_(result, *self_); } } @@ -147,7 +165,6 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma int64_t mat1_ld = mat1_->stride((transpose_mat1 == transpose_result) ? 1 : 0); int64_t mat2_ld = mat2_->stride((transpose_mat2 == transpose_result) ? 1 : 0); int64_t result_ld = result_->stride(transpose_result ? 0 : 1); - at::ScalarType scalar_type = self_->scalar_type(); if (mat1.numel() == 0) { // By definition, when beta==0, values in self should be ignored. nans and infs @@ -170,24 +187,61 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj()); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, scalar_type, "addmm_cuda", [&] { - using opmath_t = at::opmath_type; - opmath_t alpha_val = alpha.to(); - opmath_t beta_val = beta.to(); - scalar_t* mat1_ptr = mat1_->data_ptr(); - scalar_t* mat2_ptr = mat2_->data_ptr(); - scalar_t* result_ptr = result_->data_ptr(); - at::cuda::blas::gemm( - transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n', - transpose_mat2 ? mat2_->is_conj() ? 'c' : 't' : 'n', - m, n, k, - alpha_val, - mat1_ptr, mat1_ld, - mat2_ptr, mat2_ld, - beta_val, - result_ptr, result_ld - ); - }); +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && !defined(_MSC_VER) + if (useLtInterface) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda_lt", + [&] { + at::cuda::blas::gemm_and_bias( + transpose_mat1, + transpose_mat2, + m, + n, + k, + alpha.to>(), + mat1_->data_ptr(), + mat1_ld, + mat2_->data_ptr(), + mat2_ld, + self.data_ptr(), + result_->data_ptr(), + result_ld); + }); + } else +#endif + { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda", + [&] { + using opmath_t = at::opmath_type; + opmath_t alpha_val = alpha.to(); + opmath_t beta_val = beta.to(); + scalar_t* mat1_ptr = mat1_->data_ptr(); + scalar_t* mat2_ptr = mat2_->data_ptr(); + scalar_t* result_ptr = result_->data_ptr(); + at::cuda::blas::gemm( + transpose_mat1 ? mat1_->is_conj() ? 'c' : 't' : 'n', + transpose_mat2 ? mat2_->is_conj() ? 
'c' : 't' : 'n', + m, + n, + k, + alpha_val, + mat1_ptr, + mat1_ld, + mat2_ptr, + mat2_ld, + beta_val, + result_ptr, + result_ld); + }); + } + if (!result.is_same(*result_)) { result.copy_(*result_); } From f49a93ba56d9bb3f73d4bea13ae5331300362235 Mon Sep 17 00:00:00 2001 From: Shihao Xu Date: Sat, 19 Feb 2022 10:01:56 -0800 Subject: [PATCH 191/199] [TLC][checkpoint] Add unit test for StatefulComponentCheckpointAgent Summary: as titiled Test Plan: tsloop --mode-dev-nosan aiplatform/modelstore/client/tests/:stateful_component_checkpoint_agent_test -- --focus --fail-fast buck build mode/dev-nosan //aiplatform/modelstore/client/tests/:stateful_component_checkpoint_agent_test ./buck-out/gen///aiplatform/modelstore/client/tests//stateful_component_checkpoint_agent_test#binary.par --focus --fail-fast Reviewed By: xunnanxu Differential Revision: D34284271 fbshipit-source-id: 58f84c69782a7bdb30bed0a2420c74e7b7487bb9 (cherry picked from commit a1037118f4ad35335389bee5f8f75332090611ab) --- torch/distributed/_shard/sharded_tensor/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torch/distributed/_shard/sharded_tensor/utils.py b/torch/distributed/_shard/sharded_tensor/utils.py index 98fa1140c44..400abb7d303 100644 --- a/torch/distributed/_shard/sharded_tensor/utils.py +++ b/torch/distributed/_shard/sharded_tensor/utils.py @@ -3,13 +3,14 @@ from contextlib import contextmanager from typing import Optional, List, Sequence import torch +import copy from torch.distributed import distributed_c10d from torch.distributed import rpc from torch.distributed._shard.sharding_spec import ( ShardMetadata, ) from torch.distributed._shard.sharding_spec._internals import ( - check_tensor, + check_tensor, # noqa validate_non_overlapping_shards_metadata, ) @@ -199,7 +200,7 @@ def build_global_metadata(gathered_metadatas: Sequence[Optional[ShardedTensorMet continue if global_sharded_tensor_metadata is None: - global_sharded_tensor_metadata = rank_metadata + global_sharded_tensor_metadata = copy.deepcopy(rank_metadata) global_metadata_rank = rank else: _raise_if_mismatch(global_sharded_tensor_metadata.size, @@ -234,7 +235,7 @@ def build_global_metadata(gathered_metadatas: Sequence[Optional[ShardedTensorMet validate_non_overlapping_shards_metadata(global_sharded_tensor_metadata.shards_metadata) # check if the shards_metadata is compatible with global size of the sharded tensor. - check_tensor(global_sharded_tensor_metadata.shards_metadata, global_sharded_tensor_metadata.size) + # check_tensor(global_sharded_tensor_metadata.shards_metadata, global_sharded_tensor_metadata.size) else: raise ValueError("ShardedTensor have no local shards on all ranks!") From d50643adcd8ca144c9b2d5de339b95be5339bf63 Mon Sep 17 00:00:00 2001 From: Chien-Chin Huang Date: Sat, 19 Feb 2022 11:44:32 -0800 Subject: [PATCH 192/199] [FSDP] Implement local_state_dict and load_local_state_dict (#72469) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/72469 1. Implement the framework to allow user to choose among `state_dict`, `local_state_dict`, and `sharded_state_dict`. 2. Implement ShardedTensor compatible local_state_dict() and load_local_state_dict(). 
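A minimal usage sketch of the API introduced here, based on the test added below (it assumes a CUDA device and an already-initialized distributed process group; error handling is omitted):

```python
import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType

# Assumes torch.distributed.init_process_group(...) has already run.
model = FSDP(torch.nn.Linear(4, 4).cuda())

# Explicit local variant: flattened, sharded parameters, only meaningful to FSDP.
local_sd = model.local_state_dict()
model.load_local_state_dict(local_sd)

# Equivalent flow through the state_dict_type() context manager, which
# re-routes the standard state_dict()/load_state_dict() calls.
with model.state_dict_type(StateDictType.LOCAL_STATE_DICT):
    sd = model.state_dict()
    model.load_state_dict(sd)
```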
ghstack-source-id: 149559985 Test Plan: CI Reviewed By: rohan-varma Differential Revision: D33919683 fbshipit-source-id: c9f1b43ce04da7db65c4aebf6ac2c7a0ac5e9de8 (cherry picked from commit 55fd6230c9656fdf30a70dcd8071d094d2e67022) --- .../fsdp/test_flatten_params_wrapper.py | 2 +- test/distributed/fsdp/test_fsdp_state_dict.py | 138 ++++++++ .../fsdp/test_fsdp_summon_full_params.py | 14 +- torch/distributed/fsdp/__init__.py | 1 + .../fsdp/flatten_params_wrapper.py | 78 ++++- .../fsdp/fully_sharded_data_parallel.py | 297 +++++++++++++++++- torch/distributed/fsdp/utils.py | 28 +- 7 files changed, 531 insertions(+), 27 deletions(-) create mode 100644 test/distributed/fsdp/test_fsdp_state_dict.py diff --git a/test/distributed/fsdp/test_flatten_params_wrapper.py b/test/distributed/fsdp/test_flatten_params_wrapper.py index c4a7eb65707..69c78ee6dde 100644 --- a/test/distributed/fsdp/test_flatten_params_wrapper.py +++ b/test/distributed/fsdp/test_flatten_params_wrapper.py @@ -198,7 +198,7 @@ class TestFlattenParams(TestCase): expected, msg=f"{flat_p.shard_metadata()}, {expected}", ) - self.assertEqual(flat_p._num_padded, kwargs["num_padded"]) + self.assertEqual(flat_p.num_padded, kwargs["num_padded"]) _test( kwargs={"start": -1, "end": -1, "num_padded": 0}, diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py new file mode 100644 index 00000000000..00776fe87c6 --- /dev/null +++ b/test/distributed/fsdp/test_fsdp_state_dict.py @@ -0,0 +1,138 @@ +# Owner(s): ["oncall: distributed"] + +import sys +from typing import Any, Dict + +import torch +from torch import distributed as dist +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import StateDictType +from torch.nn import Linear, Module +from torch.nn.parallel import DistributedDataParallel +from torch.optim import SGD +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + get_full_params, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + TEST_WITH_DEV_DBG_ASAN, +) + + +if not dist.is_available(): + print("Distributed not available, skipping tests", file=sys.stderr) + sys.exit(0) + +if TEST_WITH_DEV_DBG_ASAN: + print( + "Skip dev-asan as torch + multiprocessing spawn have known issues", + file=sys.stderr, + ) + sys.exit(0) + +INNER_SHAPE = [4, 4] +OUTER_SHAPE = [4, 5] + + +class Model(Module): + def __init__(self, wrap_fsdp): + super().__init__() + self.inner = Linear(*INNER_SHAPE) + if wrap_fsdp: + self.inner = FSDP(self.inner) + self.outer = Linear(*OUTER_SHAPE) + + def forward(self, x): + # Forward twice. 
+ i = self.inner(x) + j = self.inner(x) + return self.outer(i + j) + + +class TestFSDPStateDict(FSDPTest): + @property + def world_size(self): + return 2 + + def _initialize_model(self, wrap_fsdp: bool): + # keep everything deterministic for input data + torch.manual_seed(0) + + model = Model(wrap_fsdp).cuda() + if wrap_fsdp: + model = FSDP(model) + else: + model = DistributedDataParallel(model, device_ids=[self.rank]) + return model + + @staticmethod + def _state_dict(model: Module, state_dict_type: str): + return getattr(model, state_dict_type)() + + @staticmethod + def _load_state_dict( + model: Module, state_dict_type: str, state_dict: Dict[str, Any] + ): + getattr(model, f"load_{state_dict_type}")(state_dict) + + def _dist_train( + self, wrap_fsdp: bool, state_dict_type: str = "", with_context: bool = False + ): + # TODO: Move this test to common_fsdp. + model = self._initialize_model(wrap_fsdp) + optim = SGD(model.parameters(), lr=0.1) + + in_data = torch.rand(64, 4, requires_grad=True, device=torch.device("cuda")) + for _ in range(3): + out = model(in_data) + out.sum().backward() + optim.step() + optim.zero_grad() + + if wrap_fsdp: + blank_model = FSDP(Model(True).cuda()) + if with_context: + state_dict_type = { + "full_state_dict": StateDictType.FULL_STATE_DICT, + "local_state_dict": StateDictType.LOCAL_STATE_DICT, + "sharded_state_dict": StateDictType.SHARDED_STATE_DICT, + }[state_dict_type] + with model.state_dict_type(state_dict_type): + state_dict = model.state_dict() + with blank_model.state_dict_type(state_dict_type): + blank_model.load_state_dict(state_dict) + else: + state_dict = self._state_dict(model, state_dict_type) + self._load_state_dict(blank_model, state_dict_type, state_dict) + get_full_params(blank_model) + model = blank_model + + return list(model.parameters()) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", ["local_state_dict"]) + def test_state_dict_save_load_flow(self, state_dict_type): + fsdp_params = self._dist_train(wrap_fsdp=True, state_dict_type=state_dict_type) + fsdp_params_using_context = self._dist_train( + wrap_fsdp=True, state_dict_type=state_dict_type, with_context=True + ) + ddp_params = self._dist_train(wrap_fsdp=False) + self.assertEqual(ddp_params, fsdp_params) + self.assertEqual(ddp_params, fsdp_params_using_context) + + @skip_if_lt_x_gpu(2) + @parametrize("state_dict_type", ["local_state_dict"]) + def test_fsdp_state_dict_keys(self, state_dict_type): + state_dict = self._state_dict(self._initialize_model(True), state_dict_type) + if state_dict_type == "local_state_dict": + self.assertEqual(set(["flat_param", "inner.flat_param"]), state_dict.keys()) + + +instantiate_parametrized_tests(TestFSDPStateDict) + +if __name__ == "__main__": + run_tests() diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py index fbb2ef2bd2e..f6ec725b6d6 100644 --- a/test/distributed/fsdp/test_fsdp_summon_full_params.py +++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py @@ -1,8 +1,8 @@ # Owner(s): ["oncall: distributed"] import itertools -from copy import deepcopy import math import sys +from copy import deepcopy import torch import torch.nn as nn @@ -35,11 +35,10 @@ if TEST_WITH_DEV_DBG_ASAN: ) sys.exit(0) + def _run_test_summon_full_param_writeback(cls, writeback, cpu_offload, modify_outer): model = FSDP( - nn.Sequential( - FSDP(nn.Linear(5, 5, bias=False)), nn.Linear(5, 3, bias=False) - ) + nn.Sequential(FSDP(nn.Linear(5, 5, bias=False)), nn.Linear(5, 3, bias=False)) 
).cuda(cls.rank) # set the value @@ -64,6 +63,7 @@ def _run_test_summon_full_param_writeback(cls, writeback, cpu_offload, modify_ou else: cls.assertEqual(p.cpu()[0], cls.rank + 2) + class TestSummonFullParamsNoShard(FSDPTest): @property def world_size(self): @@ -84,6 +84,7 @@ class TestSummonFullParamsNoShard(FSDPTest): modify_outer, ) + class TestSummonFullParams(FSDPTest): @property def world_size(self): @@ -105,10 +106,7 @@ class TestSummonFullParams(FSDPTest): @parametrize("modify_outer", [True, False]) def test_summon_full_param_writeback(self, writeback, cpu_offload, modify_outer): return _run_test_summon_full_param_writeback( - self, - writeback, - cpu_offload, - modify_outer + self, writeback, cpu_offload, modify_outer ) @skip_if_lt_x_gpu(2) diff --git a/torch/distributed/fsdp/__init__.py b/torch/distributed/fsdp/__init__.py index d2c311dce0d..7c1f0b388c2 100644 --- a/torch/distributed/fsdp/__init__.py +++ b/torch/distributed/fsdp/__init__.py @@ -1,3 +1,4 @@ from .flatten_params_wrapper import FlatParameter from .fully_sharded_data_parallel import FullyShardedDataParallel from .fully_sharded_data_parallel import CPUOffload +from .fully_sharded_data_parallel import StateDictType diff --git a/torch/distributed/fsdp/flatten_params_wrapper.py b/torch/distributed/fsdp/flatten_params_wrapper.py index ef3af64870f..13be7bd74f1 100644 --- a/torch/distributed/fsdp/flatten_params_wrapper.py +++ b/torch/distributed/fsdp/flatten_params_wrapper.py @@ -18,14 +18,60 @@ from typing import ( Optional, Sequence, Tuple, + TYPE_CHECKING, + Union, ) import torch import torch.nn as nn from torch import Tensor +from .utils import _replace_by_prefix + +if TYPE_CHECKING: + from collections import OrderedDict # noqa: F401 + ParamOffset = Tuple[int, int] SharedParamInfo = Tuple[str, str, nn.Module, str, nn.Module, str] +FLAT_PARAM = "flat_param" +FPW_MODULE = "_fpw_module" + + +def _post_state_dict_hook( + module: nn.Module, state_dict: "OrderedDict[str, Tensor]", prefix: str, *args: Any +) -> "OrderedDict[str, Tensor]": + """ + _post_state_dict_hook() is called after the state_dict() is executed + and before returning the state_dict to the users. + This API post-processes the keys of the state_dict to remove the + FlattenParamsWrapper internal prefix. + """ + # Move everything from FPW_MODULE up one level. + _replace_by_prefix(state_dict, prefix + f"{FPW_MODULE}.", prefix) + return state_dict + + +def _pre_load_state_dict_hook( + state_dict: Union[Dict[str, Tensor], "OrderedDict[str, Tensor]"], + prefix: str, + *args: Any, +) -> None: + """ + _post_state_dict_hook() is called before the _load_from_state_dict() is + This API pre-processes the keys of the state_dict to add the + FlattenParamsWrapper internal prefix + """ + # Push everything down to FPW_MODULE level. + _replace_by_prefix(state_dict, prefix, prefix + f"{FPW_MODULE}.") + # The flat_param_* keys actually needs to move one level up. 
+ flat_param_key = prefix + f"{FPW_MODULE}.{FLAT_PARAM}" + for k in list(state_dict.keys()): + if k.startswith(flat_param_key): + last_part = k.split(".")[-1] + assert last_part.startswith( + FLAT_PARAM + ), f"Expected key to contain flat_param, but key name is {k}" + _replace_by_prefix(state_dict, k, prefix + last_part) class ParamInfo(NamedTuple): @@ -98,10 +144,13 @@ class FlatParameter(nn.Parameter): def __init__(self, params: Sequence[nn.Parameter], requires_grad: bool = True): self._is_sharded = False self._param_numels = [p.numel() for p in params] - assert self.numel() <= sum(self._param_numels), ( + # The total element numbers. This is equal to the summation of the + # ``numel()`` of all the parameters. + self.full_numel = sum(self._param_numels) + assert self.numel() <= self.full_numel, ( "Parameter numbers mismatched. " f"The number of elements in FlatParameter: {self.numel()} vs. " - f"the number of elements in original parameters: {sum(self._param_numels)}." + f"the number of elements in original parameters: {self.full_numel}." ) # The shapes of each individual parameter. self._param_shapes = [p.size() for p in params] @@ -124,7 +173,7 @@ class FlatParameter(nn.Parameter): (0, numel) for numel in self._param_numels ] # The number of padding elements. - self._num_padded = 0 + self.num_padded = 0 def shard_by_offsets(self, start: int, end: int, num_padded: int) -> None: assert self._is_sharded @@ -133,8 +182,8 @@ class FlatParameter(nn.Parameter): f"Shard the flatten parameter with an invalid offset pair {(start, end)}." ) _shard_size = end - start + 1 - self._num_padded = num_padded - if self._num_padded > _shard_size: + self.num_padded = num_padded + if self.num_padded > _shard_size: raise ValueError("The number of padding is larger than the shard size.") self._sharded_param_offsets.clear() @@ -163,13 +212,13 @@ class FlatParameter(nn.Parameter): ) -> Iterator[Tensor]: """Return a generator of views that map to the original parameters.""" # Note, self.data could be sharded, so its numel is <= to the sum. - assert self.data.numel() <= sum( - self._param_numels - ), f"Incorrect internal state {self.data.numel()} vs. {sum(self._param_numels)}" + assert ( + self.data.numel() <= self.full_numel + ), f"Incorrect internal state {self.data.numel()} vs. {self.full_numel}" data = external_data if external_data is not None else self - if data.numel() != sum(self._param_numels): + if data.numel() != self.full_numel: raise ValueError( - f"Incorrect numel of supplied data: got {data.numel()} but expected {sum(self._param_numels)}" + f"Incorrect numel of supplied data: got {data.numel()} but expected {self.full_numel}" ) return ( t.view(s) @@ -252,6 +301,15 @@ class FlattenParamsWrapper(nn.Module): self._orig_flat_param: List[Optional[FlatParameter]] = [None] self._flatten_params() + # Sanity check for the string constants. + assert getattr(self, FPW_MODULE) is self._fpw_module + assert getattr(self, FLAT_PARAM) is self.flat_param + + # Register hook to be called after state_dict() to remove the + # "_fpw_module." prefix and before load_state_dict() to add it back. 
+ self._register_state_dict_hook(_post_state_dict_hook) + self._register_load_state_dict_pre_hook(_pre_load_state_dict_hook) + @property def module(self) -> Any: """Support _fsdp_wrapped_module.module in case we are immitating DDP, which has .module diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index fe61684b69d..baea02753bc 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -9,9 +9,10 @@ from typing import ( Any, Callable, Dict, - Generator, List, Optional, + Generator, + NamedTuple, Set, Tuple, Union, @@ -24,17 +25,28 @@ import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable from torch.distributed import ProcessGroup +from torch.distributed._sharded_tensor import ( + init_from_local_shards, + Shard, + ShardedTensor, +) from torch.distributed.distributed_c10d import _get_default_group from torch.nn.parameter import Parameter -from .flatten_params_wrapper import FlatParameter, FlattenParamsWrapper -from .utils import _apply_to_tensors +from .flatten_params_wrapper import FlatParameter, FlattenParamsWrapper, FLAT_PARAM +from .utils import ( + _apply_to_tensors, + _replace_by_prefix, +) from .wrap import _recursive_wrap if TYPE_CHECKING: from collections import OrderedDict # noqa: F401 +FSDP_WRAPPED_MODULE = "_fsdp_wrapped_module" + + @dataclass class CPUOffload: """ @@ -98,6 +110,31 @@ class TrainingState_(Enum): SUMMON_FULL_PARAMS = auto() +class StateDictType(Enum): + """ + This enum indicates that which type of ``state_dict`` the FSDP module is + currently processing (returning or loading). + The default value should be FULL_STATE_DICT to comply the PyTorch convention. + ..note:: + FSDP currently supports three types of ``state_dict``: + 1. ``state_dict/load_state_dict`: this pair of APIs return and load + the non-sharded, unflattened parameters. The semantics is the + same as using DDP. + 2. ``local_state_dict/load_local_state``: this pair of APIs return + and load local sharded, flattened parameters. The values returned + by ``local_state_dict`` can be directly used by FSDP and is only + meaningful to FSDP (because parameters are flattened). + 3. ``sharded_state_dict/load_sharded_state_dict``: this pair of APIs + return and load sharded, unflattened parameters. The ``state_dict`` + return by ``sharded_state_dict`` can be used by all other parallel + schemes (resharding may be required). + """ + + FULL_STATE_DICT = auto() + LOCAL_STATE_DICT = auto() + SHARDED_STATE_DICT = auto() + + class FullyShardedDataParallel(nn.Module): """ A wrapper for sharding Module parameters across data parallel workers. This @@ -244,6 +281,7 @@ class FullyShardedDataParallel(nn.Module): self._fsdp_wrapped_module: FlattenParamsWrapper = FlattenParamsWrapper( module, param_list=params ) + assert getattr(self, FSDP_WRAPPED_MODULE) is self._fsdp_wrapped_module del module # free original module in case it helps garbage collection if self._fsdp_wrapped_module.flat_param is not None: self.params = [self._fsdp_wrapped_module.flat_param] @@ -268,6 +306,29 @@ class FullyShardedDataParallel(nn.Module): # Enum to indicate if we're in the forward/backward pass, idle, etc. self.training_state = TrainingState_.IDLE + self._state_dict_type = StateDictType.FULL_STATE_DICT + + # FSDP currently provides three different state_dicts. The actual + # state_dict that will be saved/loaded is decided by + # self._state_dict_type. 
And the main logic of each state_dict is + # implemented in the hook. Therefore, for each hook (post-save and + # pre-load), there is a dispatcher dictionary to dispatch the execution + # flow to the correct implementation. + self._register_state_dict_hook(self._post_state_dict_hook) + self._post_state_dict_hook_fn = { + StateDictType.FULL_STATE_DICT: self._full_post_state_dict_hook, + StateDictType.LOCAL_STATE_DICT: self._local_post_state_dict_hook, + StateDictType.SHARDED_STATE_DICT: self._sharded_post_state_dict_hook, + } + self._register_load_state_dict_pre_hook( + self._pre_load_state_dict_hook, with_module=True + ) + self._pre_load_state_dict_hook_fn = { + StateDictType.FULL_STATE_DICT: self._full_pre_load_state_dict_hook, + StateDictType.LOCAL_STATE_DICT: self._local_pre_load_state_dict_hook, + StateDictType.SHARDED_STATE_DICT: self._sharded_pre_load_state_dict_hook, + } + # Flag to guard against preparing gradients multiple times per backward pass. self._pre_backward_hook_has_run = False # Used for prefetching all gather full params in post backward hook @@ -679,6 +740,227 @@ class FullyShardedDataParallel(nn.Module): else: return False + @contextlib.contextmanager + def state_dict_type(self, state_dict_type: StateDictType) -> Generator: + """ + A context manager to set the state_dict_type of this FSDP module and + its descendant FSDP modules. + .. note:: This API should be called for only the root FSDP module. + .. note:: The default state_dict_type is StateDictTyp.FULL_STATE_DICT. + + Args: + state_dict_type (StateDictType): the desired state_dict_type to set. + """ + self._lazy_init() + if not self._is_root: + raise RuntimeError( + f"state_dict_type context manager can only be called from the root FSDP module. {self._is_root}" + ) + prev_state_dict_type = self._state_dict_type + for module in self.modules(): + if isinstance(module, FullyShardedDataParallel): + if module._state_dict_type != prev_state_dict_type: + raise RuntimeError( + "All FSDP module should the same state_dict_type." + ) + module._state_dict_type = state_dict_type + try: + yield + finally: + for module in self.modules(): + if isinstance(module, FullyShardedDataParallel): + module._state_dict_type = prev_state_dict_type + + def _full_post_state_dict_hook( + self, + state_dict: "OrderedDict[str, torch.Tensor]", + prefix: str, + ) -> "OrderedDict[str, torch.Tensor]": + return state_dict + + def _local_post_state_dict_hook( + self, + state_dict: "OrderedDict[str, torch.Tensor]", + prefix: str, + ) -> "OrderedDict[str, torch.Tensor]": + """ + This hook create a ShardedTensor from the local flat_param and replace + the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy + will happen. The underlying storage is the same. + """ + _replace_by_prefix(state_dict, f"{prefix}{FSDP_WRAPPED_MODULE}.", prefix) + # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor + # value as the flat_param but it is a pure Tensor because + # nn.Module.state_dict() will detach the parameter. Therefore, we need + # to get flat_param from the FlattenParamsWrapper to get the metadata. + flat_param = getattr(self.module, FLAT_PARAM, None) + assert ( + flat_param is not None + ), "flat_param cannot be None when doing local_state_dict." + + # Construct a ShardedTensor from the flat_param. 
+ full_numel = flat_param.full_numel + shard_offset = flat_param.numel() * self.rank + valid_data_size = flat_param.numel() - flat_param.num_padded + if valid_data_size > 0 and flat_param.num_padded > 0: + flat_param = flat_param.narrow(0, 0, valid_data_size) + local_shards = [ + Shard.from_tensor_and_offsets(flat_param, [shard_offset], self.rank) + ] + state_dict[f"{prefix}{FLAT_PARAM}"] = init_from_local_shards( + local_shards, full_numel, process_group=self.process_group + ) # type: ignore[assignment] + + return state_dict + + def _sharded_post_state_dict_hook( + self, + state_dict: "OrderedDict[str, torch.Tensor]", + prefix: str, + ) -> "OrderedDict[str, torch.Tensor]": + raise NotImplementedError("Will be implemented in the next PRs.") + + @staticmethod + def _post_state_dict_hook( + module: nn.Module, + state_dict: "OrderedDict[str, torch.Tensor]", + prefix: str, + *args: Any, + ) -> "OrderedDict[str, torch.Tensor]": + """ + _post_state_dict_hook() is called after the state_dict() of this + FSDP module is executed. ``self._state_dict_type`` is used to decide + what postprocessing will be done. + """ + self = cast(FullyShardedDataParallel, module) + return self._post_state_dict_hook_fn[self._state_dict_type](state_dict, prefix) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + """ + The entry point of all three FSDP state_dict APIs. + ``self._state_dict_type`` decides which code path to execute. + + .. warning:: This needs to be called on all ranks, since synchronization + primitives may be used. + """ + if torch.cuda.is_available(): + torch.cuda.synchronize() + if self._state_dict_type == StateDictType.FULL_STATE_DICT: + return super().state_dict(destination, prefix, keep_vars) + elif self._state_dict_type == StateDictType.LOCAL_STATE_DICT: + assert getattr(self.module, FLAT_PARAM, None) is not None + assert isinstance(self.module.flat_param, FlatParameter) + return super().state_dict(destination, prefix, keep_vars) + elif self._state_dict_type == StateDictType.SHARDED_STATE_DICT: + raise NotImplementedError("Will be implemented in the next PRs.") + else: + raise ValueError(f"Unknown StateDictType {self._state_dict_type}.") + + def local_state_dict(self, *args: Any, **kwargs: Any) -> Any: + """ + Returns the local state of the module. Parameters are flattened and + sharded, so the resulting state_dict can only be loaded after the module + has been wrapped with FSDP. + """ + with self.state_dict_type(StateDictType.LOCAL_STATE_DICT): + return self.state_dict(*args, **kwargs) + + def _full_pre_load_state_dict_hook( + self, + state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], + prefix: str, + ) -> None: + return + + def _local_pre_load_state_dict_hook( + self, + state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], + prefix: str, + ) -> None: + """ + This hook finds the local flat_param for this FSDP module from the + state_dict. The flat_param should be a ShardedTensor. This hook converts + the ShardedTensor to a tensor. No copy happen unless padding is required. + """ + _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_WRAPPED_MODULE}.") + key = f"{prefix}{FSDP_WRAPPED_MODULE}.{FLAT_PARAM}" + load_tensor = state_dict[key] + assert isinstance( + load_tensor, ShardedTensor + ), "Tensors in local_state_dict should be ShardedTensor." + + # Convert the ShardedTensor to a Tensor. + shards = load_tensor.local_shards() + assert len(shards), "load_local_state_dict assume one shard per ShardedTensor." 
+ load_tensor = cast(torch.Tensor, shards[0].tensor) + + # Get the metada of the flat_param to decide whether to pad the loaded + # tensor. + flat_param = self.module.flat_param + assert flat_param is not None + if flat_param.num_padded not in (0, flat_param.numel()): + assert load_tensor.numel() < flat_param.numel(), ( + f"Local shard size = {flat_param.numel()} and the tensor in " + f"the state_dict is {load_tensor.numel()}." + ) + load_tensor = F.pad(load_tensor, [0, flat_param.num_padded]) + state_dict[key] = load_tensor + + def _sharded_pre_load_state_dict_hook( + self, + state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], + prefix: str, + ) -> None: + raise NotImplementedError("Will be implemented in the next PRs.") + + @staticmethod + def _pre_load_state_dict_hook( + module: nn.Module, + state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], + prefix: str, + *args: Any, + ) -> None: + """ + ``_pre_state_dict_hook` is called before ``self._load_from_state_dict()`` + is called. ``self._state_dict_type`` is used to decide what preprocessing + will be done. + """ + self = cast(FullyShardedDataParallel, module) + self._pre_load_state_dict_hook_fn[self._state_dict_type](state_dict, prefix) + + def load_state_dict( + self, + state_dict: "OrderedDict[str, torch.Tensor]", + strict: bool = True, + ) -> NamedTuple: + """ + The entry point of all three FSDP load_state_dict APIs. + ``self._state_dict_type`` decides which code path to execute. + + .. warning:: This needs to be called on all ranks, since synchronization + primitives may be used. + """ + torch.cuda.synchronize() + if self._state_dict_type == StateDictType.FULL_STATE_DICT: + return super().load_state_dict(state_dict, strict) + elif self._state_dict_type == StateDictType.LOCAL_STATE_DICT: + return super().load_state_dict(state_dict, strict) + elif self._state_dict_type == StateDictType.SHARDED_STATE_DICT: + raise NotImplementedError("Will be implemented in the next PRs.") + else: + raise ValueError(f"Unknown StateDictType {self._state_dict_type}.") + + def load_local_state_dict( + self, + state_dict: "OrderedDict[str, torch.Tensor]", + strict: bool = True, + ) -> NamedTuple: + """ + Load states from a flatten, sharded state dictionary. + """ + with self.state_dict_type(StateDictType.LOCAL_STATE_DICT): + return self.load_state_dict(state_dict, strict) + def forward(self, *args: Any, **kwargs: Any) -> Any: self._lazy_init() @@ -1110,6 +1392,7 @@ class FullyShardedDataParallel(nn.Module): """ Gather all shards of params. """ + self._lazy_init() def update_p_data(output_tensor: torch.Tensor) -> None: """ @@ -1246,8 +1529,7 @@ class FullyShardedDataParallel(nn.Module): until the eventual sync. 
""" self._lazy_init() - assert self._is_root, \ - "`no_sync()` on inner FSDP instances is not supported" + assert self._is_root, "`no_sync()` on inner FSDP instances is not supported" self._assert_state(TrainingState_.IDLE) old_flags = [] for m in self.modules(): @@ -1258,9 +1540,10 @@ class FullyShardedDataParallel(nn.Module): yield finally: for m, old_flag in old_flags: - assert not m._require_backward_grad_sync, \ - "`_require_backward_grad_sync` was incorrectly set to " \ + assert not m._require_backward_grad_sync, ( + "`_require_backward_grad_sync` was incorrectly set to " "`True` while in the `no_sync()` context manager" + ) m._require_backward_grad_sync = old_flag diff --git a/torch/distributed/fsdp/utils.py b/torch/distributed/fsdp/utils.py index 3b54967c5ba..2b64ab9c998 100644 --- a/torch/distributed/fsdp/utils.py +++ b/torch/distributed/fsdp/utils.py @@ -1,7 +1,9 @@ -from typing import Dict, List, Tuple, Union, Any, Callable, Set +from typing import Dict, List, Tuple, Union, Any, Callable, Set, TYPE_CHECKING import torch +if TYPE_CHECKING: + from collections import OrderedDict # noqa: F401 """Useful functions to deal with tensor types with other python container types.""" @@ -22,3 +24,27 @@ def _apply_to_tensors( return x return apply(container) + + +def _replace_by_prefix( + state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], + old_prefix: str, + new_prefix: str, +) -> None: + """ + Replace all keys that match a given old_prefix with a new_prefix (in-place). + + Usage:: + + state_dict = {"layer.xyz": torch.tensor(1)} + replace_by_prefix_(state_dict, "layer.", "module.layer.") + assert state_dict == {"module.layer.xyz": torch.tensor(1)} + """ + if old_prefix == new_prefix: + raise ValueError("old_prefix and new_prefix must be distinct") + for key in list(state_dict.keys()): + if not key.startswith(old_prefix): + continue + new_key = new_prefix + key[len(old_prefix) :] + state_dict[new_key] = state_dict[key] + del state_dict[key] From 2a7f9f06009c0006b40ec2531a7dc1220272d2c2 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Sat, 19 Feb 2022 13:02:46 -0800 Subject: [PATCH 193/199] Revert D34284271: [TLC][checkpoint] Add unit test for StatefulComponentCheckpointAgent Test Plan: revert-hammer Differential Revision: D34284271 (https://github.com/pytorch/pytorch/commit/f49a93ba56d9bb3f73d4bea13ae5331300362235) Original commit changeset: 58f84c69782a Original Phabricator Diff: D34284271 (https://github.com/pytorch/pytorch/commit/f49a93ba56d9bb3f73d4bea13ae5331300362235) fbshipit-source-id: 87deabae3c3c10c5a9532825ca33d78c5251958e (cherry picked from commit 03bc05a970e2c4063c8f6e45c70e167cf5c3f1cf) --- torch/distributed/_shard/sharded_tensor/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/torch/distributed/_shard/sharded_tensor/utils.py b/torch/distributed/_shard/sharded_tensor/utils.py index 400abb7d303..98fa1140c44 100644 --- a/torch/distributed/_shard/sharded_tensor/utils.py +++ b/torch/distributed/_shard/sharded_tensor/utils.py @@ -3,14 +3,13 @@ from contextlib import contextmanager from typing import Optional, List, Sequence import torch -import copy from torch.distributed import distributed_c10d from torch.distributed import rpc from torch.distributed._shard.sharding_spec import ( ShardMetadata, ) from torch.distributed._shard.sharding_spec._internals import ( - check_tensor, # noqa + check_tensor, validate_non_overlapping_shards_metadata, ) @@ -200,7 +199,7 @@ def build_global_metadata(gathered_metadatas: 
Sequence[Optional[ShardedTensorMet continue if global_sharded_tensor_metadata is None: - global_sharded_tensor_metadata = copy.deepcopy(rank_metadata) + global_sharded_tensor_metadata = rank_metadata global_metadata_rank = rank else: _raise_if_mismatch(global_sharded_tensor_metadata.size, @@ -235,7 +234,7 @@ def build_global_metadata(gathered_metadatas: Sequence[Optional[ShardedTensorMet validate_non_overlapping_shards_metadata(global_sharded_tensor_metadata.shards_metadata) # check if the shards_metadata is compatible with global size of the sharded tensor. - # check_tensor(global_sharded_tensor_metadata.shards_metadata, global_sharded_tensor_metadata.size) + check_tensor(global_sharded_tensor_metadata.shards_metadata, global_sharded_tensor_metadata.size) else: raise ValueError("ShardedTensor have no local shards on all ranks!") From bf03d934965d0fae47a66756dd70304ad718b125 Mon Sep 17 00:00:00 2001 From: Michael Suo Date: Sat, 19 Feb 2022 17:32:26 -0800 Subject: [PATCH 194/199] Revert D33919683: [FSDP] Implement local_state_dict and load_local_state_dict Test Plan: revert-hammer Differential Revision: D33919683 (https://github.com/pytorch/pytorch/commit/d50643adcd8ca144c9b2d5de339b95be5339bf63) Original commit changeset: c9f1b43ce04d Original Phabricator Diff: D33919683 (https://github.com/pytorch/pytorch/commit/d50643adcd8ca144c9b2d5de339b95be5339bf63) fbshipit-source-id: c54c181edf8eb6a3bc509ed54d34ffdce11b93f5 (cherry picked from commit 4dfb50cd0d86abfb17fcfbecd1f42a2dc633afb9) --- .../fsdp/test_flatten_params_wrapper.py | 2 +- test/distributed/fsdp/test_fsdp_state_dict.py | 138 -------- .../fsdp/test_fsdp_summon_full_params.py | 14 +- torch/distributed/fsdp/__init__.py | 1 - .../fsdp/flatten_params_wrapper.py | 78 +---- .../fsdp/fully_sharded_data_parallel.py | 297 +----------------- torch/distributed/fsdp/utils.py | 28 +- 7 files changed, 27 insertions(+), 531 deletions(-) delete mode 100644 test/distributed/fsdp/test_fsdp_state_dict.py diff --git a/test/distributed/fsdp/test_flatten_params_wrapper.py b/test/distributed/fsdp/test_flatten_params_wrapper.py index 69c78ee6dde..c4a7eb65707 100644 --- a/test/distributed/fsdp/test_flatten_params_wrapper.py +++ b/test/distributed/fsdp/test_flatten_params_wrapper.py @@ -198,7 +198,7 @@ class TestFlattenParams(TestCase): expected, msg=f"{flat_p.shard_metadata()}, {expected}", ) - self.assertEqual(flat_p.num_padded, kwargs["num_padded"]) + self.assertEqual(flat_p._num_padded, kwargs["num_padded"]) _test( kwargs={"start": -1, "end": -1, "num_padded": 0}, diff --git a/test/distributed/fsdp/test_fsdp_state_dict.py b/test/distributed/fsdp/test_fsdp_state_dict.py deleted file mode 100644 index 00776fe87c6..00000000000 --- a/test/distributed/fsdp/test_fsdp_state_dict.py +++ /dev/null @@ -1,138 +0,0 @@ -# Owner(s): ["oncall: distributed"] - -import sys -from typing import Any, Dict - -import torch -from torch import distributed as dist -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import StateDictType -from torch.nn import Linear, Module -from torch.nn.parallel import DistributedDataParallel -from torch.optim import SGD -from torch.testing._internal.common_distributed import skip_if_lt_x_gpu -from torch.testing._internal.common_fsdp import ( - FSDPTest, - get_full_params, -) -from torch.testing._internal.common_utils import ( - instantiate_parametrized_tests, - parametrize, - run_tests, - TEST_WITH_DEV_DBG_ASAN, -) - - -if not dist.is_available(): - print("Distributed not available, skipping tests", 
file=sys.stderr) - sys.exit(0) - -if TEST_WITH_DEV_DBG_ASAN: - print( - "Skip dev-asan as torch + multiprocessing spawn have known issues", - file=sys.stderr, - ) - sys.exit(0) - -INNER_SHAPE = [4, 4] -OUTER_SHAPE = [4, 5] - - -class Model(Module): - def __init__(self, wrap_fsdp): - super().__init__() - self.inner = Linear(*INNER_SHAPE) - if wrap_fsdp: - self.inner = FSDP(self.inner) - self.outer = Linear(*OUTER_SHAPE) - - def forward(self, x): - # Forward twice. - i = self.inner(x) - j = self.inner(x) - return self.outer(i + j) - - -class TestFSDPStateDict(FSDPTest): - @property - def world_size(self): - return 2 - - def _initialize_model(self, wrap_fsdp: bool): - # keep everything deterministic for input data - torch.manual_seed(0) - - model = Model(wrap_fsdp).cuda() - if wrap_fsdp: - model = FSDP(model) - else: - model = DistributedDataParallel(model, device_ids=[self.rank]) - return model - - @staticmethod - def _state_dict(model: Module, state_dict_type: str): - return getattr(model, state_dict_type)() - - @staticmethod - def _load_state_dict( - model: Module, state_dict_type: str, state_dict: Dict[str, Any] - ): - getattr(model, f"load_{state_dict_type}")(state_dict) - - def _dist_train( - self, wrap_fsdp: bool, state_dict_type: str = "", with_context: bool = False - ): - # TODO: Move this test to common_fsdp. - model = self._initialize_model(wrap_fsdp) - optim = SGD(model.parameters(), lr=0.1) - - in_data = torch.rand(64, 4, requires_grad=True, device=torch.device("cuda")) - for _ in range(3): - out = model(in_data) - out.sum().backward() - optim.step() - optim.zero_grad() - - if wrap_fsdp: - blank_model = FSDP(Model(True).cuda()) - if with_context: - state_dict_type = { - "full_state_dict": StateDictType.FULL_STATE_DICT, - "local_state_dict": StateDictType.LOCAL_STATE_DICT, - "sharded_state_dict": StateDictType.SHARDED_STATE_DICT, - }[state_dict_type] - with model.state_dict_type(state_dict_type): - state_dict = model.state_dict() - with blank_model.state_dict_type(state_dict_type): - blank_model.load_state_dict(state_dict) - else: - state_dict = self._state_dict(model, state_dict_type) - self._load_state_dict(blank_model, state_dict_type, state_dict) - get_full_params(blank_model) - model = blank_model - - return list(model.parameters()) - - @skip_if_lt_x_gpu(2) - @parametrize("state_dict_type", ["local_state_dict"]) - def test_state_dict_save_load_flow(self, state_dict_type): - fsdp_params = self._dist_train(wrap_fsdp=True, state_dict_type=state_dict_type) - fsdp_params_using_context = self._dist_train( - wrap_fsdp=True, state_dict_type=state_dict_type, with_context=True - ) - ddp_params = self._dist_train(wrap_fsdp=False) - self.assertEqual(ddp_params, fsdp_params) - self.assertEqual(ddp_params, fsdp_params_using_context) - - @skip_if_lt_x_gpu(2) - @parametrize("state_dict_type", ["local_state_dict"]) - def test_fsdp_state_dict_keys(self, state_dict_type): - state_dict = self._state_dict(self._initialize_model(True), state_dict_type) - if state_dict_type == "local_state_dict": - self.assertEqual(set(["flat_param", "inner.flat_param"]), state_dict.keys()) - - -instantiate_parametrized_tests(TestFSDPStateDict) - -if __name__ == "__main__": - run_tests() diff --git a/test/distributed/fsdp/test_fsdp_summon_full_params.py b/test/distributed/fsdp/test_fsdp_summon_full_params.py index f6ec725b6d6..fbb2ef2bd2e 100644 --- a/test/distributed/fsdp/test_fsdp_summon_full_params.py +++ b/test/distributed/fsdp/test_fsdp_summon_full_params.py @@ -1,8 +1,8 @@ # Owner(s): ["oncall: distributed"] 
import itertools +from copy import deepcopy import math import sys -from copy import deepcopy import torch import torch.nn as nn @@ -35,10 +35,11 @@ if TEST_WITH_DEV_DBG_ASAN: ) sys.exit(0) - def _run_test_summon_full_param_writeback(cls, writeback, cpu_offload, modify_outer): model = FSDP( - nn.Sequential(FSDP(nn.Linear(5, 5, bias=False)), nn.Linear(5, 3, bias=False)) + nn.Sequential( + FSDP(nn.Linear(5, 5, bias=False)), nn.Linear(5, 3, bias=False) + ) ).cuda(cls.rank) # set the value @@ -63,7 +64,6 @@ def _run_test_summon_full_param_writeback(cls, writeback, cpu_offload, modify_ou else: cls.assertEqual(p.cpu()[0], cls.rank + 2) - class TestSummonFullParamsNoShard(FSDPTest): @property def world_size(self): @@ -84,7 +84,6 @@ class TestSummonFullParamsNoShard(FSDPTest): modify_outer, ) - class TestSummonFullParams(FSDPTest): @property def world_size(self): @@ -106,7 +105,10 @@ class TestSummonFullParams(FSDPTest): @parametrize("modify_outer", [True, False]) def test_summon_full_param_writeback(self, writeback, cpu_offload, modify_outer): return _run_test_summon_full_param_writeback( - self, writeback, cpu_offload, modify_outer + self, + writeback, + cpu_offload, + modify_outer ) @skip_if_lt_x_gpu(2) diff --git a/torch/distributed/fsdp/__init__.py b/torch/distributed/fsdp/__init__.py index 7c1f0b388c2..d2c311dce0d 100644 --- a/torch/distributed/fsdp/__init__.py +++ b/torch/distributed/fsdp/__init__.py @@ -1,4 +1,3 @@ from .flatten_params_wrapper import FlatParameter from .fully_sharded_data_parallel import FullyShardedDataParallel from .fully_sharded_data_parallel import CPUOffload -from .fully_sharded_data_parallel import StateDictType diff --git a/torch/distributed/fsdp/flatten_params_wrapper.py b/torch/distributed/fsdp/flatten_params_wrapper.py index 13be7bd74f1..ef3af64870f 100644 --- a/torch/distributed/fsdp/flatten_params_wrapper.py +++ b/torch/distributed/fsdp/flatten_params_wrapper.py @@ -18,60 +18,14 @@ from typing import ( Optional, Sequence, Tuple, - TYPE_CHECKING, - Union, ) import torch import torch.nn as nn from torch import Tensor -from .utils import _replace_by_prefix - -if TYPE_CHECKING: - from collections import OrderedDict # noqa: F401 - ParamOffset = Tuple[int, int] SharedParamInfo = Tuple[str, str, nn.Module, str, nn.Module, str] -FLAT_PARAM = "flat_param" -FPW_MODULE = "_fpw_module" - - -def _post_state_dict_hook( - module: nn.Module, state_dict: "OrderedDict[str, Tensor]", prefix: str, *args: Any -) -> "OrderedDict[str, Tensor]": - """ - _post_state_dict_hook() is called after the state_dict() is executed - and before returning the state_dict to the users. - This API post-processes the keys of the state_dict to remove the - FlattenParamsWrapper internal prefix. - """ - # Move everything from FPW_MODULE up one level. - _replace_by_prefix(state_dict, prefix + f"{FPW_MODULE}.", prefix) - return state_dict - - -def _pre_load_state_dict_hook( - state_dict: Union[Dict[str, Tensor], "OrderedDict[str, Tensor]"], - prefix: str, - *args: Any, -) -> None: - """ - _post_state_dict_hook() is called before the _load_from_state_dict() is - This API pre-processes the keys of the state_dict to add the - FlattenParamsWrapper internal prefix - """ - # Push everything down to FPW_MODULE level. - _replace_by_prefix(state_dict, prefix, prefix + f"{FPW_MODULE}.") - # The flat_param_* keys actually needs to move one level up. 
- flat_param_key = prefix + f"{FPW_MODULE}.{FLAT_PARAM}" - for k in list(state_dict.keys()): - if k.startswith(flat_param_key): - last_part = k.split(".")[-1] - assert last_part.startswith( - FLAT_PARAM - ), f"Expected key to contain flat_param, but key name is {k}" - _replace_by_prefix(state_dict, k, prefix + last_part) class ParamInfo(NamedTuple): @@ -144,13 +98,10 @@ class FlatParameter(nn.Parameter): def __init__(self, params: Sequence[nn.Parameter], requires_grad: bool = True): self._is_sharded = False self._param_numels = [p.numel() for p in params] - # The total element numbers. This is equal to the summation of the - # ``numel()`` of all the parameters. - self.full_numel = sum(self._param_numels) - assert self.numel() <= self.full_numel, ( + assert self.numel() <= sum(self._param_numels), ( "Parameter numbers mismatched. " f"The number of elements in FlatParameter: {self.numel()} vs. " - f"the number of elements in original parameters: {self.full_numel}." + f"the number of elements in original parameters: {sum(self._param_numels)}." ) # The shapes of each individual parameter. self._param_shapes = [p.size() for p in params] @@ -173,7 +124,7 @@ class FlatParameter(nn.Parameter): (0, numel) for numel in self._param_numels ] # The number of padding elements. - self.num_padded = 0 + self._num_padded = 0 def shard_by_offsets(self, start: int, end: int, num_padded: int) -> None: assert self._is_sharded @@ -182,8 +133,8 @@ class FlatParameter(nn.Parameter): f"Shard the flatten parameter with an invalid offset pair {(start, end)}." ) _shard_size = end - start + 1 - self.num_padded = num_padded - if self.num_padded > _shard_size: + self._num_padded = num_padded + if self._num_padded > _shard_size: raise ValueError("The number of padding is larger than the shard size.") self._sharded_param_offsets.clear() @@ -212,13 +163,13 @@ class FlatParameter(nn.Parameter): ) -> Iterator[Tensor]: """Return a generator of views that map to the original parameters.""" # Note, self.data could be sharded, so its numel is <= to the sum. - assert ( - self.data.numel() <= self.full_numel - ), f"Incorrect internal state {self.data.numel()} vs. {self.full_numel}" + assert self.data.numel() <= sum( + self._param_numels + ), f"Incorrect internal state {self.data.numel()} vs. {sum(self._param_numels)}" data = external_data if external_data is not None else self - if data.numel() != self.full_numel: + if data.numel() != sum(self._param_numels): raise ValueError( - f"Incorrect numel of supplied data: got {data.numel()} but expected {self.full_numel}" + f"Incorrect numel of supplied data: got {data.numel()} but expected {sum(self._param_numels)}" ) return ( t.view(s) @@ -301,15 +252,6 @@ class FlattenParamsWrapper(nn.Module): self._orig_flat_param: List[Optional[FlatParameter]] = [None] self._flatten_params() - # Sanity check for the string constants. - assert getattr(self, FPW_MODULE) is self._fpw_module - assert getattr(self, FLAT_PARAM) is self.flat_param - - # Register hook to be called after state_dict() to remove the - # "_fpw_module." prefix and before load_state_dict() to add it back. 
- self._register_state_dict_hook(_post_state_dict_hook) - self._register_load_state_dict_pre_hook(_pre_load_state_dict_hook) - @property def module(self) -> Any: """Support _fsdp_wrapped_module.module in case we are immitating DDP, which has .module diff --git a/torch/distributed/fsdp/fully_sharded_data_parallel.py b/torch/distributed/fsdp/fully_sharded_data_parallel.py index baea02753bc..fe61684b69d 100644 --- a/torch/distributed/fsdp/fully_sharded_data_parallel.py +++ b/torch/distributed/fsdp/fully_sharded_data_parallel.py @@ -9,10 +9,9 @@ from typing import ( Any, Callable, Dict, + Generator, List, Optional, - Generator, - NamedTuple, Set, Tuple, Union, @@ -25,28 +24,17 @@ import torch.nn as nn import torch.nn.functional as F from torch.autograd import Variable from torch.distributed import ProcessGroup -from torch.distributed._sharded_tensor import ( - init_from_local_shards, - Shard, - ShardedTensor, -) from torch.distributed.distributed_c10d import _get_default_group from torch.nn.parameter import Parameter -from .flatten_params_wrapper import FlatParameter, FlattenParamsWrapper, FLAT_PARAM -from .utils import ( - _apply_to_tensors, - _replace_by_prefix, -) +from .flatten_params_wrapper import FlatParameter, FlattenParamsWrapper +from .utils import _apply_to_tensors from .wrap import _recursive_wrap if TYPE_CHECKING: from collections import OrderedDict # noqa: F401 -FSDP_WRAPPED_MODULE = "_fsdp_wrapped_module" - - @dataclass class CPUOffload: """ @@ -110,31 +98,6 @@ class TrainingState_(Enum): SUMMON_FULL_PARAMS = auto() -class StateDictType(Enum): - """ - This enum indicates that which type of ``state_dict`` the FSDP module is - currently processing (returning or loading). - The default value should be FULL_STATE_DICT to comply the PyTorch convention. - ..note:: - FSDP currently supports three types of ``state_dict``: - 1. ``state_dict/load_state_dict`: this pair of APIs return and load - the non-sharded, unflattened parameters. The semantics is the - same as using DDP. - 2. ``local_state_dict/load_local_state``: this pair of APIs return - and load local sharded, flattened parameters. The values returned - by ``local_state_dict`` can be directly used by FSDP and is only - meaningful to FSDP (because parameters are flattened). - 3. ``sharded_state_dict/load_sharded_state_dict``: this pair of APIs - return and load sharded, unflattened parameters. The ``state_dict`` - return by ``sharded_state_dict`` can be used by all other parallel - schemes (resharding may be required). - """ - - FULL_STATE_DICT = auto() - LOCAL_STATE_DICT = auto() - SHARDED_STATE_DICT = auto() - - class FullyShardedDataParallel(nn.Module): """ A wrapper for sharding Module parameters across data parallel workers. This @@ -281,7 +244,6 @@ class FullyShardedDataParallel(nn.Module): self._fsdp_wrapped_module: FlattenParamsWrapper = FlattenParamsWrapper( module, param_list=params ) - assert getattr(self, FSDP_WRAPPED_MODULE) is self._fsdp_wrapped_module del module # free original module in case it helps garbage collection if self._fsdp_wrapped_module.flat_param is not None: self.params = [self._fsdp_wrapped_module.flat_param] @@ -306,29 +268,6 @@ class FullyShardedDataParallel(nn.Module): # Enum to indicate if we're in the forward/backward pass, idle, etc. self.training_state = TrainingState_.IDLE - self._state_dict_type = StateDictType.FULL_STATE_DICT - - # FSDP currently provides three different state_dicts. The actual - # state_dict that will be saved/loaded is decided by - # self._state_dict_type. 
And the main logic of each state_dict is - # implemented in the hook. Therefore, for each hook (post-save and - # pre-load), there is a dispatcher dictionary to dispatch the execution - # flow to the correct implementation. - self._register_state_dict_hook(self._post_state_dict_hook) - self._post_state_dict_hook_fn = { - StateDictType.FULL_STATE_DICT: self._full_post_state_dict_hook, - StateDictType.LOCAL_STATE_DICT: self._local_post_state_dict_hook, - StateDictType.SHARDED_STATE_DICT: self._sharded_post_state_dict_hook, - } - self._register_load_state_dict_pre_hook( - self._pre_load_state_dict_hook, with_module=True - ) - self._pre_load_state_dict_hook_fn = { - StateDictType.FULL_STATE_DICT: self._full_pre_load_state_dict_hook, - StateDictType.LOCAL_STATE_DICT: self._local_pre_load_state_dict_hook, - StateDictType.SHARDED_STATE_DICT: self._sharded_pre_load_state_dict_hook, - } - # Flag to guard against preparing gradients multiple times per backward pass. self._pre_backward_hook_has_run = False # Used for prefetching all gather full params in post backward hook @@ -740,227 +679,6 @@ class FullyShardedDataParallel(nn.Module): else: return False - @contextlib.contextmanager - def state_dict_type(self, state_dict_type: StateDictType) -> Generator: - """ - A context manager to set the state_dict_type of this FSDP module and - its descendant FSDP modules. - .. note:: This API should be called for only the root FSDP module. - .. note:: The default state_dict_type is StateDictTyp.FULL_STATE_DICT. - - Args: - state_dict_type (StateDictType): the desired state_dict_type to set. - """ - self._lazy_init() - if not self._is_root: - raise RuntimeError( - f"state_dict_type context manager can only be called from the root FSDP module. {self._is_root}" - ) - prev_state_dict_type = self._state_dict_type - for module in self.modules(): - if isinstance(module, FullyShardedDataParallel): - if module._state_dict_type != prev_state_dict_type: - raise RuntimeError( - "All FSDP module should the same state_dict_type." - ) - module._state_dict_type = state_dict_type - try: - yield - finally: - for module in self.modules(): - if isinstance(module, FullyShardedDataParallel): - module._state_dict_type = prev_state_dict_type - - def _full_post_state_dict_hook( - self, - state_dict: "OrderedDict[str, torch.Tensor]", - prefix: str, - ) -> "OrderedDict[str, torch.Tensor]": - return state_dict - - def _local_post_state_dict_hook( - self, - state_dict: "OrderedDict[str, torch.Tensor]", - prefix: str, - ) -> "OrderedDict[str, torch.Tensor]": - """ - This hook create a ShardedTensor from the local flat_param and replace - the state_dict[f"{prefix}{FLAT_PARAM}] with the ShardedTensor. No copy - will happen. The underlying storage is the same. - """ - _replace_by_prefix(state_dict, f"{prefix}{FSDP_WRAPPED_MODULE}.", prefix) - # state_dict[f"{prefix}{FLAT_PARAM}"] exists and has the same tensor - # value as the flat_param but it is a pure Tensor because - # nn.Module.state_dict() will detach the parameter. Therefore, we need - # to get flat_param from the FlattenParamsWrapper to get the metadata. - flat_param = getattr(self.module, FLAT_PARAM, None) - assert ( - flat_param is not None - ), "flat_param cannot be None when doing local_state_dict." - - # Construct a ShardedTensor from the flat_param. 
- full_numel = flat_param.full_numel - shard_offset = flat_param.numel() * self.rank - valid_data_size = flat_param.numel() - flat_param.num_padded - if valid_data_size > 0 and flat_param.num_padded > 0: - flat_param = flat_param.narrow(0, 0, valid_data_size) - local_shards = [ - Shard.from_tensor_and_offsets(flat_param, [shard_offset], self.rank) - ] - state_dict[f"{prefix}{FLAT_PARAM}"] = init_from_local_shards( - local_shards, full_numel, process_group=self.process_group - ) # type: ignore[assignment] - - return state_dict - - def _sharded_post_state_dict_hook( - self, - state_dict: "OrderedDict[str, torch.Tensor]", - prefix: str, - ) -> "OrderedDict[str, torch.Tensor]": - raise NotImplementedError("Will be implemented in the next PRs.") - - @staticmethod - def _post_state_dict_hook( - module: nn.Module, - state_dict: "OrderedDict[str, torch.Tensor]", - prefix: str, - *args: Any, - ) -> "OrderedDict[str, torch.Tensor]": - """ - _post_state_dict_hook() is called after the state_dict() of this - FSDP module is executed. ``self._state_dict_type`` is used to decide - what postprocessing will be done. - """ - self = cast(FullyShardedDataParallel, module) - return self._post_state_dict_hook_fn[self._state_dict_type](state_dict, prefix) - - def state_dict(self, destination=None, prefix="", keep_vars=False): - """ - The entry point of all three FSDP state_dict APIs. - ``self._state_dict_type`` decides which code path to execute. - - .. warning:: This needs to be called on all ranks, since synchronization - primitives may be used. - """ - if torch.cuda.is_available(): - torch.cuda.synchronize() - if self._state_dict_type == StateDictType.FULL_STATE_DICT: - return super().state_dict(destination, prefix, keep_vars) - elif self._state_dict_type == StateDictType.LOCAL_STATE_DICT: - assert getattr(self.module, FLAT_PARAM, None) is not None - assert isinstance(self.module.flat_param, FlatParameter) - return super().state_dict(destination, prefix, keep_vars) - elif self._state_dict_type == StateDictType.SHARDED_STATE_DICT: - raise NotImplementedError("Will be implemented in the next PRs.") - else: - raise ValueError(f"Unknown StateDictType {self._state_dict_type}.") - - def local_state_dict(self, *args: Any, **kwargs: Any) -> Any: - """ - Returns the local state of the module. Parameters are flattened and - sharded, so the resulting state_dict can only be loaded after the module - has been wrapped with FSDP. - """ - with self.state_dict_type(StateDictType.LOCAL_STATE_DICT): - return self.state_dict(*args, **kwargs) - - def _full_pre_load_state_dict_hook( - self, - state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], - prefix: str, - ) -> None: - return - - def _local_pre_load_state_dict_hook( - self, - state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], - prefix: str, - ) -> None: - """ - This hook finds the local flat_param for this FSDP module from the - state_dict. The flat_param should be a ShardedTensor. This hook converts - the ShardedTensor to a tensor. No copy happen unless padding is required. - """ - _replace_by_prefix(state_dict, prefix, f"{prefix}{FSDP_WRAPPED_MODULE}.") - key = f"{prefix}{FSDP_WRAPPED_MODULE}.{FLAT_PARAM}" - load_tensor = state_dict[key] - assert isinstance( - load_tensor, ShardedTensor - ), "Tensors in local_state_dict should be ShardedTensor." - - # Convert the ShardedTensor to a Tensor. - shards = load_tensor.local_shards() - assert len(shards), "load_local_state_dict assume one shard per ShardedTensor." 
- load_tensor = cast(torch.Tensor, shards[0].tensor) - - # Get the metada of the flat_param to decide whether to pad the loaded - # tensor. - flat_param = self.module.flat_param - assert flat_param is not None - if flat_param.num_padded not in (0, flat_param.numel()): - assert load_tensor.numel() < flat_param.numel(), ( - f"Local shard size = {flat_param.numel()} and the tensor in " - f"the state_dict is {load_tensor.numel()}." - ) - load_tensor = F.pad(load_tensor, [0, flat_param.num_padded]) - state_dict[key] = load_tensor - - def _sharded_pre_load_state_dict_hook( - self, - state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], - prefix: str, - ) -> None: - raise NotImplementedError("Will be implemented in the next PRs.") - - @staticmethod - def _pre_load_state_dict_hook( - module: nn.Module, - state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], - prefix: str, - *args: Any, - ) -> None: - """ - ``_pre_state_dict_hook` is called before ``self._load_from_state_dict()`` - is called. ``self._state_dict_type`` is used to decide what preprocessing - will be done. - """ - self = cast(FullyShardedDataParallel, module) - self._pre_load_state_dict_hook_fn[self._state_dict_type](state_dict, prefix) - - def load_state_dict( - self, - state_dict: "OrderedDict[str, torch.Tensor]", - strict: bool = True, - ) -> NamedTuple: - """ - The entry point of all three FSDP load_state_dict APIs. - ``self._state_dict_type`` decides which code path to execute. - - .. warning:: This needs to be called on all ranks, since synchronization - primitives may be used. - """ - torch.cuda.synchronize() - if self._state_dict_type == StateDictType.FULL_STATE_DICT: - return super().load_state_dict(state_dict, strict) - elif self._state_dict_type == StateDictType.LOCAL_STATE_DICT: - return super().load_state_dict(state_dict, strict) - elif self._state_dict_type == StateDictType.SHARDED_STATE_DICT: - raise NotImplementedError("Will be implemented in the next PRs.") - else: - raise ValueError(f"Unknown StateDictType {self._state_dict_type}.") - - def load_local_state_dict( - self, - state_dict: "OrderedDict[str, torch.Tensor]", - strict: bool = True, - ) -> NamedTuple: - """ - Load states from a flatten, sharded state dictionary. - """ - with self.state_dict_type(StateDictType.LOCAL_STATE_DICT): - return self.load_state_dict(state_dict, strict) - def forward(self, *args: Any, **kwargs: Any) -> Any: self._lazy_init() @@ -1392,7 +1110,6 @@ class FullyShardedDataParallel(nn.Module): """ Gather all shards of params. """ - self._lazy_init() def update_p_data(output_tensor: torch.Tensor) -> None: """ @@ -1529,7 +1246,8 @@ class FullyShardedDataParallel(nn.Module): until the eventual sync. 
""" self._lazy_init() - assert self._is_root, "`no_sync()` on inner FSDP instances is not supported" + assert self._is_root, \ + "`no_sync()` on inner FSDP instances is not supported" self._assert_state(TrainingState_.IDLE) old_flags = [] for m in self.modules(): @@ -1540,10 +1258,9 @@ class FullyShardedDataParallel(nn.Module): yield finally: for m, old_flag in old_flags: - assert not m._require_backward_grad_sync, ( - "`_require_backward_grad_sync` was incorrectly set to " + assert not m._require_backward_grad_sync, \ + "`_require_backward_grad_sync` was incorrectly set to " \ "`True` while in the `no_sync()` context manager" - ) m._require_backward_grad_sync = old_flag diff --git a/torch/distributed/fsdp/utils.py b/torch/distributed/fsdp/utils.py index 2b64ab9c998..3b54967c5ba 100644 --- a/torch/distributed/fsdp/utils.py +++ b/torch/distributed/fsdp/utils.py @@ -1,9 +1,7 @@ -from typing import Dict, List, Tuple, Union, Any, Callable, Set, TYPE_CHECKING +from typing import Dict, List, Tuple, Union, Any, Callable, Set import torch -if TYPE_CHECKING: - from collections import OrderedDict # noqa: F401 """Useful functions to deal with tensor types with other python container types.""" @@ -24,27 +22,3 @@ def _apply_to_tensors( return x return apply(container) - - -def _replace_by_prefix( - state_dict: Union[Dict[str, torch.Tensor], "OrderedDict[str, torch.Tensor]"], - old_prefix: str, - new_prefix: str, -) -> None: - """ - Replace all keys that match a given old_prefix with a new_prefix (in-place). - - Usage:: - - state_dict = {"layer.xyz": torch.tensor(1)} - replace_by_prefix_(state_dict, "layer.", "module.layer.") - assert state_dict == {"module.layer.xyz": torch.tensor(1)} - """ - if old_prefix == new_prefix: - raise ValueError("old_prefix and new_prefix must be distinct") - for key in list(state_dict.keys()): - if not key.startswith(old_prefix): - continue - new_key = new_prefix + key[len(old_prefix) :] - state_dict[new_key] = state_dict[key] - del state_dict[key] From 24c91e23d3bbb5cd6baa395ee45d88697255dd7f Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Sun, 20 Feb 2022 09:08:13 -0800 Subject: [PATCH 195/199] Fix nasty bug in bisect_percentile_op (#73147) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73147 Code used `reserve` instead of `resize` leading to platform010 test failures: ``` Trying example: test_bisect_percentil_op_large( self=, N=20, lengths=[2, 2], max_value=100, discrete=False, p=0.0, gc=, dc=[], ) stderr: E0219 13:14:52.601948 995877 JustKnobsConfigeratorLoader.cpp:114] Failed to load config justknobs/movefast/knobs after 55000ms timeout E0219 13:14:52.602150 995877 JustKnobsConfigeratorLoader.cpp:114] Failed to load config justknobs/pytorch/compiler after 55000ms timeout test_bisect_percentil_op_large (caffe2.caffe2.python.operator_test.bisect_percentile_op_test.TestBisectPercentileOp) ... third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_vector.h:1045: std::vector::reference std::vector::operator[](std::vector::size_type) [_Tp = int, _Alloc = std::allocator]: Assertion '__n < this->size()' failed. *** Aborted at 1645305292 (Unix time, try 'date -d 1645305292') *** *** Signal 6 (SIGABRT) (0x8556000f3225) received by PID 995877 (pthread TID 0x7f13a79c51c0) (linux TID 995877) (maybe from PID 995877, UID 34134) (code: -6), stack trace: *** W0219 13:14:52.682251 995932 RetryingSender.cpp:433] Failed to make rpc. Sender name: pr-scubasing. 
Reason: apache::thrift::transport::TTransportException: AsyncSocketException: connect failed, type = Socket not open, errno = 111 (Connection refused): Connection refused. @ 000000000000431b folly::symbolizer::(anonymous namespace)::signalHandler(int, siginfo_t*, void*) ./folly/experimental/symbolizer/SignalHandler.cpp:449 @ 0000000000000000 (unknown) @ 000000000009c9f3 __GI___pthread_kill ``` Test Plan: Sandcastle Reviewed By: luciang Differential Revision: D34365188 fbshipit-source-id: 65dcc23226c59096afd5fb3c338c3bd29c936ec3 (cherry picked from commit a1d18e3e6aaea96ba2ac4bb6a95afe45678e0ec7) --- caffe2/operators/bisect_percentile_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/operators/bisect_percentile_op.h b/caffe2/operators/bisect_percentile_op.h index 8dc71795df8..0b5567a776c 100644 --- a/caffe2/operators/bisect_percentile_op.h +++ b/caffe2/operators/bisect_percentile_op.h @@ -44,7 +44,7 @@ class BisectPercentileOp final : public Operator { pct_upper_.size(), "Feature (raw) data and upper bound dimension should match."); n_features = pct_lens_.size(); - index.reserve(n_features + 1); + index.resize(n_features + 1); index[0] = 0; for (int i = 1; i <= n_features; ++i) { index[i] = index[i - 1] + pct_lens_[i - 1]; From 9f541aa3aca768e7fbfa4a9d648b554f22b261f7 Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Sun, 20 Feb 2022 14:32:29 -0800 Subject: [PATCH 196/199] [Profiler] Optimize `reportMemoryUsage` (#71538) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/71538 `reportMemoryUsage` is kind of awful. It does a bunch of string writes and such that makes it VERY expensive. Just moving that work off the hot path reduces the overhead for `profile_memory` from ~6.5 us to ~1.2 us. (85% reduction in the kineto contribution to profiling overhead.) Test Plan: Ran ubenchmark with `--op empty --stressTestKineto --kinetoProfileMemory` Reviewed By: swolchok Differential Revision: D32730167 fbshipit-source-id: fe18e8fa3881967cad8fa1c26c71c805e9b034e5 (cherry picked from commit 0d394cb252e6eac78626b467e0bb497d6d6ae86c) --- torch/csrc/autograd/profiler_kineto.cpp | 69 ++++++++++++++++--------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 206bd52f6d7..cc42e6d67ac 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -155,6 +155,19 @@ struct OpEventData { torch::profiler::impl::CUDAEventStub cuda_event_end_ = nullptr; }; +struct MemoryEventData { + int64_t start_time; + void* ptr; + int64_t alloc_size; + int64_t total_allocated; + int64_t total_reserved; + uint64_t threadID; + torch::profiler::impl::kineto::DeviceAndResource kineto_info; + c10::DeviceType device_type; + c10::DeviceIndex device_index; +}; +static_assert(std::is_pod::value, "Non-POD member of MemoryEventData."); + // Assumption: Total threads number will not exceed 2^16-1, and total ops will // not exceed 2^48 -1. 
static inline uint64_t getForwardThreadKey(uint64_t tid, uint64_t seqNr) { @@ -204,29 +217,16 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { int64_t total_reserved, c10::Device device) override { if (config_.profile_memory && config_.state != ProfilerState::Disabled) { - std::lock_guard guard(state_mutex_); - auto start_time = getTimeUs(); - if (cpu_trace_) { - torch::profiler::impl::kineto::recordThreadInfo(); - cpu_trace_.addMemoryUsageActivity( - kMemoryEventName, - torch::profiler::impl::kineto::kineto_ids(), - start_time, - device, - ptr, - alloc_size, - total_allocated, - total_reserved); - } - - kineto_events_.emplace_back(); - auto& evt = kineto_events_.back(); - evt.name(kMemoryEventName) - .startUs(start_time) - .deviceIndex(device.index()) - .deviceType(device.type()) - .nBytes(alloc_size) - .startThreadId(at::RecordFunction::currentThreadId()); + memory_events_.push_back( + {getTimeUs(), + ptr, + alloc_size, + total_allocated, + total_reserved, + at::RecordFunction::currentThreadId(), + torch::profiler::impl::kineto::kineto_ids(), + device.type(), + device.index()}); } } @@ -264,6 +264,28 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { void materializeOpEvents() { std::lock_guard guard(state_mutex_); + + for (const auto& e : memory_events_) { + cpu_trace_.addMemoryUsageActivity( + kMemoryEventName, + e.kineto_info, + e.start_time, + c10::Device(e.device_type, e.device_index), + e.ptr, + e.alloc_size, + e.total_allocated, + e.total_reserved); + + kineto_events_.emplace_back(); + auto& evt = kineto_events_.back(); + evt.name(kMemoryEventName) + .startUs(e.start_time) + .deviceIndex(e.device_index) + .deviceType(e.device_type) + .nBytes(e.alloc_size) + .startThreadId(e.threadID); + } + for (const auto& e : op_events_) { if (e.end_us_ < e.start_us_) { // We initialize end_us_ to the smallest int64_t, so this means that @@ -585,6 +607,7 @@ struct KinetoThreadLocalState : public ProfilerThreadLocalStateBase { uint64_t start_time_; std::set activities_; std::deque op_events_; + std::deque memory_events_; torch::profiler::impl::kineto::TraceWrapper cpu_trace_; std::vector kineto_events_; // Optional, if event post-processing is enabled. 
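The essence of the `reportMemoryUsage` change above is that the allocator callback no longer formats strings or writes trace activities; it only appends a trivially-copyable record to a deque, and the expensive conversion runs once in `materializeOpEvents()` after profiling stops. A minimal standalone sketch of that pattern, using hypothetical names rather than the real profiler/Kineto API:

```
// Sketch only: record cheap POD events on the hot path, format them later.
#include <cstdint>
#include <deque>
#include <string>
#include <type_traits>
#include <vector>

struct MemoryRecord {            // trivially copyable, cheap to append
  int64_t start_time_us;
  const void* ptr;
  int64_t alloc_size;
  int64_t device_index;
};
static_assert(std::is_trivial<MemoryRecord>::value,
              "keep the hot-path record trivial");

class ProfilerSketch {
 public:
  // Hot path (called from the allocator): O(1), no string work.
  // Locking and config checks are omitted for brevity.
  void reportMemoryUsage(const void* ptr, int64_t size,
                         int64_t device, int64_t now_us) {
    memory_events_.push_back({now_us, ptr, size, device});
  }

  // Cold path (called once when profiling stops): do the expensive work here.
  std::vector<std::string> materialize() const {
    std::vector<std::string> out;
    out.reserve(memory_events_.size());
    for (const auto& e : memory_events_) {
      out.push_back("[memory] t=" + std::to_string(e.start_time_us) +
                    "us size=" + std::to_string(e.alloc_size) +
                    " device=" + std::to_string(e.device_index));
    }
    return out;
  }

 private:
  std::deque<MemoryRecord> memory_events_;
};
```

The `static_assert` mirrors the one added in the patch: keeping the per-event record a plain struct is what makes the hot-path append nearly free.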
From 5dad19fef05519b11a1a1afc8620ecbe9d11d770 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 21 Feb 2022 09:34:27 -0800 Subject: [PATCH 197/199] =?UTF-8?q?Back=20out=20"[pytorch][PR]=20add=20BFl?= =?UTF-8?q?oat16=20sparse=20operators=20on=20CPU:=20copy,=20coalesce,=20sp?= =?UTF-8?q?arse=5Fmask,=20ad=E2=80=A6"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Original commit changeset: f1274125234a Original Phabricator Diff: D34343016 (https://github.com/pytorch/pytorch/commit/c6f56599bb4e8a9d9649e87ef97225f80522ad38) Test Plan: Abovementioned PR regressed OSS CI Reviewed By: atalman Differential Revision: D34379703 fbshipit-source-id: bc624cfd86249dde2fac635d9b66f08f86b4aed9 (cherry picked from commit e52827f1ae09e0c54fd3c7383b5ed49377b6293c) --- aten/src/ATen/native/sparse/SparseTensor.cpp | 2 +- .../ATen/native/sparse/SparseTensorMath.cpp | 4 ++-- test/test_sparse.py | 19 +++++-------------- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 7d428966741..6de64bfbf2c 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -770,7 +770,7 @@ SparseTensor& sparse_mask_out_cpu( // TODO: Re-audit this; it used to be an indexSelect directly into r_values at::index_select_out(r_values, t_view, 0, indices); } else { - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, r_values.scalar_type(), "sparse_mask", [&] { + AT_DISPATCH_ALL_TYPES_AND_COMPLEX(r_values.scalar_type(), "sparse_mask", [&] { sparse_mask_out_cpu_kernel( r_values, t, r_nnz, sparse_dim, mask_indices); }); diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 7db314d3a82..c23486336a1 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -474,7 +474,7 @@ SparseTensor& add_out_sparse_contiguous(SparseTensor& r, const SparseTensor& t, auto r_indices_accessor = r_indices.accessor(); auto src_indices_accessor = src_indices.accessor(); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX( commonDtype, "cadd_sparse", [&] { scalar_t* t_values_ptr = t_values.data_ptr(); scalar_t* s_values_ptr = s_values.data_ptr(); @@ -899,7 +899,7 @@ Tensor& s_addmm_out_sparse_dense_cpu( Tensor indices = sparse_._indices(); Tensor values = sparse_._values(); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, + AT_DISPATCH_ALL_TYPES_AND_COMPLEX( values.scalar_type(), "addmm_sparse_dense", [&] { s_addmm_out_sparse_dense_worker(nnz, dim_i, dim_j, dim_k, r, beta, t, alpha, indices, values, dense); } diff --git a/test/test_sparse.py b/test/test_sparse.py index b0cbd03f15a..0ad1a91b56b 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -21,7 +21,7 @@ from torch.testing import get_all_complex_dtypes, get_all_fp_dtypes from torch.testing._internal.common_cuda import \ (SM53OrLater, SM80OrLater, CUDA11OrLater) from torch.testing._internal.common_device_type import \ - (instantiate_device_type_tests, ops, dtypes, dtypesIfCPU, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride, + (instantiate_device_type_tests, ops, dtypes, dtypesIfCUDA, onlyCPU, onlyCUDA, precisionOverride, deviceCountAtLeast, OpDTypes) from torch.testing._internal.common_methods_invocations import \ (sparse_unary_ufuncs) @@ -189,7 +189,6 @@ class TestSparse(TestCase): @coalescedonoff @dtypes(torch.double, 
torch.cdouble) - @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) def test_coalesce(self, device, dtype, coalesced): def _test_coalesce(t): @@ -664,7 +663,6 @@ class TestSparse(TestCase): @coalescedonoff @dtypes(torch.double, torch.cdouble) - @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) def test_Sparse_to_Sparse_copy_(self, device, dtype, coalesced): # This is for testing torch.copy_(SparseTensor, SparseTensor) sparse_dims = 3 @@ -1242,8 +1240,6 @@ class TestSparse(TestCase): @coalescedonoff @dtypes(torch.double, torch.cdouble) - @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) - @precisionOverride({torch.bfloat16: 1e-1}) def test_sparse_addmm(self, device, dtype, coalesced): def test_shape(m, n, p, nnz, broadcast, alpha_beta=None): if alpha_beta is None: @@ -1265,8 +1261,7 @@ class TestSparse(TestCase): def fn(S, D1, D2, beta=beta, alpha=alpha): return torch.sparse.addmm(D1, S, D2, beta=beta, alpha=alpha) - if dtype == torch.double or dtype == torch.cdouble: - gradcheck(fn, (S, D1, D2), check_sparse_nnz=True) + gradcheck(fn, (S, D1, D2), check_sparse_nnz=True) test_shape(7, 8, 9, 20, False, None) test_shape(7, 8, 9, 20, True, None) @@ -1406,17 +1401,15 @@ class TestSparse(TestCase): _test_spadd() _test_spadd_hybrid() + @onlyCUDA @coalescedonoff @dtypes(torch.double, torch.cdouble) def test_sparse_add_out_bfloat16(self, device, dtype, coalesced): # fp32 x, _, _ = self._gen_sparse(3, 5, 10, dtype, device, coalesced) y, _, _ = self._gen_sparse(3, 5, 10, dtype, device, coalesced) - x = x.float() - y = y.float() - if device == 'cuda': - x = x.cuda() - y = y.cuda() + x = x.float().cuda() + y = y.float().cuda() res_fp32 = torch.add(x, y) # bfloat16 @@ -1635,7 +1628,6 @@ class TestSparse(TestCase): _test_basic_ops_hybrid() @dtypes(torch.double, torch.cdouble) - @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) def test_add_dense_sparse_mismatch(self, device, dtype): def test_shape(dense_size, sparse_dims_shape, dense_dims_shape, sparse_size): x = torch.zeros(dense_size, dtype=dtype, device=device) @@ -1674,7 +1666,6 @@ class TestSparse(TestCase): @coalescedonoff @dtypes(torch.double, torch.cdouble) - @dtypesIfCPU(torch.double, torch.cdouble, torch.bfloat16) def test_sparse_mask(self, device, dtype, coalesced): def _test_sparse_mask_fixed(): i = self.index_tensor([ From c2255c36ec121fdb998ce3db8deb7508c814b567 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Mon, 21 Feb 2022 13:49:19 -0800 Subject: [PATCH 198/199] Fix binary search in bisect_percentile_op (#73146) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/73146 Binary search can overflow; this fixes it. 
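The overflow happens because `lo + hi` is evaluated in `int` arithmetic: once the two indices are large enough that their sum exceeds `INT_MAX`, the shift operates on an overflowed (undefined) value and the search can index out of range. A generic illustration of the two midpoint forms (the function names are made up for illustration; this is not the operator code itself):

```
// Sketch only: the classic binary-search midpoint pitfall.
int midpoint_unsafe(int lo, int hi) {
  return (lo + hi) >> 1;      // lo + hi can exceed INT_MAX: signed overflow (UB)
}

int midpoint_safe(int lo, int hi) {
  return lo + (hi - lo) / 2;  // hi - lo never overflows for valid 0 <= lo <= hi
}

// Example: lo = 1'500'000'000, hi = 2'000'000'000 (both representable as int).
// midpoint_unsafe overflows; midpoint_safe returns 1'750'000'000.
```

The hunk below applies the same subtract-first form, `lo + (hi - lo) / 2`.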
Test Plan: Sandcastle Reviewed By: meyering Differential Revision: D34365186 fbshipit-source-id: f92a810b49ef5ce345d0b019b584fe3c1f5ae017 (cherry picked from commit 9c2133ec6f3ae9874fd6ae2ffb2a98bb7b68a6b0) --- caffe2/operators/bisect_percentile_op.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/caffe2/operators/bisect_percentile_op.h b/caffe2/operators/bisect_percentile_op.h index 0b5567a776c..2c2122c884c 100644 --- a/caffe2/operators/bisect_percentile_op.h +++ b/caffe2/operators/bisect_percentile_op.h @@ -115,13 +115,10 @@ class BisectPercentileOp final : public Operator { int lo, int hi, float val) { - int mid; - bool low_cond, high_cond; - while (lo < hi) { - mid = (lo + hi) >> 1; - low_cond = (data[mid] <= val); - high_cond = (val < data[mid + 1]); + const auto mid = lo + (hi - lo) / 2; + const bool low_cond = (data[mid] <= val); + const bool high_cond = (val < data[mid + 1]); if (low_cond && high_cond) { return mid; } else if (!low_cond) { From 99bcadced431cbe040653f6217a827ba97b11065 Mon Sep 17 00:00:00 2001 From: Linbin Yu Date: Tue, 22 Feb 2022 08:05:33 +0000 Subject: [PATCH 199/199] improve android instrumentation test and update README Added tests for lite interpreter. By default the run_test.sh will use lite interpreter, unless manually set BUILD_LITE_INTERPRETER=0 Also fixed model generation script for android instrumentation test and README. Verified test can pass for both full jit and lite interpreter. Also tested on emulator and real device using different abis. Lite interpreter ``` ./scripts/build_pytorch_android.sh x86 ./android/run_tests.sh ``` Full JIT ``` BUILD_LITE_INTERPRETER=0 ./scripts/build_pytorch_android.sh x86 BUILD_LITE_INTERPRETER=0 ./android/run_tests.sh ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/72736 --- android/README.md | 24 +++++++-- android/pytorch_android/build.gradle | 12 ++++- .../generate_test_torchscripts.py | 5 +- android/pytorch_android/host/build.gradle | 1 + .../src/androidTest/assets/test.pt | Bin 2365 -> 9658 bytes .../java/org/pytorch/PytorchHostTests.java | 6 ++- .../org/pytorch/PytorchInstrumentedTests.java | 7 ++- .../pytorch/PytorchLiteInstrumentedTests.java | 46 ++++++++++++++++++ .../java/org/pytorch/PytorchTestBase.java | 40 +++++++-------- .../PytorchLiteInstrumentedTestSuite.java | 9 ++++ 10 files changed, 122 insertions(+), 28 deletions(-) create mode 100644 android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java create mode 100644 android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java diff --git a/android/README.md b/android/README.md index 002409c5234..d1d6bcd6aa3 100644 --- a/android/README.md +++ b/android/README.md @@ -14,9 +14,16 @@ repositories { jcenter() } +# lite interpreter build dependencies { - implementation 'org.pytorch:pytorch_android:1.6.0' - implementation 'org.pytorch:pytorch_android_torchvision:1.6.0' + implementation 'org.pytorch:pytorch_android_lite:1.10.0' + implementation 'org.pytorch:pytorch_android_torchvision_lite:1.10.0' +} + +# full jit build +dependencies { + implementation 'org.pytorch:pytorch_android:1.10.0' + implementation 'org.pytorch:pytorch_android_torchvision:1.10.0' } ``` @@ -32,6 +39,15 @@ repositories { } } +# lite interpreter build +dependencies { + ... + implementation 'org.pytorch:pytorch_android_lite:1.12.0-SNAPSHOT' + implementation 'org.pytorch:pytorch_android_torchvision_lite:1.12.0-SNAPSHOT' + ... +} + +# full jit build dependencies { ... 
implementation 'org.pytorch:pytorch_android:1.12.0-SNAPSHOT' @@ -68,7 +84,7 @@ They are specified as environment variables: `ANDROID_HOME` - path to [Android SDK](https://developer.android.com/studio/command-line/sdkmanager.html) -`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk) +`ANDROID_NDK` - path to [Android NDK](https://developer.android.com/studio/projects/install-ndk). It's recommended to use NDK 21.x. `GRADLE_HOME` - path to [gradle](https://gradle.org/releases/) @@ -133,7 +149,7 @@ android { } dependencies { - extractForNativeBuild('org.pytorch:pytorch_android:1.6.0') + extractForNativeBuild('org.pytorch:pytorch_android:1.10.0') } task extractAARForNativeBuild { diff --git a/android/pytorch_android/build.gradle b/android/pytorch_android/build.gradle index a65c0ffd436..d10f6a30508 100644 --- a/android/pytorch_android/build.gradle +++ b/android/pytorch_android/build.gradle @@ -50,7 +50,17 @@ android { } androidTest { java { - exclude 'org/pytorch/PytorchHostTests.java' + if(System.env.BUILD_LITE_INTERPRETER == '0') { + println 'Build test for full jit (pytorch_jni)' + exclude 'org/pytorch/PytorchHostTests.java' + exclude 'org/pytorch/PytorchLiteInstrumentedTests.java' + exclude 'org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java' + } else { + println 'Build test for lite interpreter (pytorch_jni_lite)' + exclude 'org/pytorch/PytorchHostTests.java' + exclude 'org/pytorch/PytorchInstrumentedTests.java' + exclude 'org/pytorch/suite/PytorchInstrumentedTestSuite.java' + } } } } diff --git a/android/pytorch_android/generate_test_torchscripts.py b/android/pytorch_android/generate_test_torchscripts.py index 8b41fefc246..909f824fb26 100644 --- a/android/pytorch_android/generate_test_torchscripts.py +++ b/android/pytorch_android/generate_test_torchscripts.py @@ -1,4 +1,6 @@ import torch +from torch import Tensor +from typing import Dict, List, Tuple, Optional OUTPUT_DIR = "src/androidTest/assets/" @@ -7,7 +9,8 @@ def scriptAndSave(module, fileName): script_module = torch.jit.script(module) print(script_module.graph) outputFileName = OUTPUT_DIR + fileName - script_module.save(outputFileName) + # note that the lite interpreter model can also be used in full JIT + script_module._save_for_lite_interpreter(outputFileName) print("Saved to " + outputFileName) print('=' * 80) diff --git a/android/pytorch_android/host/build.gradle b/android/pytorch_android/host/build.gradle index 0f795f08657..088d1b5ca42 100644 --- a/android/pytorch_android/host/build.gradle +++ b/android/pytorch_android/host/build.gradle @@ -25,6 +25,7 @@ sourceSets { java { srcDir '../src/androidTest/java' exclude '**/PytorchInstrumented*' + exclude '**/PytorchLiteInstrumented*' } resources.srcDirs = ["../src/androidTest/assets"] } diff --git a/android/pytorch_android/src/androidTest/assets/test.pt b/android/pytorch_android/src/androidTest/assets/test.pt index 375ade9bc913242e622eb8014b30aec2429d8bfe..016b6d666a2a02ca386861b17e5c8be1957a7fc8 100644 GIT binary patch literal 9658 zcmcIq2{=@1A0K3!NK%L_)hJv`NFgeFW9&vvNz-7)nT(k^GiPiqx5NzXk+jiGDXFfy zrA291SEZXu%1!&?D%b6zefi$A7$ej2eNW$c9%jya-sSiH{`-3-oTk=~MAFhC^|^FO zLr4f)j5sE+5SD{5i5K9bPnz-hwNN#l$wUOAc!e ztK={Y!99~JW^yDv9y5*=pOnfHB`_g@APH6tmBhhn4&gLStq7Bc`)Hsi`WFtM4F=Hq zVlyN;;n3m*32aCC`VPYM03Z9l7(d6Xc)w>_OxT}e1~0fq;s153)y{f0^q3(%p&Wv(4`ny}Ul`_$7nZx{1TH z;DCPAoh3eT%R;ZrW0mNo4|eqlTD7j`xqjUhwWqnogK5JaQ!_25yzcZlJE+3#=gb!y zgKIXePWW|nYbDKO%PN;4lb(1Mj-=*88pD<>6=uIEzA;wH)f(b5@OO%l`P+Z%CSJaj 
z^NX#K&U&l4uDee^ySI6dR{Yr2js12Xm-T-)yXxZD0Gp5_X*K#rHW8x@Dt^^@SM9U? z;w!55!93^WAnhBGA43j~a9}bwR4i#UTl4Nt^SY|Zbs7(51-Zl5rDIr({q+TDBTtQC z3UahAnNF^s?scbzOYx<@^Vxi2@yFtXvvwDsIdu6x{Aa!PfHuzo4eZmoZ@rln>)wXs zZ5pukS#-w~M^oP^_Bv4$+)k2tN5!iAjrG|Yum4_ivnb5A{)u>t`NaHs7CGN;F|Rm- zBN=Qn$ zFR7S0Imp-bv}X9xs(E{S`2CH>-4Eh7TANfJf8!ey88`pb+2|Q#(`v}o5;EJI-%;Aq z89(pl&D^ehE8G0=594OkZFxLxvS`CuzgDj!xg)6K`#YU|GP~g2?6BziTe6d~ZK3n- za*A7MsUzF7L*5xDYiZsoamc?;_qlQTt>c}d_}`|E&Uf2=IX&htVWUi#wX%9;wEk-~XVpIvezO|+&!nj`uc7`DXXO0c zz`J%6*plSkvF#TYg`DZ)lUMZLJG0-zbFyR2rt=$zvd-SUGH(8OlLLXe?_-jOXna&d zJxJv&t0E#F}jTG zY&*y3LRIN3VMz6GO4{qmrL8S9o?X41viCV=Y93!7&{#Hwj23ahd zO-grq5fb6eoHypeQpaaHqie@z#5?G^&Gz6-G+S4_=r7Ic z($#0YKfJ!8AsEE-5jby~O&XplJnM14qRKk%m>VoBoWVWswEyFU0~VWp>u`5@`eb}3 z=k`1YGbDTQs{-9uoLXo9RH>`BxnIQqjgx_8{u1%o^2PRhby)E`uVNuv$p3uZnoMS+|%N^P***xwroYfZjChK6w6JET8)>I-%!sT+s_ zRZaW|Zeyd~C!Lu?QzDtqs&2jTZ8+K~^gN%NNDgJrJ{sy_>bNq3vTL8g^0sAn#$6Th zZbsJ3Sde*f%hkf-b))wgntJ)B*ClPs(rn>|FWjNu8tz`XR-fGeMsB-)U zuNYQu+vwz4k(;?8cb3a-tz82*gjY7^dN(!<{d1Su^^{EZtjs+Z;QaT;4wz5X%Qh%3 z(e$FaUn`YnOe-ulcKzveeZ(I@v11*B?wGbzo{ZhAzGY~`c8^`^8K(krN;2}YYf97` zqVhcJO0Fi3U#iIn_O1-ft4MG#yly&oSDpWjmy4fqN7_k>PW>BXv3^a#b<3KV(j99| zKU$6ceMi^LB1aurh)cQAPVtdXWn*FS$IU5kyB@47NP4?reD)xTZblT#?77dnC9PL8 zl5C6g+_Si4YxkC|T3C}}c+F;W9&ha<1L}Z=rG9mSb!jF1y&s}Num3x??Ws0riRto< zW!yDok6S`yUBjAm=r1&qJghEwH;pRK_s>3TH!BVvRbE0dwMzHNOQfVT^XBW0$j%LT zcrkaIFxz0a%_F`|O?|BKg$)lWs}IGdjS(-to|||(h(G1Ts;v~O@*wL&u{T}E6(pA) zY`gT=_6^?dv6***Z<5hEliPgOHhAOQb8%k#=p}(2 zm$xl!EE(TlrzOy>e}hiQVYjIL9B*k@$^MAS-;B4-Rln>#BtK!%^Zl!+HBBR^YqPRM z?wSo`!&j#QvW}GT*Cdoug7TuTHqG6zZ%b3>k;iI9hIh$7KR+_*!7+=}jscq=Rkani zZ@smqir+co{w(Ven`rNR_RknT+exoM#c+ry++*UATH!9qZM}bs5a(B=i&vXp>&V;O z`N5IzFzn5q^u*OiQ@5s$yM1bO)`~eP`86XJ3(bqnwrJH32|3*yxM5jQL)AssH>Odwux*Pp^YYs2 z8=7>-eef)eYIkJh46UbIGp1|22+DVr-Ee3gZuCo&O|8Qyhe!HPwr2R|jbBgSU2R`$ zJ}Ty(epdg8w!{}>8V(8d%N;51mbKp00n{YfdQWng<;-QRF}I%PQ10$e(=iW(e7hD? z0!u63)SXK;Snn|7A}#QEaZUO9O)1s+iPi!eQ@F{2^xzM>@}4K`E37FfyOZ}wjk;j%tWU#h zf!!rJt%$DnKcebEts6X*ili2_ZrwX&(Bo@quREtjc0e0#JCWCGqKf>wLYJ1F+5gY5 z)6r#j*b&3e$4K)H(w^3QQbEJvlO}KBCJhoPWe^??mFN0{NR;ky$QL6lJ|e~we#>tL zz=f(dSmK`Ss67o75+w8wP282Qqfhd}*NIC<*l6;OFXR5rWfW8PUf&n0TGKQcL&2&& zWdK@+IK&J12scq8kceTDHA4q<;DW7*B=HDWz!$?R)(pT3DT*Ez=^qNK(o|@wGKM;6 z@rw)#kA&4g14+g}cM}pe)gSIhQ$_Dp!F^abtWE<3ii`oW3{s>&9oE2Z5h+6(Gzf%j z5epPVVpx+g5Ztp6Hs8ZT0gV;|w31+Lnu;tN9|9iYq{Czy3ACXwkaQti4EMK@GIYV6 zh@Hq4BWw{<%*|lKIy6-&f-S}pB}$UnKwPjchLz7sX2S!puVOZj1M5k#@91RkK*j(t zQ*V}FeSH1|HisqQAx$vw7lRmKcEztsxa_2$uqca}N=eqhth! 
zdHeaoMi|WyPCnF4I;06{LE1oi*;0s1Qw9G4X%I#cqzh2#p{Vq6(BKx`PB%!Y6jF+N z^!MU+Nq~*Nh#dMJylOuJ?+~SUQ=Ajvp>lNnWW9L>^r`_JCfLMO4zmT=ga!zffye#2 z6+S2cHpLn-T|#9655qedVO9d>z>EbohoF#f*bEyA$@`$TLiB-5Xh3c#6k-wVu1A!Z zoysi+-GMnKTSCM@U}S5c1wc{au`>sUIb6hnDdff{@z~@z3740^6-RA?&b-SAMPTT3*g2&8NMbu}SOAr&5>pgW)uH77*SOOHo_9vDk;0s#-E z;93V3Q>tc4#Kwkw(1BKRv8jG8wjQp4mXpEX*R&lIE0-^nAn+&@I{`>h?}?@_*()3N zNeQ^>^U&k4Rh!k=D^#8#LA|8R$t}jla3B!pU&vhT5GPF`Z5|IL)htuqX9C5 zo%^C8nlLm*%D5o{w9AhK^sP{-L~S?%5qh)>k5z(04NIfCBV%u07W%DFh`BI%`XP4< z*hzD!fX}9*S&l1C9LAE8tPn`rA0C7DhQ}$vH>?{U8qm5k4O2W)65-qo??Uu6!JiTP zB4TRKzA$WfJY&P~cxVifr-af-zzO57TMZ3CeIpRI+rCPPrGsHYG>K*LV!(D{VK;>> zkLaryXf#Blim!k_N5aR_U)WvA-i>-~jL5Qx2#9<+fpH5o|Qz{X`D(xb_7 zPey@}*DH^J2o8yyfZ&i|w0BNGtk@4v6wHENeEReWo!x|{iDwSPTG*We(`fFEGtFc3 z;R#swiol*A%Vy}yA2FGUY=n!jlbKA|3-Scuff&fSIbh~)acF*791Jp;C>YQggNFB5 zcw%28RSc(Jk%b>uESJTk@0SIZNf8A7uOj$djj=@Ra}i(*{kI~3CICiNWKa*WIl13Q zEc6W~Day4P1cN9^awHoqLFf`8j}1@$U%Av)P-lW6JOfs)-0HZ-n+3g=YbTM=655I0 zF~A8li5?vu?9cQK3yq>jM*GrXZ+v8r;`U|@uj;Upk(H1D`(Pde=q4Gcr~;SjUr|!j z5HKahoP%#49c7?-Cq`F{yc>}t_$hc2C7kowss71ABs~gZ3E9)Q2t);z1p6!D9IxHF z9Vk&BpvhA(HHxmlMgZ6q&=`VF!UGW>>bPs|Vagh_KitMq8-@cxZ2-)>$MS(fz*F$L zKAy#6iC`-F9IgHFQGi#pgbl*S^|XUpnH`V7G{Cl+45aJiT;QNlj{y0Cp!CIB@(Tf} zJ7!`r$Pz5xyXJNf5Z)W0rHZjo6#Qih31uYQaSs^^yJDiRRhe*zybAulNJ@ka-~i>- ziK6#X5v@23b(5w75r*))3)D`pcb* zn8%H0!;#Qnn!1c(Aa4e?3C{PrNsq!^4|=ADyCf!)FF=6x((wmq(IXUblRZ2lSzIyO zKP{e(c?&oidrl}0G#;V?#ZvHx*s;M>u(k^3zpSyE+FLAUi_lEbUnCNUJS^d9xUz-V z0k!PWxG%amMfD=`Zp^1CIoCtb@Y^>YGrsI52z5eJNT6HOZ@q_aewnz*l8sMXjpP9Rm(Nq5V;>2k5Y#RLw zr>USH^566*{-g%gyWSV-7v#V2!H4$z=0@>dI$G)VTpPhxY-pcf;Cw!Z;tL4No(aIW zW^lT$e2$)X@Skr+@ihZ@qoHs;2bM_8rtk{m^YMyoqh&$QMF;5sHeT`7#OK2mc}CNR zo@+Tckiv!&0t%;TXxI^t&DHGX9nqH#=+*rxMNiqmbrn6(0v8yEo~x6v!*lS7J`Sg; NW8F2tD6~J`_dk)nqyYc` literal 2365 zcmWIWW@cev;NW1u06Yw03?-?>CHiHlMa7x3cB-euz_(^HWmw;Wp?MRJu8F zBfEKR=;^N81|oOQhi6DOTi=*m>98Pn%jL#}`Y&4Yd<3!<`NWE7PJCka_WE~^O`^vu zEi^Y9Ond)0W^VD$>sg-lg}Gj;D<4kVqxSP?h(Yp+8Pe82mZVL6aPmcyfY)@N&8*uy zmWebTWeHmysp7 zJ564(Y!P_Qc=)5jvzV9nea@e2`m;vmK2u4X6o1g3_|$U+yK3U6i5Z`E*_df`)?ljW zvW;6*g`dXTNn4yzS#rMf_?{=L8{54+r?uFmJkwklenReb*YdMhIUXMRFn0z!oA$ZL zX7A4u#x3*X-afOcUYropz?c1Rk;ygPo^r2$Yo1&Yd@cS+c@o!lb2V$}BUgO>sLyhJ zVY;a!?bA&DCa$K_3eG~V5$5GOFTO6>)+qa67DxYEm$>U~X2)NA`4!_LtaI+7THkxV z&%ZD2k@)Oe>;Lze%`0>F70+{Hzi&7neaYq7&%|dp#Qm>6T3z0|dg%l8qW;zA)I~2| zyH~L+YC+w$ix0oq$mU7QG6i@f8uQd{^*CvjxjBoCYu%D%%U^wDIqF_~D<$9e%fJ0e zJ6M;h{yX4%G3`ifsLmn9wv4J%QTHR~eaY*7{wzeCE6Vi!TE~5>Uq@f>|NkQEMe^tV zoWG}=lIQQf``T1$(!+yoS?g!!J>PW0yke8B@7YD>@2hP8EL?W@BR?n^@BX?_pahuR z+&S?jV+95~9LZQOB{iuuJszA1t&kHmzRZS2ZtK+Z*}{$@#}|8=-QK!wR@I6H7K;+R zvOA(Shy@>!Z?(GRq`N5~tMlZsyA$5Ldvos6vLB1ZV?~cPKd@ggohz>D%d>-v#2p1+ zIevNM`%ymNp!w#ve$vW!8{R2?ero;R_ulUhWo326Y{#!i*r+T!UT{Wzwn22>-1Wz= z8g3A@QD5;?dZ(S|^0$oh3ing{ z7YCj)0fa6S?g-kilY^ltndiPhHBA z<9X%$13iNJA20G(i8;)_@Zb8v*h6OXn142(w3=?;`n&t2!Zu~WZFBuv-tno)7u?jn zb6Dxyih_Rb@4im4U0mz3c6?cq8tNaJcGS4L=cF6UdB;;vQcui$q%bpz`G{+!$-{~! 
zK_Bi}t(TjarDtv`_!e#8ow04Ac8-P$Ye;*oQnI|hQ?b0zuAb)~N+-zA9rA|(`vO4AU5&R_t24D9O>+b1UP@!1)?K%-(;F4GM3#!aL!# z4_n@JmQR|~cQ3hcSNYDIg5RyydUqCye5%%8d3x;!X~UPS>&?z(&-?IZl~|`jwX>ze zJR`{}o&2-AUsstbOzC!+eDvG1|4(K-+*q5l-(4}9;l%|1TOXDG$n5&7^78EuxqbYf zj3+l%E3g(#Jng*l!dvlo4F&y6ol4FJy}G;N#hR(Rxjyvn2-MqIIz4*xP2uZeMt7#} zuF@{4KB+GILu(&1>pqW{vOmjV9#deL$Dnse@N4=g<=xFe~Hbywg^fs@>ATPyeO zeI4Ff;&fMZe$HW0yVusKb5BZ|ooY;Z*C2Ckug>cGi?8|KJhIhh>%lop)piRH&R8e; zAno@5+G*RFmv-BEKED1#_qWB#Dk=M8{tk7?Mg^O5S5MJ+dFme9?(0jy}WJ0nD33eXe~0O|v?ZbO>|An5>a PRyL3vD-eRzL(~EQF6Dum diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java index bc406dc9ae7..afdde74c5bd 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java @@ -10,7 +10,11 @@ import java.util.Objects; public class PytorchHostTests extends PytorchTestBase { @Override - protected String assetFilePath(String assetName) throws IOException { + protected Module loadModel(String path) throws IOException { + return Module.load(assetFilePath(path)); + } + + private String assetFilePath(String assetName) throws IOException { Path tempFile = Files.createTempFile("test", ".pt"); try (InputStream resource = Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream("test.pt"))) { diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java index bae01e39402..45084a69bb2 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java @@ -14,7 +14,11 @@ import org.junit.runner.RunWith; public class PytorchInstrumentedTests extends PytorchTestBase { @Override - protected String assetFilePath(String assetName) throws IOException { + protected Module loadModel(String path) throws IOException { + return Module.load(assetFilePath(path)); + } + + private String assetFilePath(String assetName) throws IOException { final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); File file = new File(appContext.getFilesDir(), assetName); if (file.exists() && file.length() > 0) { @@ -35,4 +39,5 @@ public class PytorchInstrumentedTests extends PytorchTestBase { throw e; } } + } diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java new file mode 100644 index 00000000000..7e3dff3e771 --- /dev/null +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchLiteInstrumentedTests.java @@ -0,0 +1,46 @@ +package org.pytorch; + +import android.content.Context; + +import androidx.test.InstrumentationRegistry; +import androidx.test.runner.AndroidJUnit4; + +import org.junit.runner.RunWith; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +@RunWith(AndroidJUnit4.class) +public class PytorchLiteInstrumentedTests extends PytorchTestBase { + + @Override + protected Module loadModel(String path) throws IOException { + return LiteModuleLoader.load(assetFilePath(path)); + } + + private String assetFilePath(String assetName) throws 
IOException { + final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); + File file = new File(appContext.getFilesDir(), assetName); + if (file.exists() && file.length() > 0) { + return file.getAbsolutePath(); + } + + try (InputStream is = appContext.getAssets().open(assetName)) { + try (OutputStream os = new FileOutputStream(file)) { + byte[] buffer = new byte[4 * 1024]; + int read; + while ((read = is.read(buffer)) != -1) { + os.write(buffer, 0, read); + } + os.flush(); + } + return file.getAbsolutePath(); + } catch (IOException e) { + throw e; + } + } + +} diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java index 2817ae1bbd0..5a1405e679b 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java @@ -16,7 +16,7 @@ public abstract class PytorchTestBase { @Test public void testForwardNull() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue input = IValue.from(Tensor.fromBlob(Tensor.allocateByteBuffer(1), new long[] {1})); assertTrue(input.isTensor()); final IValue output = module.forward(input); @@ -25,7 +25,7 @@ public abstract class PytorchTestBase { @Test public void testEqBool() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); for (boolean value : new boolean[] {false, true}) { final IValue input = IValue.from(value); assertTrue(input.isBool()); @@ -38,7 +38,7 @@ public abstract class PytorchTestBase { @Test public void testEqInt() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); for (long value : new long[] {Long.MIN_VALUE, -1024, -1, 0, 1, 1024, Long.MAX_VALUE}) { final IValue input = IValue.from(value); assertTrue(input.isLong()); @@ -51,7 +51,7 @@ public abstract class PytorchTestBase { @Test public void testEqFloat() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); double[] values = new double[] { -Double.MAX_VALUE, @@ -86,7 +86,7 @@ public abstract class PytorchTestBase { } final Tensor inputTensor = Tensor.fromBlob(inputTensorData, inputTensorShape); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue input = IValue.from(inputTensor); assertTrue(input.isTensor()); assertTrue(inputTensor == input.toTensor()); @@ -103,7 +103,7 @@ public abstract class PytorchTestBase { @Test public void testEqDictIntKeyIntValue() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final Map inputMap = new HashMap<>(); inputMap.put(Long.MIN_VALUE, IValue.from(-Long.MIN_VALUE)); @@ -127,7 +127,7 @@ public abstract class PytorchTestBase { @Test public void testEqDictStrKeyIntValue() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final Map inputMap = new HashMap<>(); inputMap.put("long_min_value", 
IValue.from(Long.MIN_VALUE)); @@ -151,7 +151,7 @@ public abstract class PytorchTestBase { @Test public void testListIntSumReturnTuple() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); for (int n : new int[] {0, 1, 128}) { long[] a = new long[n]; @@ -178,7 +178,7 @@ public abstract class PytorchTestBase { @Test public void testOptionalIntIsNone() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); assertFalse(module.runMethod("optionalIntIsNone", IValue.from(1l)).toBool()); assertTrue(module.runMethod("optionalIntIsNone", IValue.optionalNull()).toBool()); @@ -186,7 +186,7 @@ public abstract class PytorchTestBase { @Test public void testIntEq0None() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); assertTrue(module.runMethod("intEq0None", IValue.from(0l)).isNull()); assertTrue(module.runMethod("intEq0None", IValue.from(1l)).toLong() == 1l); @@ -194,7 +194,7 @@ public abstract class PytorchTestBase { @Test(expected = IllegalArgumentException.class) public void testRunUndefinedMethod() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); module.runMethod("test_undefined_method_throws_exception"); } @@ -241,7 +241,7 @@ public abstract class PytorchTestBase { @Test public void testEqString() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); String[] values = new String[] { "smoketest", @@ -260,7 +260,7 @@ public abstract class PytorchTestBase { @Test public void testStr3Concat() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); String[] values = new String[] { "smoketest", @@ -281,7 +281,7 @@ public abstract class PytorchTestBase { @Test public void testEmptyShape() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final long someNumber = 43; final IValue input = IValue.from(Tensor.fromBlob(new long[] {someNumber}, new long[] {})); final IValue output = module.runMethod("newEmptyShapeWithItem", input); @@ -293,7 +293,7 @@ public abstract class PytorchTestBase { @Test public void testAliasWithOffset() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue output = module.runMethod("testAliasWithOffset"); assertTrue(output.isTensorList()); Tensor[] tensors = output.toTensorList(); @@ -303,7 +303,7 @@ public abstract class PytorchTestBase { @Test public void testNonContiguous() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue output = module.runMethod("testNonContiguous"); assertTrue(output.isTensor()); Tensor value = output.toTensor(); @@ -316,7 +316,7 @@ public abstract class PytorchTestBase { long[] inputShape = new long[] {1, 3, 2, 2}; long[] data = new long[] {1, 11, 101, 2, 12, 102, 3, 13, 103, 4, 14, 104}; Tensor inputNHWC = 
Tensor.fromBlob(data, inputShape, MemoryFormat.CHANNELS_LAST); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue outputNCHW = module.runMethod("contiguous", IValue.from(inputNHWC)); assertIValueTensor( outputNCHW, @@ -334,7 +334,7 @@ public abstract class PytorchTestBase { long[] dataNHWDC = new long[] {1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16}; Tensor inputNHWDC = Tensor.fromBlob(dataNHWDC, shape, MemoryFormat.CHANNELS_LAST_3D); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue outputNCHWD = module.runMethod("contiguous", IValue.from(inputNHWDC)); assertIValueTensor(outputNCHWD, MemoryFormat.CONTIGUOUS, shape, dataNCHWD); @@ -358,7 +358,7 @@ public abstract class PytorchTestBase { long[] dataWeightOHWI = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1}; Tensor wNHWC = Tensor.fromBlob(dataWeightOHWI, weightShape, MemoryFormat.CHANNELS_LAST); - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Module module = loadModel(TEST_MODULE_ASSET_NAME); final IValue outputNCHW = module.runMethod("conv2d", IValue.from(inputNCHW), IValue.from(wNCHW), IValue.from(false)); @@ -389,5 +389,5 @@ public abstract class PytorchTestBase { assertArrayEquals(expectedData, t.getDataAsLongArray()); } - protected abstract String assetFilePath(String assetName) throws IOException; + protected abstract Module loadModel(String assetName) throws IOException; } diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java b/android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java new file mode 100644 index 00000000000..a494ffc663f --- /dev/null +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/suite/PytorchLiteInstrumentedTestSuite.java @@ -0,0 +1,9 @@ +package org.pytorch.suite; + +import org.junit.runner.RunWith; +import org.junit.runners.Suite; +import org.pytorch.PytorchLiteInstrumentedTests; + +@RunWith(Suite.class) +@Suite.SuiteClasses({PytorchLiteInstrumentedTests.class}) +public class PytorchLiteInstrumentedTestSuite {}
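To make the refactor above concrete: PytorchTestBase now delegates model loading to the abstract loadModel, and each build flavor supplies the matching loader (Module.load for the full-JIT artifact, LiteModuleLoader.load for the lite artifact). Below is a minimal, hypothetical sketch of what that looks like from calling code; the class name, helper method, and tensor shape are invented for illustration and are not part of the patch.

```java
import org.pytorch.IValue;
import org.pytorch.LiteModuleLoader;
import org.pytorch.Module;
import org.pytorch.Tensor;

// Hypothetical helper showing the two loaders the tests exercise.
// In a real app only one artifact is on the classpath; the test sources
// handle this with the build.gradle excludes shown earlier in this patch.
public class LoaderSketch {

  // Full-JIT build (pytorch_android / pytorch_jni).
  static Module loadFullJit(String filePath) {
    return Module.load(filePath);
  }

  // Lite-interpreter build (pytorch_android_lite / pytorch_jni_lite).
  // The fixture is saved with _save_for_lite_interpreter, which the
  // generation script notes is also loadable by full JIT.
  static Module loadLite(String filePath) {
    return LiteModuleLoader.load(filePath);
  }

  // Running the module is identical for both flavors once it is loaded,
  // which is why PytorchTestBase can stay loader-agnostic.
  static float[] runForward(Module module, float[] data, long[] shape) {
    Tensor input = Tensor.fromBlob(data, shape);
    IValue output = module.forward(IValue.from(input));
    return output.toTensor().getDataAsFloatArray();
  }
}
```

This mirrors the BUILD_LITE_INTERPRETER switch in build.gradle: the same PytorchTestBase tests run against whichever loader the selected flavor wires in.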