Revert "[Reland] fix missing-prototypes warnings in torch_cpu (Part 4) (#101949)"

This reverts commit 4f2c007a1b.

Reverted https://github.com/pytorch/pytorch/pull/101949 on behalf of https://github.com/osalpekar: as noted in @izaitsevfb's comment, we are still seeing linker errors, this time due to `nnc_prepacked_linear_clamp_run` being made a static function. ([comment](https://github.com/pytorch/pytorch/pull/101949#issuecomment-1560226880))
PyTorch MergeBot 2023-05-23 22:53:47 +00:00
parent 45a8f691ec
commit 32ce06a5ab
138 changed files with 772 additions and 572 deletions
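
A note on the failure mode cited above: marking a function `static` gives it internal linkage, so its symbol is no longer exported from the object file, and any other translation unit that still declares and calls it fails at link time. A minimal two-file sketch of that error class (simplified, hypothetical signature; not the actual torch_cpu sources):

// kernel.cpp -- the definition. With `static`, the symbol has internal
// linkage and is invisible to other translation units.
static int nnc_prepacked_linear_clamp_run(int x) { // hypothetical signature
  return x + 1;
}

// caller.cpp -- a separate translation unit still expects external linkage.
int nnc_prepacked_linear_clamp_run(int x); // declaration only

int main() {
  // Links only if the definition above has external linkage; with `static`,
  // the linker reports an undefined reference to
  // nnc_prepacked_linear_clamp_run, the kind of error that motivated this
  // revert.
  return nnc_prepacked_linear_clamp_run(41);
}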

View File

@ -1598,7 +1598,6 @@ TORCH_COPTS = COMMON_COPTS + [
"-fvisibility-inlines-hidden",
"-fno-math-errno ",
"-fno-trapping-math",
"-Wno-error=unused-function",
]
torch_sources = {
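
The TORCH_COPTS hunk above touches "-Wno-error=unused-function". That flag appears to go hand in hand with the `static` changes in the rest of the diff: once a function is given internal linkage and nothing in its own translation unit calls it, GCC/Clang emit -Wunused-function, which -Werror builds turn into a hard error unless the warning is downgraded. A hedged sketch of the pattern (function names are made up for illustration):

// helpers.cpp
// External linkage: the compiler assumes another TU may call it, so no
// unused-function warning is emitted even if nothing here uses it.
int helper_used_elsewhere(int x) { return x * 2; }

// Internal linkage: if nothing in this TU calls it, -Wunused-function fires
// ("unused function 'helper_now_static'"); under -Werror that breaks the
// build unless -Wno-error=unused-function is passed.
static int helper_now_static(int x) { return x * 2; }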

View File

@ -763,7 +763,7 @@ IValueComparator getGreaterThanComparator(const IValue& v) {
};
}
std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
static std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
out << v.qualifiedClassName() << "." << v.name();
return out;
}

View File

@ -1628,7 +1628,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target {
TORCH_API friend std::ostream& operator<<(
std::ostream& out,
const ivalue::EnumHolder& v);
const EnumHolder& v);
TORCH_API const std::string qualifiedClassName() const;

View File

@ -405,7 +405,7 @@ static std::tuple<Tensor,optional<int64_t>> searchsorted_batch_rule(
TORCH_INTERNAL_ASSERT(false);
}
static Tensor bucketize_decomp_Tensor(
Tensor bucketize_decomp_Tensor(
const Tensor& self,
const Tensor& boundaries,
bool out_int32,
@ -415,7 +415,7 @@ static Tensor bucketize_decomp_Tensor(
return at::searchsorted(boundaries, self, out_int32, right, nullopt, nullopt);
}
static Tensor bucketize_decomp_Scalar(
Tensor bucketize_decomp_Scalar(
const Scalar& self,
const Tensor& boundaries,
bool out_int32,

View File

@ -374,8 +374,8 @@ TORCH_IMPL_FUNC(softshrink_backward_out) (
shrink_backward_stub(device_type(), *this, lambd);
}
#if AT_MKLDNN_ENABLED()
static bool use_mkldnn(const Tensor& input) {
#if AT_MKLDNN_ENABLED()
if (!at::globalContext().userEnabledMkldnn()) {
return false;
}
@ -386,8 +386,9 @@ static bool use_mkldnn(const Tensor& input) {
(input.device().is_cpu() &&
(((input.scalar_type() == kBFloat16) && mkldnn_bf16_device_check()) ||
(input.scalar_type() == kFloat))); // input is dense layout and bfloat16/float32
}
#endif
return false;
}
TORCH_IMPL_FUNC(gelu_out_cpu) (
const Tensor& self, c10::string_view approximate, const Tensor& result

View File

@ -809,7 +809,7 @@ Tensor& arctan2_out(const Tensor& self, const Tensor& other, Tensor& result) {
return at::atan2_out(result, self, other);
}
static Tensor& add_relu_impl(
Tensor& add_relu_impl(
Tensor& result, const Tensor& self, const Tensor& other, const Scalar& alpha) {
auto iter = TensorIterator::binary_op(result, self, other);
Scalar min_val;
@ -1003,7 +1003,7 @@ Tensor& mul__scalar_sparse_csr(Tensor& self, const Scalar& other) {
return self;
}
static Device correct_out_device(const Tensor& self, const Tensor& other) {
Device correct_out_device(const Tensor& self, const Tensor& other) {
if (self.device() == at::kCPU){
return other.device();
} else {
@ -1049,7 +1049,7 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) {
}
}
static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const Scalar& alpha) {
Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const Scalar& alpha) {
auto out_device = correct_out_device(self, other);
// hack to use the TensorIterator to get the correct broadcasting and type promotion logic
auto device_ = Device(DeviceType::Meta);

View File

@ -770,7 +770,6 @@ static void check_input_same_type_as_parameters(
check_input_same_type_as_parameters(input, weight, /*bias=*/ Tensor());
}
#if AT_MKLDNN_ENABLED()
static void check_input_same_type_as_parameters(
const Tensor& input,
const Tensor& weight,
@ -789,7 +788,6 @@ static void check_input_same_type_as_parameters(
check_input_same_type_as_parameters(input, weight, bias);
}
}
#endif
static auto view4d(const at::Tensor& tensor) -> at::Tensor {
TORCH_CHECK(tensor.ndimension() == 3,

View File

@ -21,7 +21,6 @@
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_copy_from.h>
#include <ATen/ops/_propagate_xla_data.h>
#include <ATen/ops/copy_native.h>
#include <ATen/ops/empty.h>
#include <ATen/ops/expand_copy.h>

View File

@ -3,11 +3,6 @@
#include <ATen/WrapDimUtils.h>
#include <ATen/LegacyVmapTransforms.h>
#ifdef AT_PER_OPERATOR_HEADERS
#include <ATen/ops/_add_batch_dim_native.h>
#include <ATen/ops/_remove_batch_dim_native.h>
#endif
namespace at { namespace native {
// Adds a batch dimension to the tensor `self` out-of-place

View File

@ -1893,7 +1893,7 @@ The behavior depends on the dimensionality of the Tensors as follows:
- Otherwise, we return bmm, after broadcasting and folding the batched dimensions if
there's more than one
*/
static Tensor _matmul_impl(
Tensor _matmul_impl(
Tensor& out,
const Tensor& tensor1,
const Tensor& tensor2) {

View File

@ -20,7 +20,7 @@
namespace at { namespace native {
static void checkLongTensor(const Tensor& tensor) {
void checkLongTensor(const Tensor& tensor) {
TORCH_CHECK(tensor.dim() == 1 && tensor.device().type() == at::kCPU && tensor.scalar_type() == at::kLong,
"'lengths' argument should be a 1D CPU int64 tensor, but got ",
tensor.dim(), "D ", tensor.device().str(), " ", tensor.scalar_type(), " tensor");

View File

@ -1809,7 +1809,7 @@ std::tuple<Tensor, Tensor, Tensor> quantized_lstm_data(
std::move(std::get<2>(results)));
}
static std::tuple<Tensor, Tensor, Tensor> quantized_lstm_data_legacy(
std::tuple<Tensor, Tensor, Tensor> quantized_lstm_data_legacy(
const Tensor& data,
const Tensor& batch_sizes,
c10::List<at::Tensor> hx_,

View File

@ -11,7 +11,6 @@
#include <ATen/ops/resize_as_native.h>
#include <ATen/ops/resize_native.h>
#include <ATen/ops/resize.h>
#include <ATen/ops/_resize_output.h>
#endif
namespace at { namespace native {

View File

@ -400,7 +400,7 @@ static void build_index_op(
iter.build(config);
}
static void check_indices_on_cpu_or_selfdevice(
void check_indices_on_cpu_or_selfdevice(
const Tensor& self,
const at::MaterializedIOptTensorListRef& indices) {
auto dev = self.device();
@ -965,7 +965,7 @@ TORCH_IMPL_FUNC(index_add_cpu_out)
}
}
static void index_reduce_func_impl(
void index_reduce_func_impl(
const Tensor& self,
int64_t dim,
const Tensor& index,
@ -1149,7 +1149,7 @@ static void check_indexarray_range(
}
}
static Tensor & index_select_out_cpu_dim1_(
Tensor & index_select_out_cpu_dim1_(
Tensor & result_contig, const Tensor & self, const Tensor & index_contig) {
auto self_contig = self.contiguous();
@ -1379,6 +1379,10 @@ Tensor index_select_quantized_cpu_(const Tensor & self, int64_t dim, const Tenso
return at::native::index_select_out_cpu_(self, dim, index, result);
}
Tensor index_select_backward(const Tensor& grad, at::IntArrayRef self_sizes, int64_t dim, const Tensor& index) {
return at::native::index_select_backward_symint(grad, c10::fromIntArrayRefSlow(self_sizes), dim, index);
}
Tensor index_select_backward_symint(const Tensor& grad, c10::SymIntArrayRef self_sizes, int64_t dim, const Tensor& index) {
// for composite compliance, use out-of-place variant of
// `index_add` if index tensor is a Tensor Subclass.
@ -1533,7 +1537,7 @@ static void scatter_reduce_exclude_self_helper(
});
}
static void _scatter_via_index_put(
void _scatter_via_index_put(
const Tensor& self,
int64_t dim,
const Tensor& index,

View File

@ -1009,7 +1009,7 @@ Tensor dense_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optio
return dense_to_sparse_compressed<Layout::SparseBsc>(self, blocksize, dense_dim_opt);
}
static void _check_blocksize_matches(
void _check_blocksize_matches(
const Tensor& self,
c10::optional<IntArrayRef> blocksize_opt,
const std::string& name) {
@ -1023,7 +1023,7 @@ static void _check_blocksize_matches(
}
}
static Tensor sparse_compressed_clone(
Tensor sparse_compressed_clone(
const Tensor& self,
c10::optional<IntArrayRef> blocksize,
const std::string& name) {
@ -1046,7 +1046,7 @@ static Tensor sparse_compressed_clone(
values.device());
}
static Tensor sparse_compressed_to_flipped(
Tensor sparse_compressed_to_flipped(
const Tensor& self,
c10::optional<IntArrayRef> blocksize,
const std::string& name) {

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/native/Unfold3d.h>
#include <ATen/Config.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>

View File

@ -10,8 +10,6 @@
#else
#include <ATen/ops/_weight_norm_differentiable_backward_native.h>
#include <ATen/ops/_weight_norm_interface.h>
#include <ATen/ops/_weight_norm_interface_backward_native.h>
#include <ATen/ops/_weight_norm_interface_native.h>
#include <ATen/ops/_weight_norm_native.h>
#include <ATen/ops/empty_strided.h>
#include <ATen/ops/norm_except_dim.h>

View File

@ -13,7 +13,7 @@ namespace at::native {
inline namespace CPU_CAPABILITY {
static void pow_tensor_tensor_kernel(TensorIteratorBase& iter) {
void pow_tensor_tensor_kernel(TensorIteratorBase& iter) {
const auto dtype = iter.common_dtype();
if (isFloatingType(dtype) || isComplexType(dtype)) {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, dtype, "pow", [&]() {
@ -90,7 +90,7 @@ void reciprocal_kernel(TensorIteratorBase& iter);
void rsqrt_kernel(TensorIteratorBase& iter);
void sqrt_kernel(TensorIteratorBase& iter);
static void pow_tensor_scalar_kernel(
void pow_tensor_scalar_kernel(
TensorIteratorBase& iter,
const Scalar& exp_scalar) {
// prevent multiple calls to iter.common_dtype()

View File

@ -32,7 +32,6 @@ namespace mkl {
namespace {
#if AT_USE_MKL_SPARSE()
c10::MaybeOwned<Tensor> prepare_dense_matrix_for_mkl(
const Tensor& tensor) {
if (tensor.is_non_overlapping_and_dense() ||
@ -111,6 +110,7 @@ void inline col_indices_and_values_resize_(const Tensor& input, int64_t nnz) {
/*
Resizes `input` tensor and fills it with the data from MKL.
*/
#if AT_USE_MKL_SPARSE()
template <typename scalar_t>
void mkl_result_copy_(const Tensor& input, sparse_matrix_t mkl_desc) {
sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO;

View File

@ -6,7 +6,7 @@
namespace at {
namespace native {
static TensorOptions verify_empty_parameters(
TensorOptions verify_empty_parameters(
const at::Tensor& self,
c10::optional<ScalarType> dtype,
c10::optional<Layout> layout,

View File

@ -79,6 +79,64 @@ Tensor bmm_nested(const Tensor& self, const Tensor& mat2) {
return output;
}
// utilities support `matmul_nested`
namespace {
// Args:
// self_sizes: the sizes of `self` in `matmul_nested`
// mat2_sizes: the sizes of `mat2` in `matmul_nested`
// buffer_op: the options for new buffer
// sizemat_op: the options for new size matrix
// Returns:
// the batch size of each input underlying tensor, i.e. the product of batch-dimension sizes
// the empty output nested tensor
inline std::tuple<std::vector<int64_t>, Tensor>
matmul_nested_helper(
const std::vector<IntArrayRef>& self_sizes,
const std::vector<IntArrayRef>& mat2_sizes,
const c10::TensorOptions& buffer_op,
const c10::TensorOptions& sizemat_op) {
int64_t ntensors = self_sizes.size(),
ndims = self_sizes[0].size();
std::vector<int64_t> batch_sizes(ntensors, 1);
Tensor sizemat = at::empty({ntensors, ndims}, sizemat_op);
int64_t* sizemat_ptr = sizemat.mutable_data_ptr<int64_t>();
int64_t numel = 0;
for (int64_t i = 0; i < ntensors; i++) {
const IntArrayRef& self_size = self_sizes[i],
& mat2_size = mat2_sizes[i];
int64_t& batch_size = batch_sizes[i];
// batch dimensions
for (int64_t j = 0; j < ndims - 2; j++) {
const int64_t& self_sizej = self_size[j],
& mat2_sizej = mat2_size[j];
TORCH_CHECK(
self_sizej == mat2_sizej,
"matmul: For nested tensors, no broadcasting is currently performed: ",
i, "-th nested matrices in batch at dimension ", j + 1,
" have mismatching sizes ", self_sizej, " and ", mat2_sizej);
sizemat_ptr[j] = self_sizej;
batch_size *= sizemat_ptr[j];
}
// matrix multiplication dimensions
const int64_t& self_size0 = self_size[ndims - 2], & self_size1 = self_size[ndims - 1],
& mat2_size0 = mat2_size[ndims - 2], & mat2_size1 = mat2_size[ndims - 1];
TORCH_CHECK(
self_size1 == mat2_size0,
"matmul: ",
i, "-th nested matrices in batch cannot be multiplied (",
self_size0, "x", self_size1, " and ",
mat2_size0, "x", mat2_size1, ")");
sizemat_ptr[ndims - 2] = self_size0;
sizemat_ptr[ndims - 1] = mat2_size1;
sizemat_ptr += ndims;
numel += batch_size * self_size0 * mat2_size1;
}
Tensor buffer = at::empty(numel, buffer_op);
Tensor output = wrap_buffer(buffer, sizemat);
return std::make_tuple(batch_sizes, output);
}
}
Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) {
// Tensor self = self_.contiguous();
// Tensor mat2 = mat2_.contiguous();

View File

@ -128,7 +128,7 @@ Tensor fake_quantize_per_channel_affine_cachemask_backward(
return dY * mask;
}
static Tensor _get_rounded_zero_point(
Tensor _get_rounded_zero_point(
const Tensor& zero_point,
int64_t quant_min,
int64_t quant_max) {

View File

@ -133,7 +133,7 @@ Tensor fake_quantize_per_tensor_affine_cachemask_backward(
return dY * mask;
}
static int64_t _get_zero_point_from_tensor(
int64_t _get_zero_point_from_tensor(
const Tensor& zero_point,
int64_t quant_min,
int64_t quant_max,

View File

@ -285,7 +285,7 @@ std::tuple<double, int64_t> _choose_qparams_per_tensor(
return std::make_tuple(q_params.scale, q_params.zero_point);
}
static float calculate_quant_loss(
float calculate_quant_loss(
const float* input,
int numel,
float xmin,

View File

@ -171,6 +171,15 @@ Tensor mean_quantized_cpu(
return result;
}
Tensor mean_quantized_cpu(
const Tensor& self,
DimnameList dim,
bool keepdim,
optional<ScalarType> dtype) {
return mean_quantized_cpu(
self, dimnames_to_positions(self, dim), keepdim, dtype);
}
Tensor& mean_out_quantized_cpu(
Tensor& result,
const Tensor& self,

View File

@ -9,7 +9,7 @@ namespace native {
DEFINE_DISPATCH(qdropout_stub);
static Tensor quantized_dropout(
Tensor quantized_dropout(
const Tensor& qx, double output_scale, int64_t output_zero_point, const Scalar& p, bool training) {
return qx;
}

View File

@ -35,7 +35,7 @@ DEFINE_DISPATCH(qrelu_leaky_stub);
DEFINE_DISPATCH(qprelu_stub);
#ifdef USE_PYTORCH_QNNPACK
static Tensor qnnpack_relu(Tensor input) {
Tensor qnnpack_relu(Tensor input) {
Tensor qy;
TORCH_CHECK(
input.ndimension() > 0, "qnnpack_relu(): Got empty input tensor");

View File

@ -122,7 +122,7 @@ bool solve_arange(const Tensor& input, int64_t& start, int64_t& end, int64_t& st
formats with support to batched and dense dimensions.
*/
static void _validate_sparse_compressed_tensor_args_worker(const Tensor& compressed_indices, const Tensor& plain_indices, const Tensor& values, const IntArrayRef size, const Layout& layout) {
void _validate_sparse_compressed_tensor_args_worker(const Tensor& compressed_indices, const Tensor& plain_indices, const Tensor& values, const IntArrayRef size, const Layout& layout) {
// Layout must be Sparse Compressed, 2.4
AT_DISPATCH_ALL_SPARSE_COMPRESSED_LAYOUTS(layout, "validate_sparse_compressed_tensor_args", [&]{});
@ -321,7 +321,7 @@ void _validate_sparse_bsc_tensor_args(const Tensor& ccol_indices, const Tensor&
// of historical reasons (that ought to be removed in future) and does
// not mean that the corresponding functionality would be CSR layout
// only specific.
static SparseCsrTensor new_compressed_tensor(const TensorOptions& options) {
SparseCsrTensor new_compressed_tensor(const TensorOptions& options) {
// TODO: remove this comment after enabling autograd support for CSR tensor
// constructor.
// TORCH_INTERNAL_ASSERT(impl::variable_excluded_from_dispatch());
@ -401,7 +401,7 @@ SPARSE_COMPRESSED_TENSOR_UNSAFE(csc, kSparseCsc);
SPARSE_COMPRESSED_TENSOR_UNSAFE(bsr, kSparseBsr);
SPARSE_COMPRESSED_TENSOR_UNSAFE(bsc, kSparseBsc);
static DimVector _estimate_sparse_compressed_tensor_size(
DimVector _estimate_sparse_compressed_tensor_size(
const Tensor& compressed_indices,
const Tensor& plain_indices,
const Tensor& values,
@ -716,6 +716,12 @@ int64_t dense_dim_sparse_csr(const SparseCsrTensor& self) {
return get_sparse_csr_impl(self)->dense_dim();
}
bool _is_same_size_as_sparse_compressed(
const SparseCsrTensor& self,
const SparseCsrTensor& src) {
return self.sizes().equals(src.sizes());
}
const SparseCsrTensor& resize_as_sparse_compressed_(
const SparseCsrTensor& self,
const SparseCsrTensor& src) {

View File

@ -342,6 +342,16 @@ inline Tensor get_result_tensor_for_unary_op(F op, const Tensor& input) {
}
} // namespace
static constexpr bool is_mkl_supported() {
#ifdef _MSC_VER
return false;
#elif __APPLE__ || __MACH__
return false;
#else
return true;
#endif
}
// Only accept squares sparse matrices or dense input as a vector
// TODO: Check what happens with MKL, the output error reported with non square
// matrices tends to be high See:

View File

@ -78,6 +78,20 @@
namespace at::native {
using namespace at::sparse;
// --------------------------------------------------------------------
// Utility functions
// --------------------------------------------------------------------
namespace {
inline SparseTensor get_result_tensor_for_unary_op(const SparseTensor& input) {
if (c10::isIntegralType(input.scalar_type(), /*includeBool=*/true)) {
return at::empty_like(input, input.options().dtype(c10::get_default_dtype()));
}
return at::empty_like(input);
}
}
// --------------------------------------------------------------------
// zero_(SparseTensor)
// --------------------------------------------------------------------

View File

@ -16,7 +16,7 @@ nnapi_wrapper* nnapi;
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
nnapi_wrapper* check_nnapi;
static void load_platform_library() {
void load_platform_library() {
static int run_once = [](){
nnapi_wrapper_load(&nnapi, &check_nnapi);
CAFFE_ENFORCE(nnapi);

View File

@ -23,7 +23,7 @@
static int loaded = 0;
static struct nnapi_wrapper nnapi_;
static struct nnapi_wrapper check_nnapi_;
static int check__getDeviceCount(uint32_t* numDevices) {
int check__getDeviceCount(uint32_t* numDevices) {
CAFFE_ENFORCE(nnapi_._getDeviceCount);
int ret = nnapi_._getDeviceCount(numDevices);
// TODO: Maybe add better logging here.
@ -33,7 +33,7 @@ static int check__getDeviceCount(uint32_t* numDevices) {
);
return ret;
}
static int check__getDevice(uint32_t devIndex, ANeuralNetworksDevice** device) {
int check__getDevice(uint32_t devIndex, ANeuralNetworksDevice** device) {
CAFFE_ENFORCE(nnapi_._getDevice);
int ret = nnapi_._getDevice(devIndex,device);
// TODO: Maybe add better logging here.
@ -43,7 +43,7 @@ static int check__getDevice(uint32_t devIndex, ANeuralNetworksDevice** device) {
);
return ret;
}
static int check_Device_getName(const ANeuralNetworksDevice* device, const char** name) {
int check_Device_getName(const ANeuralNetworksDevice* device, const char** name) {
CAFFE_ENFORCE(nnapi_.Device_getName);
int ret = nnapi_.Device_getName(device,name);
// TODO: Maybe add better logging here.
@ -53,7 +53,7 @@ static int check_Device_getName(const ANeuralNetworksDevice* device, const char*
);
return ret;
}
static int check_Device_getVersion(const ANeuralNetworksDevice* device, const char** version) {
int check_Device_getVersion(const ANeuralNetworksDevice* device, const char** version) {
CAFFE_ENFORCE(nnapi_.Device_getVersion);
int ret = nnapi_.Device_getVersion(device,version);
// TODO: Maybe add better logging here.
@ -63,7 +63,7 @@ static int check_Device_getVersion(const ANeuralNetworksDevice* device, const ch
);
return ret;
}
static int check_Device_getFeatureLevel(const ANeuralNetworksDevice* device, int64_t* featureLevel) {
int check_Device_getFeatureLevel(const ANeuralNetworksDevice* device, int64_t* featureLevel) {
CAFFE_ENFORCE(nnapi_.Device_getFeatureLevel);
int ret = nnapi_.Device_getFeatureLevel(device,featureLevel);
// TODO: Maybe add better logging here.
@ -73,7 +73,7 @@ static int check_Device_getFeatureLevel(const ANeuralNetworksDevice* device, int
);
return ret;
}
static int check_Model_getSupportedOperationsForDevices( const ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, bool* supportedOps) {
int check_Model_getSupportedOperationsForDevices( const ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, bool* supportedOps) {
CAFFE_ENFORCE(nnapi_.Model_getSupportedOperationsForDevices);
int ret = nnapi_.Model_getSupportedOperationsForDevices(model,devices,numDevices,supportedOps);
// TODO: Maybe add better logging here.
@ -83,7 +83,7 @@ static int check_Model_getSupportedOperationsForDevices( const ANeuralNetworksMo
);
return ret;
}
static int check_Compilation_createForDevices(ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, ANeuralNetworksCompilation** compilation) {
int check_Compilation_createForDevices(ANeuralNetworksModel* model, const ANeuralNetworksDevice* const* devices, uint32_t numDevices, ANeuralNetworksCompilation** compilation) {
CAFFE_ENFORCE(nnapi_.Compilation_createForDevices);
int ret = nnapi_.Compilation_createForDevices(model,devices,numDevices,compilation);
// TODO: Maybe add better logging here.
@ -93,7 +93,7 @@ static int check_Compilation_createForDevices(ANeuralNetworksModel* model, const
);
return ret;
}
static int check_Execution_compute(ANeuralNetworksExecution* execution) {
int check_Execution_compute(ANeuralNetworksExecution* execution) {
CAFFE_ENFORCE(nnapi_.Execution_compute);
int ret = nnapi_.Execution_compute(execution);
// TODO: Maybe add better logging here.
@ -103,7 +103,7 @@ static int check_Execution_compute(ANeuralNetworksExecution* execution) {
);
return ret;
}
static int check_Memory_createFromFd(size_t size, int protect, int fd, size_t offset, ANeuralNetworksMemory** memory) {
int check_Memory_createFromFd(size_t size, int protect, int fd, size_t offset, ANeuralNetworksMemory** memory) {
CAFFE_ENFORCE(nnapi_.Memory_createFromFd);
int ret = nnapi_.Memory_createFromFd(size,protect,fd,offset,memory);
// TODO: Maybe add better logging here.
@ -113,11 +113,11 @@ static int check_Memory_createFromFd(size_t size, int protect, int fd, size_t of
);
return ret;
}
static void check_Memory_free(ANeuralNetworksMemory* memory) {
void check_Memory_free(ANeuralNetworksMemory* memory) {
CAFFE_ENFORCE(nnapi_.Memory_free);
nnapi_.Memory_free(memory);
}
static int check_Model_create(ANeuralNetworksModel** model) {
int check_Model_create(ANeuralNetworksModel** model) {
CAFFE_ENFORCE(nnapi_.Model_create);
int ret = nnapi_.Model_create(model);
// TODO: Maybe add better logging here.
@ -127,11 +127,11 @@ static int check_Model_create(ANeuralNetworksModel** model) {
);
return ret;
}
static void check_Model_free(ANeuralNetworksModel* model) {
void check_Model_free(ANeuralNetworksModel* model) {
CAFFE_ENFORCE(nnapi_.Model_free);
nnapi_.Model_free(model);
}
static int check_Model_finish(ANeuralNetworksModel* model) {
int check_Model_finish(ANeuralNetworksModel* model) {
CAFFE_ENFORCE(nnapi_.Model_finish);
int ret = nnapi_.Model_finish(model);
// TODO: Maybe add better logging here.
@ -141,7 +141,7 @@ static int check_Model_finish(ANeuralNetworksModel* model) {
);
return ret;
}
static int check_Model_addOperand(ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type) {
int check_Model_addOperand(ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type) {
CAFFE_ENFORCE(nnapi_.Model_addOperand);
int ret = nnapi_.Model_addOperand(model,type);
// TODO: Maybe add better logging here.
@ -151,7 +151,7 @@ static int check_Model_addOperand(ANeuralNetworksModel* model, const ANeuralNetw
);
return ret;
}
static int check_Model_setOperandValue(ANeuralNetworksModel* model, int32_t index, const void* buffer, size_t length) {
int check_Model_setOperandValue(ANeuralNetworksModel* model, int32_t index, const void* buffer, size_t length) {
CAFFE_ENFORCE(nnapi_.Model_setOperandValue);
int ret = nnapi_.Model_setOperandValue(model,index,buffer,length);
// TODO: Maybe add better logging here.
@ -161,7 +161,7 @@ static int check_Model_setOperandValue(ANeuralNetworksModel* model, int32_t inde
);
return ret;
}
static int check_Model_setOperandValueFromMemory(ANeuralNetworksModel* model, int32_t index, const ANeuralNetworksMemory* memory, size_t offset, size_t length) {
int check_Model_setOperandValueFromMemory(ANeuralNetworksModel* model, int32_t index, const ANeuralNetworksMemory* memory, size_t offset, size_t length) {
CAFFE_ENFORCE(nnapi_.Model_setOperandValueFromMemory);
int ret = nnapi_.Model_setOperandValueFromMemory(model,index,memory,offset,length);
// TODO: Maybe add better logging here.
@ -171,7 +171,7 @@ static int check_Model_setOperandValueFromMemory(ANeuralNetworksModel* model, in
);
return ret;
}
static int check_Model_addOperation(ANeuralNetworksModel* model, ANeuralNetworksOperationType type, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs) {
int check_Model_addOperation(ANeuralNetworksModel* model, ANeuralNetworksOperationType type, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs) {
CAFFE_ENFORCE(nnapi_.Model_addOperation);
int ret = nnapi_.Model_addOperation(model,type,inputCount,inputs,outputCount,outputs);
// TODO: Maybe add better logging here.
@ -181,7 +181,7 @@ static int check_Model_addOperation(ANeuralNetworksModel* model, ANeuralNetworks
);
return ret;
}
static int check_Model_identifyInputsAndOutputs(ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs) {
int check_Model_identifyInputsAndOutputs(ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs) {
CAFFE_ENFORCE(nnapi_.Model_identifyInputsAndOutputs);
int ret = nnapi_.Model_identifyInputsAndOutputs(model,inputCount,inputs,outputCount,outputs);
// TODO: Maybe add better logging here.
@ -191,7 +191,7 @@ static int check_Model_identifyInputsAndOutputs(ANeuralNetworksModel* model, uin
);
return ret;
}
static int check_Model_relaxComputationFloat32toFloat16(ANeuralNetworksModel* model, bool allow) {
int check_Model_relaxComputationFloat32toFloat16(ANeuralNetworksModel* model, bool allow) {
CAFFE_ENFORCE(nnapi_.Model_relaxComputationFloat32toFloat16);
int ret = nnapi_.Model_relaxComputationFloat32toFloat16(model,allow);
// TODO: Maybe add better logging here.
@ -201,7 +201,7 @@ static int check_Model_relaxComputationFloat32toFloat16(ANeuralNetworksModel* mo
);
return ret;
}
static int check_Compilation_create(ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation) {
int check_Compilation_create(ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation) {
CAFFE_ENFORCE(nnapi_.Compilation_create);
int ret = nnapi_.Compilation_create(model,compilation);
// TODO: Maybe add better logging here.
@ -211,11 +211,11 @@ static int check_Compilation_create(ANeuralNetworksModel* model, ANeuralNetworks
);
return ret;
}
static void check_Compilation_free(ANeuralNetworksCompilation* compilation) {
void check_Compilation_free(ANeuralNetworksCompilation* compilation) {
CAFFE_ENFORCE(nnapi_.Compilation_free);
nnapi_.Compilation_free(compilation);
}
static int check_Compilation_setPreference(ANeuralNetworksCompilation* compilation, int32_t preference) {
int check_Compilation_setPreference(ANeuralNetworksCompilation* compilation, int32_t preference) {
CAFFE_ENFORCE(nnapi_.Compilation_setPreference);
int ret = nnapi_.Compilation_setPreference(compilation,preference);
// TODO: Maybe add better logging here.
@ -225,7 +225,7 @@ static int check_Compilation_setPreference(ANeuralNetworksCompilation* compilati
);
return ret;
}
static int check_Compilation_finish(ANeuralNetworksCompilation* compilation) {
int check_Compilation_finish(ANeuralNetworksCompilation* compilation) {
CAFFE_ENFORCE(nnapi_.Compilation_finish);
int ret = nnapi_.Compilation_finish(compilation);
// TODO: Maybe add better logging here.
@ -235,7 +235,7 @@ static int check_Compilation_finish(ANeuralNetworksCompilation* compilation) {
);
return ret;
}
static int check_Execution_create(ANeuralNetworksCompilation* compilation, ANeuralNetworksExecution** execution) {
int check_Execution_create(ANeuralNetworksCompilation* compilation, ANeuralNetworksExecution** execution) {
CAFFE_ENFORCE(nnapi_.Execution_create);
int ret = nnapi_.Execution_create(compilation,execution);
// TODO: Maybe add better logging here.
@ -245,11 +245,11 @@ static int check_Execution_create(ANeuralNetworksCompilation* compilation, ANeur
);
return ret;
}
static void check_Execution_free(ANeuralNetworksExecution* execution) {
void check_Execution_free(ANeuralNetworksExecution* execution) {
CAFFE_ENFORCE(nnapi_.Execution_free);
nnapi_.Execution_free(execution);
}
static int check_Execution_setInput(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const void* buffer, size_t length) {
int check_Execution_setInput(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const void* buffer, size_t length) {
CAFFE_ENFORCE(nnapi_.Execution_setInput);
int ret = nnapi_.Execution_setInput(execution,index,type,buffer,length);
// TODO: Maybe add better logging here.
@ -259,7 +259,7 @@ static int check_Execution_setInput(ANeuralNetworksExecution* execution, int32_t
);
return ret;
}
static int check_Execution_setInputFromMemory(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length) {
int check_Execution_setInputFromMemory(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length) {
CAFFE_ENFORCE(nnapi_.Execution_setInputFromMemory);
int ret = nnapi_.Execution_setInputFromMemory(execution,index,type,memory,offset,length);
// TODO: Maybe add better logging here.
@ -269,7 +269,7 @@ static int check_Execution_setInputFromMemory(ANeuralNetworksExecution* executio
);
return ret;
}
static int check_Execution_setOutput(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, void* buffer, size_t length) {
int check_Execution_setOutput(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, void* buffer, size_t length) {
CAFFE_ENFORCE(nnapi_.Execution_setOutput);
int ret = nnapi_.Execution_setOutput(execution,index,type,buffer,length);
// TODO: Maybe add better logging here.
@ -279,7 +279,7 @@ static int check_Execution_setOutput(ANeuralNetworksExecution* execution, int32_
);
return ret;
}
static int check_Execution_setOutputFromMemory(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length) {
int check_Execution_setOutputFromMemory(ANeuralNetworksExecution* execution, int32_t index, const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, size_t offset, size_t length) {
CAFFE_ENFORCE(nnapi_.Execution_setOutputFromMemory);
int ret = nnapi_.Execution_setOutputFromMemory(execution,index,type,memory,offset,length);
// TODO: Maybe add better logging here.
@ -289,7 +289,7 @@ static int check_Execution_setOutputFromMemory(ANeuralNetworksExecution* executi
);
return ret;
}
static int check_Execution_startCompute(ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event) {
int check_Execution_startCompute(ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event) {
CAFFE_ENFORCE(nnapi_.Execution_startCompute);
int ret = nnapi_.Execution_startCompute(execution,event);
// TODO: Maybe add better logging here.
@ -299,7 +299,7 @@ static int check_Execution_startCompute(ANeuralNetworksExecution* execution, ANe
);
return ret;
}
static int check_Event_wait(ANeuralNetworksEvent* event) {
int check_Event_wait(ANeuralNetworksEvent* event) {
CAFFE_ENFORCE(nnapi_.Event_wait);
int ret = nnapi_.Event_wait(event);
// TODO: Maybe add better logging here.
@ -309,11 +309,11 @@ static int check_Event_wait(ANeuralNetworksEvent* event) {
);
return ret;
}
static void check_Event_free(ANeuralNetworksEvent* event) {
void check_Event_free(ANeuralNetworksEvent* event) {
CAFFE_ENFORCE(nnapi_.Event_free);
nnapi_.Event_free(event);
}
static int check_Execution_getOutputOperandRank(ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank) {
int check_Execution_getOutputOperandRank(ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank) {
CAFFE_ENFORCE(nnapi_.Execution_getOutputOperandRank);
int ret = nnapi_.Execution_getOutputOperandRank(execution,index,rank);
// TODO: Maybe add better logging here.
@ -323,7 +323,7 @@ static int check_Execution_getOutputOperandRank(ANeuralNetworksExecution* execut
);
return ret;
}
static int check_Execution_getOutputOperandDimensions(ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions) {
int check_Execution_getOutputOperandDimensions(ANeuralNetworksExecution* execution, int32_t index, uint32_t* dimensions) {
CAFFE_ENFORCE(nnapi_.Execution_getOutputOperandDimensions);
int ret = nnapi_.Execution_getOutputOperandDimensions(execution,index,dimensions);
// TODO: Maybe add better logging here.

View File

@ -83,7 +83,7 @@ QTensorImpl* get_qtensorimpl(const TensorBase& self) {
return static_cast<QTensorImpl*>(self.unsafeGetTensorImpl());
}
static int64_t get_sub_byte_tensor_size(IntArrayRef sizes, size_t dtype_itemsize, at::ScalarType t) {
int64_t get_sub_byte_tensor_size(IntArrayRef sizes, size_t dtype_itemsize, at::ScalarType t) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int64_t element_per_byte;
switch(t) {
@ -178,7 +178,7 @@ Tensor PerTensorAffineQuantizer::quantize(const Tensor& rtensor) {
return qtensor;
}
static void per_tensor_affine_dequantize_impl(
void per_tensor_affine_dequantize_impl(
Tensor& rtensor,
const Tensor& qtensor,
const double scale,
@ -228,7 +228,7 @@ Tensor PerChannelAffineQuantizer::quantize(const Tensor& rtensor) {
return qtensor;
}
static void per_channel_affine_dequantize_impl(
void per_channel_affine_dequantize_impl(
Tensor& rtensor,
const Tensor& qtensor,
const Tensor& scale,
@ -278,7 +278,7 @@ Tensor PerChannelAffineFloatQParamsQuantizer::quantize(const Tensor& rtensor) {
return qtensor;
}
static void per_channel_affine_float_q_params_dequantize_impl(
void per_channel_affine_float_q_params_dequantize_impl(
Tensor& rtensor,
const Tensor& qtensor,
const Tensor& scale,

View File

@ -22,9 +22,6 @@ class VulkanImplRegistrar {
};
at::Tensor& vulkan_copy_(at::Tensor& self, const at::Tensor& src);
namespace native {
bool is_vulkan_available();
}// namespace native
} // namespace vulkan
} // namespace at

View File

@ -17,7 +17,7 @@
namespace F = torch::nn::functional;
static F::PadFuncOptions::mode_t _get_pad_mode_from_conv_padding_mode(
F::PadFuncOptions::mode_t _get_pad_mode_from_conv_padding_mode(
torch::nn::detail::conv_padding_mode_t conv_padding_mode) {
F::PadFuncOptions::mode_t pad_mode;
if (c10::get_if<torch::enumtype::kReflect>(&conv_padding_mode)) {

View File

@ -28,7 +28,7 @@ namespace nn {
/// https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnRNNMode_t
enum class CuDNNMode { RNN_RELU = 0, RNN_TANH = 1, LSTM = 2, GRU = 3 };
static CuDNNMode get_cudnn_mode_for_rnn(
CuDNNMode get_cudnn_mode_for_rnn(
detail::RNNOptionsBase::rnn_options_base_mode_t mode) {
if (c10::get_if<enumtype::kRNN_RELU>(&mode)) {
return CuDNNMode::RNN_RELU;
@ -43,7 +43,7 @@ static CuDNNMode get_cudnn_mode_for_rnn(
}
}
static Tensor apply_permutation(
Tensor apply_permutation(
const Tensor& tensor,
const Tensor& permutation,
int64_t dim = 1) {
@ -397,8 +397,8 @@ template class RNNImplBase<RNNImpl>;
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RNN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
static detail::RNNOptionsBase::rnn_options_base_mode_t
compute_rnn_options_base_mode(RNNOptions::nonlinearity_t nonlinearity) {
detail::RNNOptionsBase::rnn_options_base_mode_t compute_rnn_options_base_mode(
RNNOptions::nonlinearity_t nonlinearity) {
if (c10::get_if<enumtype::kTanh>(&nonlinearity)) {
return torch::kRNN_TANH;
} else if (c10::get_if<enumtype::kReLU>(&nonlinearity)) {

View File

@ -187,7 +187,7 @@ std::tuple<double, Tensor> LBFGS::_directional_evaluate(
return std::make_tuple(loss, flat_grad);
}
static double _cubic_interpolate(
double _cubic_interpolate(
double x1,
double f1,
double g1,
@ -236,7 +236,7 @@ using Function = std::function<std::tuple<double, Tensor>(
const std::vector<Tensor>& x,
double t,
const Tensor& d)>;
static std::tuple<double, Tensor, double, int64_t> _strong_wolfe(
std::tuple<double, Tensor, double, int64_t> _strong_wolfe(
const Function& obj_func,
const std::vector<Tensor>& x,
double t,

View File

@ -13,7 +13,7 @@
namespace c10d {
static ProcessGroup::BackendType strToBackendType(std::string backend) {
ProcessGroup::BackendType strToBackendType(std::string backend) {
if (backend == "undefined") {
return ProcessGroup::BackendType::UNDEFINED;
} else if (backend == "gloo") {
@ -29,7 +29,7 @@ static ProcessGroup::BackendType strToBackendType(std::string backend) {
}
}
static std::string backendTypeToStr(ProcessGroup::BackendType backendType) {
std::string backendTypeToStr(ProcessGroup::BackendType backendType) {
switch (backendType) {
case ProcessGroup::BackendType::UNDEFINED:
return "undefined";

View File

@ -2596,7 +2596,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::alltoall_base(
return work;
}
static at::Tensor& checkSingleTensor(std::vector<at::Tensor>& tensors) {
at::Tensor& checkSingleTensor(std::vector<at::Tensor>& tensors) {
if (tensors.size() != 1) {
TORCH_CHECK(false, "ProcessGroupGloo::send takes a single tensor");
}
@ -2610,7 +2610,7 @@ static at::Tensor& checkSingleTensor(std::vector<at::Tensor>& tensors) {
return tensor;
}
static uint32_t checkTag(int32_t tag) {
uint32_t checkTag(int32_t tag) {
TORCH_CHECK(tag >= 0, "Tag must be nonnegative");
return (uint32_t)tag;
}

View File

@ -9,7 +9,7 @@ namespace quantization {
// TODO: The kernels are copied from fbgemm_gpu, we should dedup them later
static void FloatToBFloat16Quantized_ref(
void FloatToBFloat16Quantized_ref(
const float* const input,
const size_t nrows,
const size_t ncols,
@ -26,7 +26,7 @@ static void FloatToBFloat16Quantized_ref(
}
}
static void BFloat16QuantizedToFloat_ref(
void BFloat16QuantizedToFloat_ref(
const at::BFloat16* const input,
const size_t nrows,
const size_t ncols,

View File

@ -41,7 +41,7 @@ std::unordered_map<std::string, worker_id_t> collectNames(
return nameToId;
}
static std::vector<std::string> splitString(
std::vector<std::string> splitString(
const std::string& s,
const std::string& delim) {
std::vector<std::string> tokens;
@ -154,7 +154,7 @@ const string storeKeyActiveCallCount = "ACTIVE_CALLS";
const string storeKeyReady = "READY";
static std::atomic<int> barrierId(0);
static std::tuple<std::string, std::string, std::string> getNextKeyIds() {
std::tuple<std::string, std::string, std::string> getNextKeyIds() {
barrierId++;
std::string processCountKey =
fmt::format("{}{}{}", storeKeyProcessCount, storeKeyBarrierId, barrierId);

View File

@ -7,7 +7,7 @@ namespace torch {
namespace distributed {
namespace rpc {
static std::string fromVecToString(const std::vector<char>& vec) {
std::string fromVecToString(const std::vector<char>& vec) {
return std::string(vec.begin(), vec.end());
}

View File

@ -1,25 +1,25 @@
#include <c10/macros/Export.h>
#include <ittnotify.h>
#include <torch/csrc/itt_wrapper.h>
#include <torch/csrc/profiler/stubs/base.h>
namespace torch {
namespace profiler {
__itt_domain* _itt_domain = __itt_domain_create("PyTorch");
bool itt_is_available() {
TORCH_API bool itt_is_available() {
return torch::profiler::impl::ittStubs()->enabled();
}
void itt_range_push(const char* msg) {
TORCH_API void itt_range_push(const char* msg) {
__itt_string_handle* hsMsg = __itt_string_handle_create(msg);
__itt_task_begin(_itt_domain, __itt_null, __itt_null, hsMsg);
}
void itt_range_pop() {
TORCH_API void itt_range_pop() {
__itt_task_end(_itt_domain);
}
void itt_mark(const char* msg) {
TORCH_API void itt_mark(const char* msg) {
__itt_string_handle* hsMsg = __itt_string_handle_create(msg);
__itt_task_begin(_itt_domain, __itt_null, __itt_null, hsMsg);
__itt_task_end(_itt_domain);

View File

@ -1,13 +1,12 @@
#ifndef PROFILER_ITT_H
#define PROFILER_ITT_H
#include <c10/macros/Export.h>
namespace torch {
namespace profiler {
TORCH_API bool itt_is_available();
TORCH_API void itt_range_push(const char* msg);
TORCH_API void itt_range_pop();
TORCH_API void itt_mark(const char* msg);
bool itt_is_available();
void itt_range_push(const char* msg);
void itt_range_pop();
void itt_mark(const char* msg);
} // namespace profiler
} // namespace torch

View File

@ -55,7 +55,7 @@ T& toGraphFunctionImpl(F& function) {
} // namespace
static void placeholderCreator(GraphFunction&) {
void placeholderCreator(GraphFunction&) {
throw RecursiveMethodCallError();
}

View File

@ -163,7 +163,7 @@ void Module::to(at::Device device, bool non_blocking) {
to_impl(device, /*dtype=*/c10::nullopt, non_blocking);
}
static void module_state_to(
void module_state_to(
const autograd::Variable& variable,
const c10::optional<at::Device>& device,
const c10::optional<at::ScalarType>& dtype,

View File

@ -53,8 +53,7 @@ bool hasFusionBackend(at::Device::Type backend_type) {
return getFusionBackends().count(backend_type);
}
static const FusedKernelConstructor& getConstructor(
at::Device::Type backend_type) {
const FusedKernelConstructor& getConstructor(at::Device::Type backend_type) {
std::lock_guard<std::mutex> guard(fusionBackendLock());
return getFusionBackends().at(backend_type);
}

View File

@ -190,7 +190,7 @@ static void compressContiguous(
// Launches the requested fusion on the given device with the given inputs.
// Output pointers are stored in outputs (to be put on the stack later).
static void launchFusion(
void launchFusion(
const FusedKernel& fusion,
const at::Device device,
const at::ArrayRef<at::Tensor>& inputs,

View File

@ -12,7 +12,7 @@ namespace torch::jit {
// Transforms a Loop that has both a trip count specified and a loop
// body condition so that the iter count is no longer specified
// and it is recognizable as a python while loop.
static void canonicalizeModifiedLoop(Node* n) {
void canonicalizeModifiedLoop(Node* n) {
LoopView loop(n);
if (loop.loopType() != LoopView::ModifiedLoop) {
return;
@ -48,7 +48,7 @@ static void canonicalizeModifiedLoop(Node* n) {
loop.bodyBlock()->insertOutput(0, new_condition);
}
static void canonicalizeModifiedLoops(Block* block) {
void canonicalizeModifiedLoops(Block* block) {
for (Node* n : block->nodes()) {
for (Block* b : n->blocks()) {
canonicalizeModifiedLoops(b);

View File

@ -522,7 +522,7 @@ struct ExitTransformer {
std::shared_ptr<Graph> graph_;
};
static bool inlineConsecutiveIfs(Node* node) {
bool inlineConsecutiveIfs(Node* node) {
if (node->kind() != prim::If || node->next()->kind() != prim::If) {
return false;
}
@ -605,7 +605,7 @@ static bool inlineConsecutiveIfs(Node* node) {
// return 1
// else:
// return 2
static void inlineConsecutiveIfs(Block* block) {
void inlineConsecutiveIfs(Block* block) {
for (auto it = block->nodes().begin(), end = block->nodes().end();
it != end;) {
for (Block* b : it->blocks()) {

View File

@ -30,7 +30,7 @@ void InlineBlockBeforeNode(Node* before_node, Block* block) {
// <body>
// BlockExit(continue_condition, loop_carried_block*)
// }
static void inlineLoopCondition(Node* n) {
void inlineLoopCondition(Node* n) {
Block* body_block = n->blocks().at(0);
auto pre_header = n->blocks().at(1);
@ -45,7 +45,7 @@ static void inlineLoopCondition(Node* n) {
n->eraseBlock(1);
}
static void inlineLoopCondition(Block* block) {
void inlineLoopCondition(Block* block) {
for (Node* n : block->nodes()) {
for (Block* b : n->blocks()) {
inlineLoopCondition(b);

View File

@ -187,7 +187,7 @@ struct CondValue {
};
enum NoneStatus { ALWAYS, MAYBE, NEVER };
static NoneStatus canBeNone(Value* v) {
NoneStatus canBeNone(Value* v) {
if (v->node()->mustBeNone()) {
return ALWAYS;
}
@ -5605,7 +5605,7 @@ std::vector<Function*> CompilationUnit::define(
self);
}
static void eraseListLiterals(std::shared_ptr<Graph>& graph) {
void eraseListLiterals(std::shared_ptr<Graph>& graph) {
DepthFirstGraphNodeIterator it(graph);
for (auto next_node = it.next(); next_node != nullptr;) {

View File

@ -548,6 +548,17 @@ MatchedSchema matchSchema(
throw ErrorReport(loc) << failure_messages.str();
}
MatchedSchema matchSchema(
const ::c10::FunctionSchema& schema,
const SourceRange& loc,
Graph& graph,
at::ArrayRef<Value*> args,
at::ArrayRef<NamedValue> kwargs) {
std::vector<NamedValue> named_args =
fmap(args, [](Value* v) { return NamedValue(v); });
return matchSchema(schema, loc, graph, named_args, kwargs);
}
static std::string prefixLine(
const std::string& str,
const std::string& prefix) {

View File

@ -110,7 +110,7 @@ void TracingState::delValue(const IValue& var) {
Value* getValueTrace(const IValue& var) {
return getTracingState()->getValue(var);
}
static Value* getOptTensorValueTrace(const c10::optional<at::Tensor>& var) {
Value* getOptTensorValueTrace(const c10::optional<at::Tensor>& var) {
return getValueTrace(IValue(var));
}
Value* TracingState::getValue(const IValue& var) {
@ -783,6 +783,19 @@ void addInputs(
n->addInput(list_node->output());
}
void addInputs(
Node* n,
const char* name,
c10::optional<caffe2::TypeMeta> opt_dtype) {
if (opt_dtype.has_value()) {
return addInputs(n, name, at::typeMetaToScalarType(*opt_dtype));
} else {
Graph* g = n->owningGraph();
Value* none = g->insertNode(g->createNone())->output();
n->addInput(none);
}
}
void addInputs(Node* n, const char* name, at::IntArrayRef value) {
using ArgumentStash = jit::tracer::ArgumentStash;
std::vector<Value*> info = ArgumentStash::hasIntArrayRef(name)
@ -1049,7 +1062,7 @@ void ArgumentStash::stashValue(
// Stack trace recording
////////////////////////////////////////////////////////////////////////////////
// no python present so we just do not record source information
static void defaultRecordSourceLocation(Node* n) {}
void defaultRecordSourceLocation(Node* n) {}
std::atomic<decltype(&defaultRecordSourceLocation)> record_source_location(
defaultRecordSourceLocation);
void recordSourceLocation(Node* n) {
@ -1059,7 +1072,7 @@ void setRecordSourceLocation(void (*v)(Node*)) {
record_source_location.store(v);
}
static std::vector<StackEntry> defaultPythonCallstack() {
std::vector<StackEntry> defaultPythonCallstack() {
return std::vector<StackEntry>();
}
std::atomic<decltype(&defaultPythonCallstack)> python_callstack_fn(
@ -1071,7 +1084,7 @@ void setPythonCallstack(std::vector<StackEntry> (*v)()) {
python_callstack_fn.store(v);
}
static void defaultWarn(const std::string& str) {
void defaultWarn(const std::string& str) {
TORCH_WARN(str);
}
std::atomic<warn_fn_type> warn_callback{defaultWarn};

View File

@ -8,13 +8,13 @@
namespace torch::jit {
static bool insertableTensor(const at::Tensor& ten) {
bool insertableTensor(const at::Tensor& ten) {
// bail if tensor has no storage i.e. opaque tensor used in MKLdnn.
// or gradients because we have no way of serializing them & are mutable
return !ten.requires_grad() && ten.has_storage() && !ten.is_nested();
}
static bool insertableIValue(const IValue& ivalue) {
bool insertableIValue(const IValue& ivalue) {
if (ivalue.isInt() || ivalue.isNone() || ivalue.isBool() ||
ivalue.isDouble() || ivalue.isComplexDouble() || ivalue.isString() ||
ivalue.isDevice() || ivalue.isEnum()) {

View File

@ -122,15 +122,13 @@ static std::ostream& printValueRefs(
// Can't make these two overloads directly a template, it'll be ambiguous with
// the global printer for operator<<.
static std::ostream& operator<<(
std::ostream& operator<<(
std::ostream& out,
const at::ArrayRef<const Value*> nodes) {
return printValueRefs(out, nodes);
}
static std::ostream& operator<<(
std::ostream& out,
const at::ArrayRef<Value*> nodes) {
std::ostream& operator<<(std::ostream& out, const at::ArrayRef<Value*> nodes) {
return printValueRefs(out, nodes);
}
@ -143,7 +141,7 @@ struct const_value_list_with_types {
: values(values), delim(std::move(delim_)) {}
};
static std::ostream& operator<<(
std::ostream& operator<<(
std::ostream& out,
const const_value_list_with_types& l) {
size_t i = 0;
@ -969,7 +967,7 @@ void Value::replaceAllUsesDominatedByNodeWith(
uses_.end());
}
static size_t findArgument(
size_t findArgument(
const FunctionSchema& the_schema,
const std::string& unqualName) {
for (const auto i : c10::irange(the_schema.arguments().size())) {
@ -982,7 +980,7 @@ static size_t findArgument(
std::string("Couldn't find an argument called ") + unqualName);
}
static size_t findArgument(const FunctionSchema& the_schema, Symbol name) {
size_t findArgument(const FunctionSchema& the_schema, Symbol name) {
const auto unqualName = name.toUnqualString();
return findArgument(the_schema, unqualName);
}
@ -2049,7 +2047,7 @@ void inlineCallStackOfNode(
Node* to_replace,
c10::optional<ModuleInstanceInfo> m_info);
static void inlineCallStackOfBlock(
void inlineCallStackOfBlock(
Block* b,
std::unordered_map<InlinedCallStack*, InlinedCallStackPtr>& new_cs_entries,
Function* callee,

View File

@ -14,7 +14,7 @@
namespace torch {
namespace jit {
static std::unordered_map<std::string, int64_t>& passes_to_current_counter() {
std::unordered_map<std::string, int64_t>& passes_to_current_counter() {
static std::unordered_map<std::string, int64_t> passes_to_current_counter;
return passes_to_current_counter;
}

View File

@ -95,7 +95,7 @@ uint64_t _get_model_bytecode_version(
return _get_model_bytecode_version_from_bytes(data.get(), size);
}
static uint64_t _get_model_bytecode_version_zip(
uint64_t _get_model_bytecode_version_zip(
std::shared_ptr<ReadAdapterInterface> rai) {
if (!check_zip_file(rai)) {
TORCH_CHECK(

View File

@ -238,7 +238,7 @@ std::map<std::string, at::Tensor> mobile_module_to_parameter_map(
"' in deserialized mobile::Module");
}
static std::map<std::string, at::Tensor> _load_parameters_bytes(
std::map<std::string, at::Tensor> _load_parameters_bytes(
std::shared_ptr<char> data,
size_t size,
c10::optional<at::Device> device) {

View File

@ -316,7 +316,7 @@ c10::IValue Method::operator()(std::vector<c10::IValue> stack) const {
return stack.front();
}
static c10::optional<std::string> print_type(const c10::Type& t) {
c10::optional<std::string> print_type(const c10::Type& t) {
auto namedType = t.cast<c10::NamedType>();
if (namedType && namedType->name()) {
return namedType->name().value().qualifiedName();

View File

@ -4,8 +4,7 @@ namespace torch {
namespace jit {
namespace mobile {
static std::unordered_map<std::string, std::function<void(Stack&)>>&
primOpsFnTable() {
std::unordered_map<std::string, std::function<void(Stack&)>>& primOpsFnTable() {
static std::unordered_map<std::string, std::function<void(Stack&)>>
prim_ops_fn;
return prim_ops_fn;

View File

@ -21,7 +21,6 @@ class SGDParamState {
return std::make_unique<SGDParamState>(
static_cast<const SGDParamState&>(*this));
}
friend bool operator==(const SGDParamState& lhs, const SGDParamState& rhs);
~SGDParamState() = default;
};

View File

@ -5,7 +5,7 @@
namespace torch {
namespace jit {
static void AnnotateWarns(Block* b) {
void AnnotateWarns(Block* b) {
static std::atomic<int64_t> idx(0);
for (Node* n : b->nodes()) {
for (Block* child_b : n->blocks()) {

View File

@ -83,7 +83,7 @@ c10::AliasAnalysisKind aliasAnalysisIsSpecialCase() {
// Tunable parameter. Set to something larger if it turns out to be better.
static constexpr size_t min_fusion_size = 4;
static bool have_same_shape(at::TensorList inputs) {
bool have_same_shape(at::TensorList inputs) {
auto expected_sizes = inputs[0].sizes();
return (std::all_of(
inputs.begin(), inputs.end(), [expected_sizes](const at::Tensor& t) {
@ -91,19 +91,17 @@ static bool have_same_shape(at::TensorList inputs) {
}));
}
static bool should_be_transposed(at::TensorList inputs) {
bool should_be_transposed(at::TensorList inputs) {
return (std::all_of(inputs.begin(), inputs.end(), [](const at::Tensor& t) {
return t.stride(0) == 1 && t.stride(1) == t.size(0);
}));
}
static std::vector<at::Tensor> transpose_inputs(at::TensorList inputs) {
std::vector<at::Tensor> transpose_inputs(at::TensorList inputs) {
return fmap(inputs, [](const at::Tensor& i) { return i.t(); });
}
static bool shape_is_fast_for_reduce(
const at::Tensor& lhs,
const at::Tensor& rhs) {
bool shape_is_fast_for_reduce(const at::Tensor& lhs, const at::Tensor& rhs) {
size_t l = lhs.size(0);
size_t m = lhs.size(1);
size_t r = rhs.size(1);
@ -253,7 +251,7 @@ struct TreeToken {
enum class Side { LHS, RHS };
static void BatchMMTreeReduce(Block* block, AliasDb& alias_db) {
void BatchMMTreeReduce(Block* block, AliasDb& alias_db) {
auto graph = block->owningGraph();
// Look for trees in the block
@ -318,7 +316,7 @@ static void BatchMMTreeReduce(Block* block, AliasDb& alias_db) {
}
}
static bool shape_is_fast_for_side(const at::Tensor& other_side_input) {
bool shape_is_fast_for_side(const at::Tensor& other_side_input) {
// Cutoff chosed by benchmarking on a TITAN V
return other_side_input.numel() <= 1024 * 2048;
}
@ -370,7 +368,7 @@ RegisterOperators mm_batch_side_reg({Operator(
},
aliasAnalysisIsSpecialCase())});
static std::pair<std::vector<Node*>, std::vector<Node*>> gatherIndependentMMUses(
std::pair<std::vector<Node*>, std::vector<Node*>> gatherIndependentMMUses(
Value* value,
AliasDb& alias_db) {
const auto postprocess = [&](std::vector<Node*> mms) {
@ -415,7 +413,7 @@ static std::pair<std::vector<Node*>, std::vector<Node*>> gatherIndependentMMUses
postprocess(std::move(lhses)), postprocess(std::move(rhses)));
}
static void BatchMMSide(Block* block, AliasDb& alias_db) {
void BatchMMSide(Block* block, AliasDb& alias_db) {
// NB: 8 is the current loop unrolling factor
static constexpr size_t how_many_is_many = 8;
const auto batch_side = [&](std::vector<Node*>& mms, Side side) {
@ -464,7 +462,7 @@ static void BatchMMSide(Block* block, AliasDb& alias_db) {
}
}
static bool hasMutableOperators(Block* block) {
bool hasMutableOperators(Block* block) {
for (auto n : block->nodes()) {
if (n->kind().is_aten() && n->schema().is_mutable())
return true;
@ -476,7 +474,7 @@ static bool hasMutableOperators(Block* block) {
return false;
}
static bool hasMMOperators(std::shared_ptr<Graph>& graph) {
bool hasMMOperators(std::shared_ptr<Graph>& graph) {
DepthFirstGraphNodeIterator it(graph);
Node* n = nullptr;
while ((n = it.next()) != nullptr) {

View File

@ -51,7 +51,7 @@ std::shared_ptr<Graph> Canonicalize(
}
// Which index in b's owning Node is b
static size_t blockIndex(const Block* b) {
size_t blockIndex(const Block* b) {
auto n = b->owningNode();
AT_ASSERT(n);
for (size_t i = 0; i < n->blocks().size(); ++i) {
@ -73,7 +73,7 @@ static size_t blockIndex(const Block* b) {
* NB: this is not a topological index. Topologically, two nodes in
* different blocks of an if node are not topologically < or > each other.
*/
static bool isBefore(Node* n1, Node* n2) {
bool isBefore(Node* n1, Node* n2) {
// Invalid to call with the same node as both args
AT_ASSERT(n1 != n2);
@ -122,7 +122,7 @@ static bool isBefore(Node* n1, Node* n2) {
}
}
static bool isBefore(const Use& a, const Use& b) {
bool isBefore(const Use& a, const Use& b) {
// If two uses are the same node, we order on offset
if (a.user == b.user) {
return a.offset < b.offset;
@ -131,7 +131,7 @@ static bool isBefore(const Use& a, const Use& b) {
return isBefore(a.user, b.user);
}
static bool isAfter(const Use& a, const Use& b) {
bool isAfter(const Use& a, const Use& b) {
if (a.user == b.user && a.offset == b.offset) {
return false;
}
@ -157,14 +157,14 @@ c10::optional<const Use> firstOrLastUse(Value* v, bool find_first) {
return extreme_use;
}
static std::vector<c10::optional<const Use>> gatherFirstUses(
std::vector<c10::optional<const Use>> gatherFirstUses(
at::ArrayRef<Value*> values) {
return fmap(values, [&](Value* v) -> c10::optional<const Use> {
return firstOrLastUse(v, true);
});
}
static std::vector<size_t> sort_indexes(at::ArrayRef<Value*> values) {
std::vector<size_t> sort_indexes(at::ArrayRef<Value*> values) {
// initialize original index locations
std::vector<size_t> idx(values.size());
std::iota(idx.begin(), idx.end(), 0);
@ -194,17 +194,17 @@ static std::vector<size_t> sort_indexes(at::ArrayRef<Value*> values) {
return idx;
}
static void CanonicalizeLoopOutputs(Node* n) {
void CanonicalizeLoopOutputs(Node* n) {
auto new_indices = sort_indexes(n->outputs());
LoopView(n).permuteLoopCarried(new_indices);
}
static void CanonicalizeIfOutputs(Node* n) {
void CanonicalizeIfOutputs(Node* n) {
auto new_indices = sort_indexes(n->outputs());
IfView(n).permuteOutputs(new_indices);
}
static void CanonicalizeOutputs(Block* block) {
void CanonicalizeOutputs(Block* block) {
// We iterate in reverse since ordering of a node's outputs is dependent on
// the value use following it in the graph
for (Node* n : block->nodes().reverse()) {

View File

@ -22,12 +22,12 @@ bool isStrictFusion(Value* value) {
} // namespace
static bool fusionGuardCheck(Symbol k) {
bool fusionGuardCheck(Symbol k) {
return k == Symbol::prim("TensorExprDynamicGuard") || k == prim::TypeCheck ||
k == prim::CudaFusionGuard || k == prim::RequiresGradCheck;
}
static std::unordered_set<Node*> collectValuesUsedInGuard(
std::unordered_set<Node*> collectValuesUsedInGuard(
Node* guarding_if,
Node* enter_node) {
// DFS to collect
@ -58,7 +58,7 @@ static std::unordered_set<Node*> collectValuesUsedInGuard(
return visited_nodes;
}
static void checkForUnfusedOps(Node* enter_node) {
void checkForUnfusedOps(Node* enter_node) {
std::vector<Node*> unsupported_nodes;
std::vector<Node*> guarding_ifs; // if multiple, we will throw
for (Node* node = enter_node->next(); node->kind() != prim::Exit;

View File

@ -5,7 +5,7 @@
namespace torch {
namespace jit {
static void clearUndefinedness(Value* o) {
void clearUndefinedness(Value* o) {
if (o->type()->kind() == TensorType::Kind) {
o->setType(TensorType::get());
} else if (
@ -16,7 +16,7 @@ static void clearUndefinedness(Value* o) {
}
}
static void clearUndefinedness(Block* block) {
void clearUndefinedness(Block* block) {
for (auto n : block->nodes()) {
for (auto o : n->outputs()) {
clearUndefinedness(o);

View File

@ -22,7 +22,7 @@ c10::AliasAnalysisKind aliasAnalysisFromSchema() {
// helper to determine if an optional tensor argument/value passed in is
// statically defined (neither a None constant nor a Optional[Tensor] type)
// return yes, no, or no value if we can't tell
static c10::optional<bool> isDefined(Value* tensor) {
c10::optional<bool> isDefined(Value* tensor) {
if (tensor->type()->isSubtypeOf(*TensorType::get())) {
return true;
}
@ -32,7 +32,7 @@ static c10::optional<bool> isDefined(Value* tensor) {
return {};
}
static bool isDecomposableNorm(Node* normalize_op) {
bool isDecomposableNorm(Node* normalize_op) {
static const OperatorSet decomposable_normalization_ops = {
"aten::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor",
"aten::layer_norm(Tensor input, int[] normalized_shape, Tensor? weight, Tensor? bias, float eps, bool cudnn_enable) -> Tensor",
@ -85,7 +85,7 @@ RegisterOperators reg_ops(
},
aliasAnalysisFromSchema())});
static bool DecomposeOps(Block* block, CompilationUnit& decompose_funcs) {
bool DecomposeOps(Block* block, CompilationUnit& decompose_funcs) {
bool decomposed = false;
for (auto it = block->nodes().begin(), end = block->nodes().end(); it != end;
++it) {

View File

@ -9,7 +9,7 @@
namespace torch {
namespace jit {
static void SetNumTypeToTensorType(Value* v) {
void SetNumTypeToTensorType(Value* v) {
if (v->type()->isSubtypeOf(*NumberType::get())) {
v->setType(TensorType::fromNumberType(*v->type()));
} else if (v->type()->isSubtypeOf(*BoolType::get())) {

View File

@ -34,7 +34,7 @@ c10::optional<IValue> getIValue(
return toIValue(getValue(name, match_vmap, vmap));
}
static std::unordered_map<std::string, c10::IValue> getConvParams(
std::unordered_map<std::string, c10::IValue> getConvParams(
const Match& match,
const std::unordered_map<std::string, Value*>& vmap) {
std::unordered_map<std::string, c10::IValue> calc_values;

View File

@ -36,7 +36,7 @@ namespace jit {
// %n =
// prim::GetAttr[name="{prefix}.name1{...}.name(n-1)._packed_params"][%self]
//
static void hoistConvPackedParams(
void hoistConvPackedParams(
Module& rootModule,
Node* getConvPackedParamsNode,
const std::string& prefix,

View File

@ -4,7 +4,7 @@
namespace torch {
namespace jit {
static void InlineForkWait(
void InlineForkWait(
Block* b,
std::unordered_map<Value*, Value*>& future_remap) {
auto nodes = b->nodes();

View File

@ -16,7 +16,7 @@ namespace jit {
// subgraph, replace the context unpacking value with the new graph input.
// fork(foo) ->
// def foo(a, b):
static void inlineForkedClosure(Node* fork_closure, NodeKind genKind) {
void inlineForkedClosure(Node* fork_closure, NodeKind genKind) {
Node* function_context_node = fork_closure->input()->node();
if (function_context_node->inputs().size() != 2 ||
@ -58,7 +58,7 @@ static void inlineForkedClosure(Node* fork_closure, NodeKind genKind) {
runCleanupPasses(fork_graph);
}
static void inlineForkedClosures(Block* block) {
void inlineForkedClosures(Block* block) {
for (auto it = block->nodes().begin(); it != block->nodes().end();) {
Node* n = *it;
it++;

View File

@ -30,7 +30,7 @@ GraphFunction* tryToGraphFunction(Node* n) {
return nullptr;
}
static void inlineCalls(Block* block) {
void inlineCalls(Block* block) {
for (auto it = block->nodes().begin(), end = block->nodes().end();
it != end;) {
Node* cur = *it++;

View File

@ -3,7 +3,7 @@
namespace torch {
namespace jit {
static void CheckInplace(Block* block) {
void CheckInplace(Block* block) {
for (auto node : block->nodes()) {
if (node->kind() == prim::PythonOp && node->hasAttribute(attr::inplace)) {
if (node->i(attr::inplace)) {

View File

@ -16,7 +16,7 @@ namespace jit {
// closure block.
// Within the closure subgraph, the context tuple is unpacked and the unpacked
// values are used for closed over values.
static void liftClosure(Node* closure) {
void liftClosure(Node* closure) {
auto block = closure->blocks().at(0);
auto subgraph = std::make_shared<Graph>();
// closures/forks can be nested, so use closure owning graph
@ -56,7 +56,7 @@ static void liftClosure(Node* closure) {
runCleanupPasses(closure->g(attr::Subgraph));
}
static void liftClosures(Block* block) {
void liftClosures(Block* block) {
for (auto it = block->nodes().begin(); it != block->nodes().end();) {
Node* n = *it;
it++;

View File

@ -21,7 +21,7 @@ struct Slot {
// parameters/attributes with extra_ivalue input Slots that hold what value to
// pass into the graph. Used for ONNX export to remove first-class modules
// so it can deal purely with parameters and inputs
static std::pair<std::shared_ptr<Graph>, std::vector<Slot>> lower_graph(
std::pair<std::shared_ptr<Graph>, std::vector<Slot>> lower_graph(
const ModulePtr& self,
Graph& g_,
size_t self_offset = 0) {

View File

@ -240,12 +240,31 @@ void metalFusePrePackedConvWithClamp(script::Module& module) {
fuseHardtanhWithPackedOps(graph);
}
static void metalRemoveMutation(script::Module& module) {
void metalInsertCopyOps(script::Module& module) {
auto graph = module.get_method("forward").graph();
auto&& outputs = graph->outputs();
for (const auto i : c10::irange(outputs.size())) {
Value* output = outputs[i];
auto namedValue = NamedValue("", output);
if (namedValue.type()->kind() == TypeKind::TensorType) {
// find the insertion point
WithInsertPoint ip(output->node()->next());
Value* replaced_output = graph->insert(
Symbol::fromQualString("metal::copy_to_host"), {namedValue});
// replaced the output
graph->block()->replaceOutput(i, replaced_output);
}
}
SubgraphRewriter rewriter;
rewriter.runOnGraph(graph);
}
void metalRemoveMutation(script::Module& module) {
auto graph = module.get_method("forward").graph();
RemoveTensorMutation(graph);
}
static void metalRunCanonicalOptimizations(script::Module& module) {
void metalRunCanonicalOptimizations(script::Module& module) {
auto graph = module.get_method("forward").graph();
runOptimization(graph, false /* no loop unrolling */);
}

View File

@ -21,7 +21,7 @@ GraphPassNameType registerPostPass(GraphPass p) {
return graphPassID++;
}
static GraphPassNameType registerPass(GraphPass p) {
GraphPassNameType registerPass(GraphPass p) {
return registerPostPass(std::move(p));
}

View File

@ -332,7 +332,7 @@ struct PeepholeOptimizeImpl {
bool shape_peepholes_;
};
static bool FuseAddMM(Block* block) {
bool FuseAddMM(Block* block) {
bool changed = false;
for (Node* node : block->nodes()) {
// XXX: remember that if you want to simplify an expression by combining

View File

@ -15,7 +15,7 @@
namespace torch {
namespace jit {
static c10::optional<size_t> normalizeIndex(int64_t index, size_t len) {
c10::optional<size_t> normalizeIndex(int64_t index, size_t len) {
if (index < 0) {
index = index + len;
}

View File

@ -1,5 +1,4 @@
#include <torch/csrc/jit/passes/peephole.h>
#include <torch/csrc/jit/passes/peephole_non_tensor.h>
#include <ATen/core/jit_type.h>
#include <c10/util/irange.h>

View File

@ -168,7 +168,7 @@ void FoldQuantizedPrepackingOps(Module& module) {
PrePackingOpsFolder(module, filter_fn, "quantized");
}
static std::unordered_set<std::string> RegisterPrePackingParams(
std::unordered_set<std::string> RegisterPrePackingParams(
Module& module,
const std::string& method_name) {
auto filter_fn = [](const Node* n) -> bool {

View File

@ -253,7 +253,7 @@ bool matchCallFuncToUse(
// Check any use of `v` matches the aten function call
// or CallFunction patterns
static bool matchArgPattern(
bool matchArgPattern(
Value* v,
const AtenFuncArgs& aten_func_args,
const CallFuncArgs& call_func_args) {
@ -395,8 +395,7 @@ std::vector<Value*> getPassThroughInputs(Value* v) {
return {};
}
static std::vector<NodeKind> toAtenSymbol(
const std::vector<std::string>& func_names) {
std::vector<NodeKind> toAtenSymbol(const std::vector<std::string>& func_names) {
std::vector<NodeKind> symbols;
std::transform(
func_names.begin(),
@ -406,18 +405,18 @@ static std::vector<NodeKind> toAtenSymbol(
return symbols;
}
static bool isAtenFunc(Node* n, const std::vector<NodeKind>& aten_funcs) {
bool isAtenFunc(Node* n, const std::vector<NodeKind>& aten_funcs) {
return std::find(aten_funcs.begin(), aten_funcs.end(), n->kind()) !=
aten_funcs.end();
}
static bool isAtenFunc(Node* n, const std::vector<std::string>& aten_funcs) {
bool isAtenFunc(Node* n, const std::vector<std::string>& aten_funcs) {
const auto& symbols = toAtenSymbol(aten_funcs);
return isAtenFunc(n, symbols);
}
// TODO: factor out isCallFunc
static bool isFunctionNode(
bool isFunctionNode(
Node* n,
const std::vector<std::string>& call_funcs,
const std::vector<std::string>& aten_funcs) {
@ -670,7 +669,7 @@ bool is_int_constant(
return v && v->isInt() && v->toInt() == value;
}
static bool is_functional(
bool is_functional(
const Match& match,
const std::unordered_map<std::string, Value*>& vmap,
const std::string& vname,
@ -694,7 +693,7 @@ c10::optional<std::string> getModuleName(Value* value) {
return c10::nullopt;
}
static bool is_module(
bool is_module(
const Match& match,
const std::unordered_map<std::string, Value*>& vmap,
const std::string& vname,

View File

@ -282,7 +282,7 @@ QuantFusionInfo getObservedQParamOpFusionInfo(
} // namespace
static std::vector<QuantFusionInfo> quant_fusion_pattern_and_replacements() {
std::vector<QuantFusionInfo> quant_fusion_pattern_and_replacements() {
// aten::conv1d
std::string conv1d = R"(
graph(%a_quant, %packed_params, %r_scale, %r_zero_point, %r_dtype, %stride, %padding, %dilation, %groups):
@ -1105,8 +1105,7 @@ graph(%packed_params, %a):
};
}
static std::vector<QuantFusionInfo>
dynamic_quant_fusion_pattern_and_replacements() {
std::vector<QuantFusionInfo> dynamic_quant_fusion_pattern_and_replacements() {
std::string linear_dynamic = R"(
graph(%packed_params, %a, %reduce_range, %a_dtype):
%a_scale : float, %a_zero_point : int = aten::_choose_qparams_per_tensor(%a, %reduce_range)
@ -1143,7 +1142,7 @@ graph(%packed_params, %a):
};
}
static std::vector<QuantFusionInfo> linear_prepack_unpack_patterns() {
std::vector<QuantFusionInfo> linear_prepack_unpack_patterns() {
std::string linear_with_quant = R"(
graph(%a_dequant, %w_quant, %b):
%w_dequant = aten::dequantize(%w_quant)
@ -1179,7 +1178,7 @@ graph(%w, %a_dq, %b):
};
}
static std::vector<QuantFusionInfo> conv_prepack_unpack_patterns() {
std::vector<QuantFusionInfo> conv_prepack_unpack_patterns() {
std::string conv1d_with_quant = R"(
graph(%a_dequant, %w_quant, %b, %stride, %padding, %dilation, %groups):
%w_dequant = aten::dequantize(%w_quant)

View File

@ -7,7 +7,7 @@
namespace torch {
namespace jit {
static bool certainlyThrows(Block* block) {
bool certainlyThrows(Block* block) {
for (Node* n : block->nodes()) {
if (n->kind() == prim::RaiseException) {
return true;
@ -16,7 +16,7 @@ static bool certainlyThrows(Block* block) {
return false;
}
static void EliminateExceptions(Block* block) {
void EliminateExceptions(Block* block) {
auto graph = block->owningGraph();
Value* false_const = graph->insertConstant(IValue(false));
Value* true_const = graph->insertConstant(IValue(true));

View File

@ -75,7 +75,7 @@ Node* MutationRemover::createSpecialMappedOp(Node* n) {
return new_node;
}
static bool removableSetItem(Node* n) {
bool removableSetItem(Node* n) {
if (n->kind() != aten::_set_item ||
n->input(1)->node()->kind() != prim::Constant) {
return false;

View File

@ -1,5 +1,4 @@
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/remove_redundant_profiles.h>
#include <torch/csrc/jit/ir/alias_analysis.h>
#include <torch/csrc/jit/ir/ir_views.h>

View File

@ -50,7 +50,7 @@ bool mergeTypes(
return changed;
}
static void applyTypes(ArrayRef<Value*> src, ArrayRef<Value*> dst) {
void applyTypes(ArrayRef<Value*> src, ArrayRef<Value*> dst) {
AT_ASSERT(src.size() == dst.size());
for (const auto i : c10::irange(src.size())) {
dst[i]->setType(src[i]->type());

View File

@ -103,7 +103,7 @@ struct ShapeArg
}
};
static std::ostream& operator<<(std::ostream& out, const ShapeArg& sa) {
std::ostream& operator<<(std::ostream& out, const ShapeArg& sa) {
if (auto val = sa.asConstantInt()) {
out << *val;
} else if (auto ss = sa.asShapeSymbol()) {
@ -149,7 +149,7 @@ struct ShapeArguments {
std::vector<ShapeArg> maybe_shape_symbols_;
};
static std::ostream& operator<<(std::ostream& os, const ShapeArguments& sa) {
std::ostream& operator<<(std::ostream& os, const ShapeArguments& sa) {
if (!sa.has_dim()) {
os << "(UNKNOWN DIM)";
return os;
@ -176,7 +176,7 @@ bool symbolicShapeAnalysisTestModeEnabled() {
using SSArgument = c10::variant<ShapeArguments, IValue>;
static std::ostream& operator<<(std::ostream& out, const SSArgument& sa) {
std::ostream& operator<<(std::ostream& out, const SSArgument& sa) {
if (const IValue* iv = c10::get_if<IValue>(&sa)) {
out << *iv;
} else {

View File

@ -20,7 +20,7 @@ namespace jit {
// Inserts the Compute for Each Symbolic Shape in the TensorExpr Graph
// and returns back a map from Symbolic Shape Value to its runtime Value *
static std::map<int64_t, Value*> InsertSymbolicShapesCompute(
std::map<int64_t, Value*> InsertSymbolicShapesCompute(
const ShapeComputeGraphMapping& shape_mapping,
Node* tensorexpr_graph) {
WithInsertPoint guard(tensorexpr_graph);
@ -140,7 +140,7 @@ inline StrideInput summarizeStrideDim(
}
}
static std::vector<StrideInput> summarizeInputStrides(const TensorType& tt) {
std::vector<StrideInput> summarizeInputStrides(const TensorType& tt) {
auto strides = *tt.strides().concrete_sizes();
auto sizes = *tt.sizes().concrete_sizes();
if (c10::is_contiguous_strides(sizes, strides)) {
@ -158,7 +158,7 @@ static std::vector<StrideInput> summarizeInputStrides(const TensorType& tt) {
};
// Todo: incorporate in codegen
static StrideInput summarizeOutputStrides(const TensorType& tt) {
StrideInput summarizeOutputStrides(const TensorType& tt) {
auto strides = *tt.strides().concrete_sizes();
auto sizes = *tt.sizes().concrete_sizes();
// We only try to maintain output striding for channels last tensors,
@ -178,7 +178,7 @@ static StrideInput summarizeOutputStrides(const TensorType& tt) {
// Also summarize input striding behavior. The Size information is stored on the
// type, The striding is returned. See StrideInput for description of stride
// specializations
static c10::optional<std::vector<std::vector<StrideInput>>>
c10::optional<std::vector<std::vector<StrideInput>>>
TryGeneralizeInputDimensionsToSymbolicShapes(
std::shared_ptr<Graph> tensorexpr_graph) {
std::map<size_t, int64_t> shape_to_sym_shape;
@ -212,7 +212,7 @@ TryGeneralizeInputDimensionsToSymbolicShapes(
return input_striding;
}
static void moveConstantTensorsOutOfSubgraph(
void moveConstantTensorsOutOfSubgraph(
Node* tensorexpr_graph_node,
std::shared_ptr<Graph> tensorexpr_graph) {
auto parent = tensorexpr_graph_node->owningGraph();
@ -304,7 +304,7 @@ bool GenerateGuard(Node* tensorexpr_graph_node, bool add_composed_op) {
return true;
}
static void inlineFallbackGraphAndAddSRCopyOutOp(std::shared_ptr<Graph> graph) {
void inlineFallbackGraphAndAddSRCopyOutOp(std::shared_ptr<Graph> graph) {
DepthFirstGraphNodeIterator it(graph);
Node* n = nullptr;
@ -495,7 +495,7 @@ void insertDynamicShapesGuard(
// tensors
// Note: this logic is meant to reflect the invocation of the TE Kernel
// and `runWithAllocatedOutputs` in tensorexpr_fuser.cpp
static Operation StaticRuntimeCopyOuts(const Node* node) {
Operation StaticRuntimeCopyOuts(const Node* node) {
auto num_ten_inputs = node->inputs().size();
return [num_ten_inputs](Stack& stack) {
std::vector<IValue> inputs = pop(stack, num_ten_inputs);
@ -721,7 +721,7 @@ void runTensorExprDynamicGroup(const Code& code, Stack& stack) {
interpreter.run(stack);
}
static Operation createTensorExprDynamicGroup(const Node* node) {
Operation createTensorExprDynamicGroup(const Node* node) {
const auto& graph = node->g(attr::Subgraph);
Code code(graph, "");
// This implementation creates a Code object and InterpreterState on every

View File

@ -43,7 +43,7 @@ namespace jit {
static bool texpr_reductions_enabled = false;
static bool isSupportedForBlock(Node* node) {
bool isSupportedForBlock(Node* node) {
switch (node->kind()) {
case aten::add:
case aten::mul:
@ -187,7 +187,7 @@ bool texprReductionsEnabled() {
return texpr_reductions_enabled;
}
static void removeProfileNodesAndSpecializeTypes(Block* b) {
void removeProfileNodesAndSpecializeTypes(Block* b) {
for (auto it = b->nodes().begin(); it != b->nodes().end(); it++) {
if (it->kind() == prim::profile) {
GRAPH_DEBUG("Removing prim::profile: %", it->output()->debugName());
@ -275,7 +275,7 @@ bool hasTensorTypeSpecialization(Value* v) {
return true;
}
static void removeTensorTypeSpecialization(Value* v) {
void removeTensorTypeSpecialization(Value* v) {
if (hasTensorTypeSpecialization(v)) {
v->setType(TensorType::get());
}
@ -1364,7 +1364,7 @@ void FuseTensorExprs(
GRAPH_DUMP("After TExprFuser: ", graph);
}
static Operation createTensorExprOp(const Node* node) {
Operation createTensorExprOp(const Node* node) {
bool dynamic_shape_fusion_node =
node->hasAttribute(attr::striding_inputs_desc);
if (!dynamic_shape_fusion_node) {

View File

@ -6,7 +6,7 @@
namespace torch {
namespace jit {
static void UpdateDifferentiableGraphRequiresGrad(
void UpdateDifferentiableGraphRequiresGrad(
Block* block,
c10::optional<bool> new_requires_grad) {
for (Node* n : block->nodes()) {

View File

@ -227,7 +227,7 @@ void unmergeSubgraph(Node* subgraphNode) {
subgraphNode->destroy();
}
static void collectNestedUses(
void collectNestedUses(
std::unordered_set<Value*>& closed_over_values,
std::unordered_set<Value*>& new_values,
std::unordered_map<Value*, Value*>& externalValuesMap,
@ -271,7 +271,7 @@ static void collectNestedUses(
}
}
static std::unordered_set<Value*> closedOverValues(
std::unordered_set<Value*> closedOverValues(
Node* toMerge,
std::unordered_map<Value*, Value*>& externalValuesMap) {
std::unordered_set<Value*> closed_over_values;
@ -602,7 +602,7 @@ void unmergeNode(Node* n, Node* subgraphNode) {
n->destroy();
}
static std::string truncateStrWithHash(const std::string& s, size_t maxlen) {
std::string truncateStrWithHash(const std::string& s, size_t maxlen) {
if (s.size() <= maxlen) {
return s;
}

View File

@ -399,12 +399,12 @@ void vulkanFoldPrePackingOps(script::Module& m) {
PrePackingOpsFolder(m, filter_fn, "prepack_folding");
}
static void vulkanRemoveMutation(script::Module& module) {
void vulkanRemoveMutation(script::Module& module) {
auto graph = module.get_method("forward").graph();
RemoveTensorMutation(graph);
}
static void vulkanRunCanonicalOptimizations(script::Module& module) {
void vulkanRunCanonicalOptimizations(script::Module& module) {
auto graph = module.get_method("forward").graph();
for (const auto& method : module.get_methods()) {
auto method_graph = method.graph();

View File

@ -22,13 +22,19 @@ namespace jit {
using value_map = std::unordered_map<Value*, Value*>;
using value_set = std::unordered_set<Value*>;
void wrapDim(int64_t& dim, const std::vector<int64_t>& sizes) {
if (dim < 0) {
dim += sizes.size();
}
}
// need_trim_grad_ops contains functions that return multiple outputs in
// forward, but only the first one requires grad.
// Example:
// kthvalue returns (kthvalue, index of kthvalue), currently autodiff only
// supports at most one output that requires grad. Thus we need to remove
// the grad for index that doesn't require grad.
static bool needTrimGrad(Node* n) {
bool needTrimGrad(Node* n) {
static OperatorSet need_trim_grad_ops = {
"aten::kthvalue(Tensor self, int k, int dim, bool keepdim) -> (Tensor, Tensor)",
"aten::topk(Tensor self, int k, int dim, bool largest, bool sorted) -> (Tensor, Tensor)",
@ -829,7 +835,7 @@ static void lambdaLiftReverse(Gradient& grad_desc, ReverseDetails& rev_info) {
reverse_block->owningNode()->destroy();
}
static void packReturnValuesIntoTuple(const std::shared_ptr<Graph>& graph) {
void packReturnValuesIntoTuple(const std::shared_ptr<Graph>& graph) {
auto returnNode = graph->block()->return_node();
WithInsertPoint wip(returnNode);
auto tuple = graph->insertNode(graph->createTuple(returnNode->inputs()));

View File

@ -70,7 +70,7 @@ void loadDecompositionFunctions() {
} // anonymous namespace
static void DecomposeOp(Node* n) {
void DecomposeOp(Node* n) {
auto schema = n->maybeSchema();
if (!schema) {
return;
@ -89,7 +89,7 @@ static void DecomposeOp(Node* n) {
n->destroy();
}
static void RunDecompositions(Block* block) {
void RunDecompositions(Block* block) {
for (auto it = block->nodes().begin(); it != block->nodes().end();) {
Node* n = *it;
it++; // advance iterator bc the current node may be destroyed

View File

@ -5,7 +5,7 @@
namespace torch {
namespace jit {
static std::ostream& operator<<(std::ostream& out, OpCode op) {
std::ostream& operator<<(std::ostream& out, OpCode op) {
switch (op) {
#define OP_STRING(x, _) \
case x: \
@ -27,7 +27,7 @@ char const* toString(OpCode op) {
return nullptr;
}
static const char* OpInfo(OpCode op) {
const char* OpInfo(OpCode op) {
switch (op) {
#define OP_INFO(x, info) \
case x: \

View File

@ -95,7 +95,6 @@ std::ostream& operator<<(std::ostream& out, Instruction inst);
bool isOpSupportedInMobile(OpCode op);
char const* toString(OpCode op);
OpCode parseOpCode(const char* str);
std::ostream& operator<<(std::ostream& out, Instruction inst);
} // namespace jit

View File

@ -83,7 +83,7 @@ static std::atomic<bool> profiling_mode{true};
static std::mutex fusion_strategy_lock;
static FusionStrategy getInitialStrategy() {
FusionStrategy getInitialStrategy() {
if (FLAGS_torch_jit_always_dynamic) {
return {{FusionBehavior::DYNAMIC, 12}};
}
@ -245,7 +245,7 @@ static C10_UNUSED void setRequiresGradOnDiffGraph(Node* dnode) {
}
}
static bool guardDifferentiableGraph(Node* dnode) {
bool guardDifferentiableGraph(Node* dnode) {
auto gi = dnode->g(attr::Subgraph)->inputs();
bool all_inputs_seen = true;
for (const auto i : c10::irange(gi.size())) {
@ -323,7 +323,7 @@ void runNooptPassPipeline(std::shared_ptr<Graph>& graph) {
"After EliminateDeadCode (end of runNooptPassPipeline)\n", *graph);
}
static void runPreAutodiffPassPipeline(std::shared_ptr<Graph>& graph) {
void runPreAutodiffPassPipeline(std::shared_ptr<Graph>& graph) {
GRAPH_DEBUG(
"Before InsertGuards (beginning of runPreAutodiffPassPipeline)\n",
*graph);
@ -700,7 +700,7 @@ GraphExecutorState ProfilingGraphExecutorImpl::getDebugState() {
return state;
}
static Node* insertFallbackFunctionCall(
Node* insertFallbackFunctionCall(
Graph* graph,
GraphFunction* func,
ArrayRef<Value*> inputs) {
@ -721,7 +721,7 @@ static Node* insertFallbackFunctionCall(
return fun_unpack_tuple;
}
static GraphFunction* createFallbackPathFunction(
GraphFunction* createFallbackPathFunction(
Block* b,
const std::string& function_name) {
auto value_map = [](Value* v) { return v; };

Some files were not shown because too many files have changed in this diff.