[2/N] Turn inline static functions into static (#140068)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/140068
Approved by: https://github.com/ezyang
commit 032135f8a2 (parent 3b8470c461)
Author: cyy
Date: 2024-11-09 03:31:24 +00:00
Committed by: PyTorch MergeBot
7 changed files with 33 additions and 40 deletions
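For context on the change applied throughout this diff: at namespace scope in a .cpp file, static alone already gives a function internal linkage, and the extra inline keyword is only a non-binding optimization hint that compilers apply by their own heuristics anyway, so dropping it does not change the meaning of the code. A minimal sketch (hypothetical function names):

    // Both functions have internal linkage and compile to the same code.
    inline static int twice_before(int x) { return 2 * x; }  // style removed by this PR
    static int twice_after(int x) { return 2 * x; }          // style adopted by this PR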

View File

@@ -267,7 +267,7 @@ struct Dist {
   // This does a backward pass down a Vec column of the input
   template <typename F>
-  inline static void backward_down_column_pdist(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) {
+  static void backward_down_column_pdist(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) {
     for (const scalar_t * const self_end = self_i + m * n; self_i != self_end - m; self_i += m, res_i += m) {
       const Vec self_vec_i = Vec::loadu(self_i, count);
@@ -391,7 +391,7 @@ struct Dist {
   }
   template <typename F>
-  inline static void backward_down_column_cdist(const scalar_t * t1, const scalar_t * t2, scalar_t * res, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t r1, int64_t r2, int64_t m, int64_t d, int64_t gs, int64_t l1_size, int64_t l2_size, int64_t count = Vec::size()) {
+  static void backward_down_column_cdist(const scalar_t * t1, const scalar_t * t2, scalar_t * res, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t r1, int64_t r2, int64_t m, int64_t d, int64_t gs, int64_t l1_size, int64_t l2_size, int64_t count = Vec::size()) {
     const scalar_t * t1_end = t1 + l1_size;
     const scalar_t * t2_end = t2 + l2_size;

View File

@@ -238,16 +238,9 @@ void ldl_solve_cusolver(
 #if defined(USE_LINALG_SOLVER)
-inline static Tensor column_major_identity_matrix_like(const Tensor& self) {
-  auto size = self.sizes();
-  auto size_slice = IntArrayRef(size.data(), size.size()-1);
-  return at::ones(size_slice, self.options()).diag_embed().mT();
-}
 // call cusolver gesvd function to calculate svd
 template<typename scalar_t>
-inline static void apply_svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void apply_svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv,
     const bool calculate_all_batches,
     const std::vector<int64_t>& batches
@@ -319,7 +312,7 @@ inline static void apply_svd_cusolver_gesvd(const Tensor& A, const Tensor& U, co
 }
 // We'll copy A inside svd_cusolver_gesvd
-inline static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv,
     const bool calculate_all_batches = true,
     const std::vector<int64_t>& batches = {}
@@ -356,7 +349,7 @@ inline static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Te
 // call cusolver gesvdj function to calculate svd
 template<typename scalar_t>
-inline static void apply_svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void apply_svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv) {
   using value_t = typename c10::scalar_value_type<scalar_t>::type;
   int m = cuda_int_cast(A.size(-2), "m");
@@ -430,7 +423,7 @@ inline static void apply_svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, c
 // wrapper around apply_svd_cusolver_gesvdj that handles dtype dispatch
 // note that gesvdj returns V, which is what we want
 // Need to pass a copy of A, since A will be rewritten inside the function call
-inline static void svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) {
+static void svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) {
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "svd_cuda_gesvdj", [&] {
     apply_svd_cusolver_gesvdj<scalar_t>(A, U, S, V, infos, full_matrices, compute_uv);
   });
@@ -438,7 +431,7 @@ inline static void svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const T
 // call cusolver gesvdj batched function to calculate svd
 template<typename scalar_t>
-inline static void apply_svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void apply_svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool compute_uv
 ) {
   using value_t = typename c10::scalar_value_type<scalar_t>::type;
@@ -481,7 +474,7 @@ inline static void apply_svd_cusolver_gesvdjBatched(const Tensor& A, const Tenso
   TORCH_CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params));
 }
-inline static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) {
+static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) {
   auto m = A.size(-2);
   auto n = A.size(-1);
   auto k = std::min(m, n);
@@ -520,7 +513,7 @@ inline static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U,
 }
 template<typename scalar_t>
-inline static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv) {
 #ifndef CUDART_VERSION
   TORCH_CHECK(false, "gesvda: Batched version is supported only with cuBLAS backend.")
@@ -577,7 +570,7 @@ inline static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, cons
 }
 // We'll copy A inside svd_cusolver_gesvdaStridedBatched
-inline static void svd_cusolver_gesvdaStridedBatched(
+static void svd_cusolver_gesvdaStridedBatched(
     const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv) {
   // We need to pass a copy of A, as it will be overwritten
@@ -716,7 +709,7 @@ void svd_cusolver(const Tensor& A,
 // Implementation of Cholesky decomposition using looped cusolverDn<T>potrf or cusolverDnXpotrf (64-bit)
 template<typename scalar_t>
-inline static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_working_copy, bool upper, const Tensor& infos) {
+static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_working_copy, bool upper, const Tensor& infos) {
   auto handle = at::cuda::getCurrentCUDASolverDnHandle();
   const auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
   const int64_t n = self_working_copy.size(-1);
@@ -785,7 +778,7 @@ inline static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_worki
 // Warning: cusolverDn<T>potrfBatched doesn't work quite well when matrix size or batch size is zero.
 // If you write your own C++ extension and use this function, make sure you do a zero numel check for the input.
 template<typename scalar_t>
-inline static void apply_cholesky_cusolver_potrfBatched(const Tensor& self_working_copy, bool upper, const Tensor& infos) {
+static void apply_cholesky_cusolver_potrfBatched(const Tensor& self_working_copy, bool upper, const Tensor& infos) {
   auto handle = at::cuda::getCurrentCUDASolverDnHandle();
   const auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
   const int n = cuda_int_cast(self_working_copy.size(-1), "n");
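The zero-numel warning in the hunk above can be honored with a small guard before dispatching to the batched routine; a sketch under assumed names (not a PyTorch API):

    #include <ATen/ATen.h>

    // Returns false for inputs that cusolverDn<T>potrfBatched handles badly,
    // i.e. zero matrix size or zero batch size.
    static bool potrf_batched_input_ok(const at::Tensor& input) {
      return input.numel() != 0;
    }

A caller would check this predicate before invoking the batched Cholesky path.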
@@ -820,7 +813,7 @@ void cholesky_helper_cusolver(const Tensor& input, bool upper, const Tensor& inf
 template<typename scalar_t>
-inline static void apply_cholesky_cusolver_potrs(Tensor& self_working_copy, const Tensor& A_column_major_copy, bool upper, Tensor& infos) {
+static void apply_cholesky_cusolver_potrs(Tensor& self_working_copy, const Tensor& A_column_major_copy, bool upper, Tensor& infos) {
   auto handle = at::cuda::getCurrentCUDASolverDnHandle();
   const auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
   const int64_t n = self_working_copy.size(-2);
@@ -876,7 +869,7 @@ inline static void apply_cholesky_cusolver_potrs(Tensor& self_working_copy, cons
 // This code path is only dispatched to if MAGMA is not linked in the pytorch build.
 // cusolverDn<t>potrsBatched only supports nrhs == 1
 template<typename scalar_t>
-inline static void apply_cholesky_cusolver_potrsBatched(Tensor& self_working_copy, const Tensor& A_column_major_copy, bool upper, Tensor& infos) {
+static void apply_cholesky_cusolver_potrsBatched(Tensor& self_working_copy, const Tensor& A_column_major_copy, bool upper, Tensor& infos) {
   auto handle = at::cuda::getCurrentCUDASolverDnHandle();
   const auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
   const int64_t n = self_working_copy.size(-2);
@@ -1147,7 +1140,7 @@ void ormqr_cusolver(const Tensor& input, const Tensor& tau, const Tensor& other,
 For further details, please see the cuSOLVER documentation for ORGQR and UNGQR.
 */
 template <typename scalar_t>
-inline static void apply_orgqr(Tensor& self, const Tensor& tau) {
+static void apply_orgqr(Tensor& self, const Tensor& tau) {
   auto self_data = self.data_ptr<scalar_t>();
   auto tau_data = tau.const_data_ptr<scalar_t>();
   auto self_matrix_stride = matrixStride(self);

View File

@@ -2456,7 +2456,7 @@ Call this whenever a new thread is created in order to propagate values from
 // Checks that the _C shared library isn't initialized multiple times. This
 // can happen if the same csrc files are compiled into multiple shared
 // libraries.
-inline static void pytorch_duplicate_guard() {
+static void pytorch_duplicate_guard() {
   static int initialized = 0;
   if (initialized) {
     fmt::print(stderr, "pytorch: _C shared library re-initialized\n");
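The guard above relies on a function-local static that persists for the life of the process; a self-contained sketch of the same pattern (hypothetical names, plain stdio in place of fmt):

    #include <cstdio>
    #include <cstdlib>

    static void duplicate_guard() {
      static int initialized = 0;  // one flag per loaded definition
      if (initialized) {
        std::fprintf(stderr, "shared library re-initialized\n");
        std::abort();  // fail fast instead of running with duplicated state
      }
      initialized = 1;
    }

If the same translation unit is linked into two shared libraries and the dynamic linker interposes both to a single definition, the second initialization trips the guard.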

View File

@@ -1134,7 +1134,7 @@ void Engine::evaluate_function(
   }
 }
-inline static uint64_t compute_min_topological_nr(const edge_list& outputs) {
+static uint64_t compute_min_topological_nr(const edge_list& outputs) {
   // Computes the mininum topological number among all the outputs
   if (outputs.empty()) {
     return 0;
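The function body continues past this hunk; its effect is a minimum over the outputs' topological numbers. A sketch under stand-in types (Node and Edge here are illustrative, not the real autograd classes):

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <vector>

    struct Node { uint64_t topological_nr; };  // stand-in for torch::autograd::Node
    struct Edge { Node* function; };           // stand-in for torch::autograd::Edge
    using edge_list = std::vector<Edge>;

    static uint64_t min_topological_nr_sketch(const edge_list& outputs) {
      if (outputs.empty()) {
        return 0;  // matches the early return shown in the diff
      }
      uint64_t min_topo_nr = std::numeric_limits<uint64_t>::max();
      for (const auto& output : outputs) {
        min_topo_nr = std::min(min_topo_nr, output.function->topological_nr);
      }
      return min_topo_nr;
    }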

View File

@@ -162,7 +162,7 @@ c10::intrusive_ptr<at::ivalue::Future> PythonEngine::execute_with_graph_task(
 PyObject* THPEngineClass = nullptr;
-inline static Edge parseGradientEdge(PyObject* obj, int64_t index) {
+static Edge parseGradientEdge(PyObject* obj, int64_t index) {
   PyObject* grad_fn = PyTuple_GetItem(obj, 0);
   auto output_nr = THPUtils_unpackLong(PyTuple_GetItem(obj, 1));
   std::shared_ptr<torch::autograd::Node> grad_fn_sp;

View File

@@ -18,7 +18,7 @@ static int active_dynamo_threads = 0;
 static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT;
-inline static PyObject* eval_frame_callback_get(void) {
+static PyObject* eval_frame_callback_get(void) {
   void* result = PyThread_tss_get(&eval_frame_callback_key);
   if (unlikely(result == NULL)) {
     return (PyObject*)Py_None;
@@ -27,7 +27,7 @@ inline static PyObject* eval_frame_callback_get(void) {
   }
 }
-inline static void eval_frame_callback_set(PyObject* obj) {
+static void eval_frame_callback_set(PyObject* obj) {
   PyThread_tss_set(&eval_frame_callback_key, obj);
 }
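The getter/setter pair above is the standard CPython thread-specific-storage pattern; a minimal standalone sketch (illustrative names, error handling omitted):

    #include <Python.h>

    static Py_tss_t callback_key = Py_tss_NEEDS_INIT;

    // Must run once (e.g. during module init) before the key is used.
    static int callback_tss_init(void) {
      return PyThread_tss_create(&callback_key);  // 0 on success
    }

    static PyObject* callback_get(void) {
      void* result = PyThread_tss_get(&callback_key);
      // An unset per-thread slot reads back as NULL: report "no callback".
      return result == NULL ? Py_None : (PyObject*)result;
    }

    static void callback_set(PyObject* obj) {
      PyThread_tss_set(&callback_key, obj);  // the stored value is per-thread
    }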
@@ -186,7 +186,7 @@ static PyObject* dynamo_custom_eval_frame_shim(THP_EVAL_API_FRAME_OBJECT* frame,
 }
 #endif
-inline static PyObject* dynamo_eval_frame_default(
+static PyObject* dynamo_eval_frame_default(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     int throw_flag) {
@@ -205,7 +205,7 @@ inline static PyObject* dynamo_eval_frame_default(
 #endif
 }
-inline static void enable_eval_frame_shim(PyThreadState* tstate) {
+static void enable_eval_frame_shim(PyThreadState* tstate) {
 #if PY_VERSION_HEX >= 0x03090000
   if (_PyInterpreterState_GetEvalFrameFunc(tstate->interp) !=
       &dynamo_custom_eval_frame_shim) {
@@ -222,7 +222,7 @@ inline static void enable_eval_frame_shim(PyThreadState* tstate) {
 #endif
 }
-inline static void enable_eval_frame_default(PyThreadState* tstate) {
+static void enable_eval_frame_default(PyThreadState* tstate) {
 #if PY_VERSION_HEX >= 0x03090000
   if (_PyInterpreterState_GetEvalFrameFunc(tstate->interp) !=
       previous_eval_frame) {
@@ -240,13 +240,13 @@ inline static void enable_eval_frame_default(PyThreadState* tstate) {
 }
-inline static const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) {
+static const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) {
   // Returns the C string name of the current frame.
   DEBUG_CHECK(PyUnicode_Check(F_CODE(frame)->co_name));
   return PyUnicode_AsUTF8(F_CODE(frame)->co_name);
 }
-static inline PyObject* dynamo_call_callback(
+static PyObject* dynamo_call_callback(
     PyObject* callable,
     THP_EVAL_API_FRAME_OBJECT* _frame,
     PyObject* locals,
@@ -277,7 +277,7 @@ static inline PyObject* dynamo_call_callback(
   return res;
 }
-static inline void clear_old_frame_if_python_312_plus(
+static void clear_old_frame_if_python_312_plus(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame) {
 #if IS_PYTHON_3_12_PLUS
@@ -288,7 +288,7 @@ static inline void clear_old_frame_if_python_312_plus(
 #endif
 }
-inline static PyObject* dynamo_eval_custom_code_impl(
+static PyObject* dynamo_eval_custom_code_impl(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     PyCodeObject* code,
@@ -467,7 +467,7 @@ inline static PyObject* dynamo_eval_custom_code_impl(
 }
 // This wrapper function adds a profiler event
-inline static PyObject* dynamo_eval_custom_code(
+static PyObject* dynamo_eval_custom_code(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     PyCodeObject* code,
@@ -725,8 +725,8 @@ typedef struct THPPyInterpreterFrame {
   _PyInterpreterFrame* frame; // Borrowed reference
 } THPPyInterpreterFrame;
-inline static void enable_eval_frame_shim(PyThreadState* tstate) {}
-inline static void enable_eval_frame_default(PyThreadState* tstate) {}
+static void enable_eval_frame_shim(PyThreadState* tstate) {}
+static void enable_eval_frame_default(PyThreadState* tstate) {}
 static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL};

View File

@@ -756,7 +756,7 @@ static PyObject* assert_size_stride(PyObject* dummy, PyObject* args) {
 }
 template <typename T>
-inline static void unwrap_size_tuple(PyObject* obj, T& output) {
+static void unwrap_size_tuple(PyObject* obj, T& output) {
   TORCH_CHECK(PyTuple_CheckExact(obj));
   size_t len = PyTuple_GET_SIZE(obj);
   output.reserve(len);
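The loop that fills the container falls outside this hunk; a sketch of the whole helper under the same preconditions (assumed continuation, not the verbatim source):

    #include <Python.h>
    #include <vector>

    template <typename T>
    static void unwrap_size_tuple_sketch(PyObject* obj, T& output) {
      // Caller guarantees obj is an exact tuple of small non-negative ints.
      Py_ssize_t len = PyTuple_GET_SIZE(obj);
      output.reserve(static_cast<size_t>(len));
      for (Py_ssize_t i = 0; i < len; ++i) {
        Py_ssize_t value = PyLong_AsSsize_t(PyTuple_GET_ITEM(obj, i));
        output.push_back(value);
      }
    }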
@@ -768,7 +768,7 @@ inline static void unwrap_size_tuple(PyObject* obj, T& output) {
 }
 template <typename T>
-inline static void _parse_empty_strided_args(
+static void _parse_empty_strided_args(
     PyObject* args,
     T& sizes,
     T& strides,
@@ -783,7 +783,7 @@ inline static void _parse_empty_strided_args(
   dtype = reinterpret_cast<THPDtype*>(py_dtype)->scalar_type;
 }
-inline static PyObject* _empty_strided_device(
+static PyObject* _empty_strided_device(
     PyObject* dummy,
     PyObject* args,
     c10::DeviceType device_type) {