[Reland] [5/N] Change static functions in headers to inline (#131010)
Reland of #130673
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131010
Approved by: https://github.com/Skylion007
parent d6ae8bbf16
commit 7c90a82970
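The change itself is mechanical: function definitions that live in headers drop their static specifier (or trade static for inline), so each function keeps a single definition that the linker can merge across translation units rather than one internal-linkage copy per file that includes the header. A minimal sketch of that distinction follows; the header and function names are hypothetical and not taken from this diff.

// example.h -- hypothetical header, not part of the PyTorch diff below
#pragma once

// Internal linkage: every translation unit that includes this header gets its
// own copy of the function, with a distinct address, duplicated object code,
// and possible -Wunused-function warnings in files that never call it.
static inline int answer_static() {
  return 42;
}

// External linkage + inline: the ODR allows an identical definition in every
// including translation unit, and the linker folds them into one entity.
inline int answer_inline() {
  return 42;
}

For namespace-scope functions defined in a header, inline alone already lets the definition appear in every including file, so dropping the redundant static avoids the per-translation-unit copies without changing what the functions in the diff do.
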
@@ -432,73 +432,73 @@ class TORCH_API Context {
 
 TORCH_API Context& globalContext();
 
-static inline void init() {
+inline void init() {
   globalContext();
 }
 
 TORCH_API Allocator* getCPUAllocator();
 
-static inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
+inline DeprecatedTypeProperties& getDeprecatedTypeProperties(
     Backend p,
     ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       p, s);
 }
 
-static inline DeprecatedTypeProperties& CPU(ScalarType s) {
+inline DeprecatedTypeProperties& CPU(ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       Backend::CPU, s);
 }
 
-static inline DeprecatedTypeProperties& CUDA(ScalarType s) {
+inline DeprecatedTypeProperties& CUDA(ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       Backend::CUDA, s);
 }
 
-static inline DeprecatedTypeProperties& HIP(ScalarType s) {
+inline DeprecatedTypeProperties& HIP(ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       Backend::HIP, s);
 }
 
-static inline DeprecatedTypeProperties& MPS(ScalarType s) {
+inline DeprecatedTypeProperties& MPS(ScalarType s) {
   return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
       Backend::MPS, s);
 }
 
-static inline bool hasCUDA() {
+inline bool hasCUDA() {
   return globalContext().hasCUDA();
 }
 
-static inline bool hasMTIA() {
+inline bool hasMTIA() {
   return globalContext().hasMTIA();
 }
 
-static inline bool hasHIP() {
+inline bool hasHIP() {
   return globalContext().hasHIP();
 }
 
-static inline bool hasIPU() {
+inline bool hasIPU() {
   return globalContext().hasIPU();
 }
 
-static inline bool hasXLA() {
+inline bool hasXLA() {
   return globalContext().hasXLA();
 }
 
-static inline bool hasMPS() {
+inline bool hasMPS() {
   return globalContext().hasMPS();
 }
 
-static inline bool hasMAIA() {
+inline bool hasMAIA() {
   return globalContext().hasMAIA();
 }
 
-static inline bool hasXPU() {
+inline bool hasXPU() {
   return globalContext().hasXPU();
 }
 
 // Despite its name, this function returns the number of *CUDA* GPUs.
-static inline size_t getNumGPUs() {
+inline size_t getNumGPUs() {
   // WARNING: DO NOT ADD LOGIC TO HANDLE OTHER DEVICE TYPES TO THIS
   // FUNCTION. If you are interested in interrogating the number of
   // devices for a specific device type, add that function to the

@@ -517,27 +517,27 @@ static inline size_t getNumGPUs() {
   }
 }
 
-static inline bool hasOpenMP() {
+inline bool hasOpenMP() {
   return globalContext().hasOpenMP();
 }
 
-static inline bool hasMKL() {
+inline bool hasMKL() {
   return globalContext().hasMKL();
 }
 
-static inline bool hasLAPACK() {
+inline bool hasLAPACK() {
   return globalContext().hasLAPACK();
 }
 
-static inline bool hasMAGMA() {
+inline bool hasMAGMA() {
   return globalContext().hasMAGMA();
 }
 
-static inline bool hasMKLDNN() {
+inline bool hasMKLDNN() {
   return globalContext().hasMKLDNN();
 }
 
-static inline void manual_seed(uint64_t seed) {
+inline void manual_seed(uint64_t seed) {
   auto gen = globalContext().defaultGenerator(c10::DeviceType::CPU);
   {
     // See Note [Acquire lock when using random generators]

@@ -499,7 +499,7 @@ inline Tensor sum_to(
   return _sum_to(std::move(tensor), shape, always_return_non_view);
 }
 
-static inline bool is_expandable_to(
+inline bool is_expandable_to(
     SymIntArrayRef shape,
     c10::SymIntArrayRef desired) {
   size_t ndim = shape.size();

@@ -517,7 +517,7 @@ static inline bool is_expandable_to(
   return true;
 }
 
-static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
+inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) {
   auto sym_shape = c10::SymIntArrayRef(
       reinterpret_cast<const c10::SymInt*>(shape.data()), shape.size());
   auto sym_desired = c10::SymIntArrayRef(

@@ -34,13 +34,13 @@ namespace at {
 _(!=, x.ne(y), y.ne(x))
 
 #define DEFINE_OPERATOR(op, body, reverse_scalar_body) \
-  static inline Tensor operator op(const Tensor& x, const Tensor& y) { \
+  inline Tensor operator op(const Tensor& x, const Tensor& y) { \
     return body; \
   } \
-  static inline Tensor operator op(const Tensor& x, const Scalar& y) { \
+  inline Tensor operator op(const Tensor& x, const Scalar& y) { \
    return body; \
   } \
-  static inline Tensor operator op(const Scalar& x, const Tensor& y) { \
+  inline Tensor operator op(const Scalar& x, const Tensor& y) { \
     return reverse_scalar_body; \
   }
 

@@ -113,12 +113,12 @@
 
 namespace at::tracer::impl {
 
-static inline bool is_dispatch_enabled() {
+inline bool is_dispatch_enabled() {
   return c10::impl::tls_is_dispatch_key_included(at::DispatchKey::Tracer) &&
       !c10::impl::tls_is_dispatch_key_excluded(at::DispatchKey::Tracer);
 }
 
-static inline void set_dispatch_enabled(bool enabled) {
+inline void set_dispatch_enabled(bool enabled) {
   TORCH_INTERNAL_ASSERT(
       !c10::impl::tls_is_dispatch_key_excluded(at::DispatchKey::Tracer),
       "Cannot enable tracing within the scope of NoTracerDispatchMode!");

@@ -29,7 +29,7 @@ TORCH_API int _crash_if_asan(int);
 // Converts a TensorList (i.e. ArrayRef<Tensor> to vector of TensorImpl*)
 // NB: This is ONLY used by legacy TH bindings, and ONLY used by cat.
 // Once cat is ported entirely to ATen this can be deleted!
-static inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
+inline std::vector<TensorImpl*> checked_dense_tensor_list_unwrap(
     ArrayRef<Tensor> tensors,
     const char* name,
     int pos,

@@ -13,7 +13,7 @@ namespace at {
 
 constexpr size_t dim_bitset_size = 64;
 
-static inline std::bitset<dim_bitset_size> dim_list_to_bitset(
+inline std::bitset<dim_bitset_size> dim_list_to_bitset(
     OptionalIntArrayRef opt_dims,
     size_t ndims) {
   TORCH_CHECK(

@@ -18,7 +18,7 @@ TORCH_API std::ostream& print(
     std::ostream& stream,
     const Tensor& tensor,
     int64_t linesize);
-static inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
+inline std::ostream& operator<<(std::ostream & out, const Tensor & t) {
   return print(out,t,80);
 }
 TORCH_API void print(const Tensor & t, int64_t linesize=80);

@@ -93,7 +93,7 @@ torch::jit::Stack boxArgs(Args... args) {
 }
 
 template <class T>
-static inline constexpr size_t boxed_size_one() {
+inline constexpr size_t boxed_size_one() {
   static_assert(!std::is_same<std::decay_t<T>, c10::TensorOptions>::value, "need to patch this path to support TensorOptions passed by reference");
   return 1;
 }

@@ -159,7 +159,7 @@ struct Atomic##NAME##IntegerImpl<T, 8> {
 
 
 # define GPU_ATOMIC_INTEGER(NAME, OP, DTYPE) \
-static inline __device__ void gpuAtomic##NAME(DTYPE *address, DTYPE val) { \
+inline __device__ void gpuAtomic##NAME(DTYPE *address, DTYPE val) { \
   Atomic##NAME##IntegerImpl<DTYPE, sizeof(DTYPE)>()(address, \
       val, \
       [](DTYPE a, DTYPE b) { \

@@ -171,7 +171,7 @@ ATOMIC_INTEGER_IMPL(Add)
 GPU_ATOMIC_INTEGER(Add, a || b, bool)
 
 // Don't instantiate gpuAtomicAdd with the macro as it seems non-standard (see int32, int64)
-static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) {
+inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) {
   AtomicAddIntegerImpl<uint8_t, sizeof(uint8_t)>()(address,
       val,
       [](uint8_t a, uint8_t b) {

@@ -179,7 +179,7 @@ static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) {
       });
 }
 
-static inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) {
+inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) {
   AtomicAddIntegerImpl<int8_t, sizeof(int8_t)>()(address,
       val,
       [](int8_t a, int8_t b) {

@@ -187,7 +187,7 @@ static inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) {
       });
 }
 
-static inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) {
+inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) {
   AtomicAddIntegerImpl<int16_t, sizeof(int16_t)>()(address,
       val,
       [](int16_t a, int16_t b) {

@@ -195,11 +195,11 @@ static inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) {
       });
 }
 
-static inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) {
+inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) {
   return atomicAdd(address, val);
 }
 
-static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
+inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
 #if defined(USE_ROCM)
   __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
 #else

@@ -208,7 +208,7 @@ static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
 #endif
 }
 
-static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) {
+inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) {
 #if defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
   return AtomicFPOp<at::Half>()(address, val,
       [](at::Half hsum, at::Half val) {

@@ -219,7 +219,7 @@ static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val)
 #endif
 }
 
-static inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) {
+inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) {
 #if defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))
   return AtomicFPOp<at::BFloat16>()(address, val,
       [](at::BFloat16 bsum, at::BFloat16 val) {

@@ -233,7 +233,7 @@ return AtomicFPOp<at::BFloat16>()(address, val,
 
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
 // from CUDA C Programmic Guide
-static inline __device__ double atomicAdd(double* address, double val)
+inline __device__ double atomicAdd(double* address, double val)
 #if defined(__clang__) && defined(__CUDA__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wgcc-compat"

@@ -261,20 +261,20 @@ static inline __device__ double atomicAdd(double* address, double val)
 
 #if defined(USE_ROCM) && __hcc_workweek__ < 18312 && !__HIP__
 // This needs to be defined for the host side pass
-static inline __device__ double atomicAdd(double *address, double val) { }
+inline __device__ double atomicAdd(double *address, double val) { }
 #endif
 #endif
 
-static inline __device__ double gpuAtomicAdd(double *address, double val) {
+inline __device__ double gpuAtomicAdd(double *address, double val) {
   return atomicAdd(address, val);
 }
 
-static inline __device__ float gpuAtomicAdd(float *address, float val) {
+inline __device__ float gpuAtomicAdd(float *address, float val) {
   return atomicAdd(address, val);
 }
 
 template<typename T>
-static inline __device__ void gpuAtomicAdd(c10::complex<T> *address, c10::complex<T> val) {
+inline __device__ void gpuAtomicAdd(c10::complex<T> *address, c10::complex<T> val) {
   gpuAtomicAdd(&address->real_, val.real_);
   gpuAtomicAdd(&address->imag_, val.imag_);
 }

@@ -285,31 +285,31 @@ static inline __device__ void gpuAtomicAdd(c10::complex<T> *address, c10::comple
  * directly and require non-library provided data type support. Only for these, we
  * continue to provide atomicAdd overloads.
  */
-static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
+inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
   return gpuAtomicAdd(address, val);
 }
 
-static inline __device__ at::BFloat16 atomicAdd(at::BFloat16 *address, at::BFloat16 val) {
+inline __device__ at::BFloat16 atomicAdd(at::BFloat16 *address, at::BFloat16 val) {
   return gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) {
+inline __device__ void atomicAdd(uint8_t *address, uint8_t val) {
   gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(int8_t *address, int8_t val) {
+inline __device__ void atomicAdd(int8_t *address, int8_t val) {
   gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(int16_t *address, int16_t val) {
+inline __device__ void atomicAdd(int16_t *address, int16_t val) {
   gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(int64_t *address, int64_t val) {
+inline __device__ void atomicAdd(int64_t *address, int64_t val) {
   gpuAtomicAdd(address, val);
 }
 
-static inline __device__ void atomicAdd(bool *address, bool val) {
+inline __device__ void atomicAdd(bool *address, bool val) {
   gpuAtomicAdd(address, val);
 }

@@ -321,20 +321,20 @@ static inline __device__ void atomicAdd(bool *address, bool val) {
  * therefore we need a new API 'gpuAtomicAddNoReturn'.
  */
 template<typename T>
-static inline __device__ void gpuAtomicAddNoReturn(c10::complex<T> *address, c10::complex<T> val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(uint8_t *address, uint8_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(int8_t *address, int8_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(int16_t *address, int16_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(int32_t *address, int32_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); }
-static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(c10::complex<T> *address, c10::complex<T> val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(uint8_t *address, uint8_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(int8_t *address, int8_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(int16_t *address, int16_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(int32_t *address, int32_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); }
 
 /* Special case fp32 atomic. */
 #if defined(USE_ROCM)
-static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) {
+inline __device__ void gpuAtomicAddNoReturn(float *address, float val) {
 #if defined(__gfx908__)
   atomicAddNoRet(address, val);
 #else

@@ -342,7 +342,7 @@ static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) {
 #endif
 }
 #else
-static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); }
+inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); }
 #endif
 
 // Atomic multiplication implementation.

@@ -11,7 +11,7 @@ namespace {
 
 // checks whether index.dtype == int64
 // and self.dtype == src.dtype if src is a Tensor
-static void scatter_gather_dtype_check(
+inline void scatter_gather_dtype_check(
     const std::string& method_name,
     const Tensor& self,
     const Tensor& index,

@@ -38,7 +38,7 @@ static void scatter_gather_dtype_check(
 // Test:
 // 1. index.size(d) <= self.size(d) for all d != dim
 // 2. index.dim() == self.dim()
-static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim,
+inline void gather_shape_check(const Tensor& self, int64_t dim,
     const Tensor& index
 ) {
   auto self_dims = ensure_nonempty_dim(self.dim());

@@ -64,7 +64,7 @@ static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim,
 // 1. index.size(d) <= self.size(d) for all d != dim
 // 2. index.size(d) <= src.size(d) for all d if src is a Tensor
 // 3. index.dim() == self.dim() == src.dim()
-static C10_UNUSED void scatter_shape_check(
+inline void scatter_shape_check(
     const Tensor& self, int64_t dim, const Tensor& index,
     const std::optional<Tensor>& src_opt = std::nullopt
 ) {

@@ -18,7 +18,7 @@ using detail::GridSamplerPadding;
 // +1 --> (size - 1) + 0.5 == size - 0.5
 // scale_factor = size / 2
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) {
   if (align_corners) {
     // unnormalize coord from [-1, 1] to [0, size - 1]

@@ -34,7 +34,7 @@ scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners)
 // `grad_in`.
 // This is useful in the backward pass of grid_sampler.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size,
     bool align_corners, scalar_t *grad_in) {
   if (align_corners) {

@@ -50,7 +50,7 @@ scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size,
 
 // Clips coordinates to between 0 and clip_limit - 1
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t clip_coordinates(scalar_t in, int clip_limit) {
   return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0)));
 }

@@ -59,7 +59,7 @@ scalar_t clip_coordinates(scalar_t in, int clip_limit) {
 // it also returns the `d output / d input` via pointer argument `grad_in`.
 // This is useful in the backward pass of grid_sampler.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_in) {
   // Note that it is important for the gradient calculation that borders
   // are considered out of bounds.

@@ -82,7 +82,7 @@ scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_i
 // The bounds are passed as twice their value so that half-integer values
 // can be represented as ints.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) {
   if (twice_low == twice_high) {
     return static_cast<scalar_t>(0);

@@ -105,7 +105,7 @@ scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) {
 // `grad_in`.
 // This is useful in the backward pass of grid_sampler.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high,
     scalar_t *grad_in) {
   if (twice_low == twice_high) {

@@ -135,7 +135,7 @@ scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high
 }
 
 template<typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t safe_downgrade_to_int_range(scalar_t x){
   // -100.0 does not have special meaning. This is just to make sure
   // it's not within_bounds_2d or within_bounds_3d, and does not cause

@@ -146,7 +146,7 @@ scalar_t safe_downgrade_to_int_range(scalar_t x){
 }
 
 template<typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t compute_coordinates(scalar_t coord, int size,
     GridSamplerPadding padding_mode,
     bool align_corners) {

@@ -170,7 +170,7 @@ scalar_t compute_coordinates(scalar_t coord, int size,
 
 // Computes the pixel source index value for a grid coordinate
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t grid_sampler_compute_source_index(
     scalar_t coord,
     int size,

@@ -186,7 +186,7 @@ scalar_t grid_sampler_compute_source_index(
 // `d output / d input` via pointer argument `grad_in`.
 // This is useful in the backward pass of grid_sampler.
 template <typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t grid_sampler_compute_source_index_set_grad(
     scalar_t coord,
     int size,

@@ -215,18 +215,18 @@ scalar_t grid_sampler_compute_source_index_set_grad(
   return coord;
 }
 
-static __forceinline__ __device__
+__forceinline__ __device__
 bool within_bounds_2d(int h, int w, int H, int W) {
   return h >= 0 && h < H && w >= 0 && w < W;
 }
 
-static __forceinline__ __device__
+__forceinline__ __device__
 bool within_bounds_3d(int d, int h, int w, int D, int H, int W) {
   return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
 }
 
 template<typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 scalar_t get_value_bounded(
     const scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH,
     GridSamplerPadding padding_mode,

@@ -245,7 +245,7 @@ scalar_t get_value_bounded(
 }
 
 template<typename scalar_t, typename index_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 void safe_add_2d(scalar_t *data, int h, int w,
     int sH, int sW, int H, int W,
     scalar_t delta,

@@ -261,7 +261,7 @@ void safe_add_2d(scalar_t *data, int h, int w,
 }
 
 template<typename scalar_t, typename index_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 void safe_add_3d(scalar_t *data, int d, int h, int w,
     int sD, int sH, int sW, int D, int H, int W,
     scalar_t delta,

@@ -277,7 +277,7 @@ void safe_add_3d(scalar_t *data, int d, int h, int w,
 }
 
 template<typename scalar_t, typename index_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 void add_value_bounded(
     scalar_t* data, scalar_t x, scalar_t y, int W, int H, int sW, int sH,
     scalar_t delta,

@@ -297,7 +297,7 @@ void add_value_bounded(
 
 // Calculate the differential of the cubic convolution, i.e. `d coeff / d x`
 template<typename scalar_t>
-static __forceinline__ __device__
+__forceinline__ __device__
 void get_cubic_coefficients_grad(
     scalar_t coeffs[4],
     scalar_t t) {

@@ -21,7 +21,7 @@ constexpr int MAX_BLOCK_SIZE = 1024;
 // Maximum size per grid dimension that we assume (compute capability >= 2.0)
 constexpr int64_t MAX_GRID_SIZE = 65535LL;
 
-static bool getGridFromTiles(int64_t gridTiles, dim3& grid) {
+inline bool getGridFromTiles(int64_t gridTiles, dim3& grid) {
   if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) {
     return false;
   }

@@ -92,7 +92,7 @@ struct GlobalIndexToPerSliceIndex {
 };
 
 // Returns 2^(ceil(lg(n)) from Stanford bit twiddling hacks
-static uint64_t nextHighestPowerOf2(uint64_t n) {
+inline uint64_t nextHighestPowerOf2(uint64_t n) {
   n--;
   n |= n >> 1;
   n |= n >> 2;

@@ -72,7 +72,7 @@ __device__ inline scalar_t max(scalar_t a, scalar_t b) {
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
 template <typename accscalar_t>
-__host__ __forceinline__ static accscalar_t compute_scales_value(
+__host__ __forceinline__ accscalar_t compute_scales_value(
     const std::optional<double> scale,
     int64_t src_size,
     int64_t dst_size) {

@@ -83,7 +83,7 @@ __host__ __forceinline__ static accscalar_t compute_scales_value(
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
 template <typename accscalar_t>
-__host__ __forceinline__ static accscalar_t compute_scales_value_backwards(
+__host__ __forceinline__ accscalar_t compute_scales_value_backwards(
     const std::optional<double> scale,
     int64_t src_size,
     int64_t dst_size) {

@@ -93,7 +93,7 @@ __host__ __forceinline__ static accscalar_t compute_scales_value_backwards(
 }
 
 template <typename accscalar_t>
-__host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
+__host__ __forceinline__ accscalar_t area_pixel_compute_scale(
     int input_size,
     int output_size,
     bool align_corners,

@@ -112,7 +112,7 @@ __host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
 }
 
 template <typename accscalar_t>
-__device__ __forceinline__ static accscalar_t area_pixel_compute_source_index(
+__device__ __forceinline__ accscalar_t area_pixel_compute_source_index(
     accscalar_t scale,
     int dst_index,
     bool align_corners,

@@ -130,7 +130,7 @@ __device__ __forceinline__ static accscalar_t area_pixel_compute_source_index(
 }
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
-__device__ __forceinline__ static int nearest_neighbor_compute_source_index(
+__device__ __forceinline__ int nearest_neighbor_compute_source_index(
     const float scale,
     int dst_index,
     int input_size) {

@@ -144,7 +144,7 @@ __device__ __forceinline__ static int nearest_neighbor_compute_source_index(
   return src_index;
 }
 
-__device__ __forceinline__ static int nearest_neighbor_exact_compute_source_index(
+__device__ __forceinline__ int nearest_neighbor_exact_compute_source_index(
     const float scale,
     int dst_index,
     int input_size) {

@@ -157,7 +157,7 @@ __device__ __forceinline__ static int nearest_neighbor_exact_compute_source_inde
 }
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
-__device__ __forceinline__ static int nearest_neighbor_bw_compute_source_index(
+__device__ __forceinline__ int nearest_neighbor_bw_compute_source_index(
     const float scale,
     int dst_index,
     int output_size) {

@@ -170,7 +170,7 @@ __device__ __forceinline__ static int nearest_neighbor_bw_compute_source_index(
 }
 
 // see NOTE [ Nearest neighbor upsampling kernel implementation ]
-__device__ __forceinline__ static int nearest_neighbor_exact_bw_compute_source_index(
+__device__ __forceinline__ int nearest_neighbor_exact_bw_compute_source_index(
     const float scale,
     int dst_index,
     int output_size) {

@@ -182,7 +182,7 @@ __device__ __forceinline__ static int nearest_neighbor_exact_bw_compute_source_i
 
 /* Used by UpSampleBicubic2d.cu */
 template <typename scalar_t>
-__device__ __forceinline__ static scalar_t upsample_get_value_bounded(
+__device__ __forceinline__ scalar_t upsample_get_value_bounded(
     const PackedTensorAccessor64<const scalar_t, 4>& data,
     int batch,
     int channel,

@@ -197,7 +197,7 @@ __device__ __forceinline__ static scalar_t upsample_get_value_bounded(
 
 /* Used by UpSampleBicubic2d.cu */
 template <typename scalar_t, typename accscalar_t>
-__device__ __forceinline__ static void upsample_increment_value_bounded(
+__device__ __forceinline__ void upsample_increment_value_bounded(
     PackedTensorAccessor64<scalar_t, 4>& data,
     int batch,
     int channel,

@@ -218,21 +218,21 @@ __device__ __forceinline__ static void upsample_increment_value_bounded(
 // Based on
 // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
 template <typename accscalar_t>
-__device__ __forceinline__ static accscalar_t cubic_convolution1(
+__device__ __forceinline__ accscalar_t cubic_convolution1(
     accscalar_t x,
     accscalar_t A) {
   return ((A + 2) * x - (A + 3)) * x * x + 1;
 }
 
 template <typename accscalar_t>
-__device__ __forceinline__ static accscalar_t cubic_convolution2(
+__device__ __forceinline__ accscalar_t cubic_convolution2(
    accscalar_t x,
     accscalar_t A) {
   return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
 }
 
 template <typename accscalar_t>
-__device__ __forceinline__ static void get_cubic_upsampling_coefficients(
+__device__ __forceinline__ void get_cubic_upsampling_coefficients(
     accscalar_t coeffs[4],
     accscalar_t t) {
   accscalar_t A = -0.75;

@@ -248,7 +248,7 @@ __device__ __forceinline__ static void get_cubic_upsampling_coefficients(
 }
 
 template <typename scalar_t, typename accscalar_t>
-__device__ __forceinline__ static accscalar_t cubic_interp1d(
+__device__ __forceinline__ accscalar_t cubic_interp1d(
     scalar_t x0,
     scalar_t x1,
     scalar_t x2,

@@ -306,7 +306,7 @@ struct BicubicFilterFunctor {
 };
 
 template <typename accscalar_t>
-__device__ __forceinline__ static void _compute_weights_span(
+__device__ __forceinline__ void _compute_weights_span(
     const int i,
     const int input_size,
     const accscalar_t scale,

@@ -320,7 +320,7 @@ __device__ __forceinline__ static void _compute_weights_span(
 }
 
 template <typename scalar_t, typename accscalar_t, typename interp_filter_t>
-__device__ __forceinline__ static void _compute_weights(
+__device__ __forceinline__ void _compute_weights(
     scalar_t* wt_ptr,
     const accscalar_t scale,
     int interp_size,

@@ -347,7 +347,7 @@ __device__ __forceinline__ static void _compute_weights(
 }
 
 template <typename scalar_t, typename accscalar_t>
-__device__ __forceinline__ static accscalar_t interpolate_aa_single_dim(
+__device__ __forceinline__ accscalar_t interpolate_aa_single_dim(
     const scalar_t* src,
     const scalar_t* weights,
     int size) {

@@ -14,7 +14,7 @@
 #ifdef __CUDACC_RTC__
 #define __MATH_FUNCTIONS_DECL__ C10_HOST_DEVICE
 #else /* __CUDACC_RTC__ */
-#define __MATH_FUNCTIONS_DECL__ static inline C10_HOST_DEVICE
+#define __MATH_FUNCTIONS_DECL__ inline C10_HOST_DEVICE
 #endif /* __CUDACC_RTC__ */
 #endif /* __HIPCC__ */
 