[2/N] Turn inline static functions into static (#140068)

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/140068
Approved by: https://github.com/ezyang
commit 032135f8a2 (parent 3b8470c461)
Author: cyy
Date: 2024-11-09 03:31:24 +00:00
Committed by: PyTorch MergeBot
7 changed files with 33 additions and 40 deletions
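For context on the change applied throughout this diff: at namespace scope in a .cpp file, static alone already gives a function internal linkage, and the extra inline keyword is only a non-binding optimization hint that compilers apply by their own heuristics anyway, so dropping it does not change the meaning of the code. A minimal sketch (hypothetical function names):

    // Both functions have internal linkage and compile to the same code.
    inline static int twice_before(int x) { return 2 * x; }  // style removed by this PR
    static int twice_after(int x) { return 2 * x; }          // style adopted by this PR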

View File

@@ -267,7 +267,7 @@ struct Dist {
   // This does a backward pass down a Vec column of the input
   template <typename F>
-  inline static void backward_down_column_pdist(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) {
+  static void backward_down_column_pdist(const scalar_t * self_i, scalar_t * res_i, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t n, int64_t m, int64_t gs, int64_t count = Vec::size()) {
     for (const scalar_t * const self_end = self_i + m * n; self_i != self_end - m; self_i += m, res_i += m) {
       const Vec self_vec_i = Vec::loadu(self_i, count);
@@ -391,7 +391,7 @@ struct Dist {
   }
   template <typename F>
-  inline static void backward_down_column_cdist(const scalar_t * t1, const scalar_t * t2, scalar_t * res, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t r1, int64_t r2, int64_t m, int64_t d, int64_t gs, int64_t l1_size, int64_t l2_size, int64_t count = Vec::size()) {
+  static void backward_down_column_cdist(const scalar_t * t1, const scalar_t * t2, scalar_t * res, const scalar_t * grad_k, const scalar_t * dist_k, const Vec& pvec, int64_t r1, int64_t r2, int64_t m, int64_t d, int64_t gs, int64_t l1_size, int64_t l2_size, int64_t count = Vec::size()) {
     const scalar_t * t1_end = t1 + l1_size;
     const scalar_t * t2_end = t2 + l2_size;

View File

@@ -238,16 +238,9 @@ void ldl_solve_cusolver(
 #if defined(USE_LINALG_SOLVER)
-inline static Tensor column_major_identity_matrix_like(const Tensor& self) {
-  auto size = self.sizes();
-  auto size_slice = IntArrayRef(size.data(), size.size()-1);
-  return at::ones(size_slice, self.options()).diag_embed().mT();
-}
 // call cusolver gesvd function to calculate svd
 template<typename scalar_t>
-inline static void apply_svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void apply_svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv,
     const bool calculate_all_batches,
     const std::vector<int64_t>& batches
@@ -319,7 +312,7 @@ inline static void apply_svd_cusolver_gesvd(const Tensor& A, const Tensor& U, co
 }
 // We'll copy A inside svd_cusolver_gesvd
-inline static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv,
     const bool calculate_all_batches = true,
     const std::vector<int64_t>& batches = {}
@@ -356,7 +349,7 @@ inline static void svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Te
 // call cusolver gesvdj function to calculate svd
 template<typename scalar_t>
-inline static void apply_svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void apply_svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv) {
   using value_t = typename c10::scalar_value_type<scalar_t>::type;
   int m = cuda_int_cast(A.size(-2), "m");
@@ -430,7 +423,7 @@ inline static void apply_svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, c
 // wrapper around apply_svd_cusolver_gesvdj that handles dtype dispatch
 // note that gesvdj returns V, which is what we want
 // Need to pass a copy of A, since A will be rewritten inside the function call
-inline static void svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) {
+static void svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) {
   AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "svd_cuda_gesvdj", [&] {
     apply_svd_cusolver_gesvdj<scalar_t>(A, U, S, V, infos, full_matrices, compute_uv);
   });
@@ -438,7 +431,7 @@ inline static void svd_cusolver_gesvdj(const Tensor& A, const Tensor& U, const T
 // call cusolver gesvdj batched function to calculate svd
 template<typename scalar_t>
-inline static void apply_svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void apply_svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool compute_uv
 ) {
   using value_t = typename c10::scalar_value_type<scalar_t>::type;
@@ -481,7 +474,7 @@ inline static void apply_svd_cusolver_gesvdjBatched(const Tensor& A, const Tenso
   TORCH_CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params));
 }
-inline static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) {
+static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V, const Tensor& infos, bool full_matrices, bool compute_uv) {
   auto m = A.size(-2);
   auto n = A.size(-1);
   auto k = std::min(m, n);
@@ -520,7 +513,7 @@ inline static void svd_cusolver_gesvdjBatched(const Tensor& A, const Tensor& U,
 }
 template<typename scalar_t>
-inline static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
+static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv) {
 #ifndef CUDART_VERSION
   TORCH_CHECK(false, "gesvda: Batched version is supported only with cuBLAS backend.")
@@ -577,7 +570,7 @@ inline static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, cons
 }
 // We'll copy A inside svd_cusolver_gesvdaStridedBatched
-inline static void svd_cusolver_gesvdaStridedBatched(
+static void svd_cusolver_gesvdaStridedBatched(
     const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
     const Tensor& infos, bool full_matrices, bool compute_uv) {
   // We need to pass a copy of A, as it will be overwritten
@@ -716,7 +709,7 @@ void svd_cusolver(const Tensor& A,
 // Implementation of Cholesky decomposition using looped cusolverDn<T>potrf or cusolverDnXpotrf (64-bit)
 template<typename scalar_t>
-inline static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_working_copy, bool upper, const Tensor& infos) {
+static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_working_copy, bool upper, const Tensor& infos) {
   auto handle = at::cuda::getCurrentCUDASolverDnHandle();
   const auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
   const int64_t n = self_working_copy.size(-1);
@@ -785,7 +778,7 @@ inline static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_worki
 // Warning: cusolverDn<T>potrfBatched doesn't work quite well when matrix size or batch size is zero.
 // If you write your own C++ extension and use this function, make sure you do a zero numel check for the input.
 template<typename scalar_t>
-inline static void apply_cholesky_cusolver_potrfBatched(const Tensor& self_working_copy, bool upper, const Tensor& infos) {
+static void apply_cholesky_cusolver_potrfBatched(const Tensor& self_working_copy, bool upper, const Tensor& infos) {
   auto handle = at::cuda::getCurrentCUDASolverDnHandle();
   const auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
   const int n = cuda_int_cast(self_working_copy.size(-1), "n");
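The zero-numel warning in the hunk above can be honored with a small guard before dispatching to the batched routine; a sketch under assumed names (not a PyTorch API):

    #include <ATen/ATen.h>

    // Returns false for inputs that cusolverDn<T>potrfBatched handles badly,
    // i.e. zero matrix size or zero batch size.
    static bool potrf_batched_input_ok(const at::Tensor& input) {
      return input.numel() != 0;
    }

A caller would check this predicate before invoking the batched Cholesky path.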
@@ -820,7 +813,7 @@ void cholesky_helper_cusolver(const Tensor& input, bool upper, const Tensor& inf
 template<typename scalar_t>
-inline static void apply_cholesky_cusolver_potrs(Tensor& self_working_copy, const Tensor& A_column_major_copy, bool upper, Tensor& infos) {
+static void apply_cholesky_cusolver_potrs(Tensor& self_working_copy, const Tensor& A_column_major_copy, bool upper, Tensor& infos) {
   auto handle = at::cuda::getCurrentCUDASolverDnHandle();
   const auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
   const int64_t n = self_working_copy.size(-2);
@@ -876,7 +869,7 @@ inline static void apply_cholesky_cusolver_potrs(Tensor& self_working_copy, cons
 // This code path is only dispatched to if MAGMA is not linked in the pytorch build.
 // cusolverDn<t>potrsBatched only supports nrhs == 1
 template<typename scalar_t>
-inline static void apply_cholesky_cusolver_potrsBatched(Tensor& self_working_copy, const Tensor& A_column_major_copy, bool upper, Tensor& infos) {
+static void apply_cholesky_cusolver_potrsBatched(Tensor& self_working_copy, const Tensor& A_column_major_copy, bool upper, Tensor& infos) {
   auto handle = at::cuda::getCurrentCUDASolverDnHandle();
   const auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
   const int64_t n = self_working_copy.size(-2);
@@ -1147,7 +1140,7 @@ void ormqr_cusolver(const Tensor& input, const Tensor& tau, const Tensor& other,
 For further details, please see the cuSOLVER documentation for ORGQR and UNGQR.
 */
 template <typename scalar_t>
-inline static void apply_orgqr(Tensor& self, const Tensor& tau) {
+static void apply_orgqr(Tensor& self, const Tensor& tau) {
   auto self_data = self.data_ptr<scalar_t>();
   auto tau_data = tau.const_data_ptr<scalar_t>();
   auto self_matrix_stride = matrixStride(self);

View File

@@ -2456,7 +2456,7 @@ Call this whenever a new thread is created in order to propagate values from
 // Checks that the _C shared library isn't initialized multiple times. This
 // can happen if the same csrc files are compiled into multiple shared
 // libraries.
-inline static void pytorch_duplicate_guard() {
+static void pytorch_duplicate_guard() {
   static int initialized = 0;
   if (initialized) {
     fmt::print(stderr, "pytorch: _C shared library re-initialized\n");
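The guard above relies on a function-local static that persists for the life of the process; a self-contained sketch of the same pattern (hypothetical names, plain stdio in place of fmt):

    #include <cstdio>
    #include <cstdlib>

    static void duplicate_guard() {
      static int initialized = 0;  // one flag per loaded definition
      if (initialized) {
        std::fprintf(stderr, "shared library re-initialized\n");
        std::abort();  // fail fast instead of running with duplicated state
      }
      initialized = 1;
    }

If the same translation unit is linked into two shared libraries and the dynamic linker interposes both to a single definition, the second initialization trips the guard.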

View File

@@ -1134,7 +1134,7 @@ void Engine::evaluate_function(
   }
 }
-inline static uint64_t compute_min_topological_nr(const edge_list& outputs) {
+static uint64_t compute_min_topological_nr(const edge_list& outputs) {
   // Computes the mininum topological number among all the outputs
   if (outputs.empty()) {
     return 0;
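The function body continues past this hunk; its effect is a minimum over the outputs' topological numbers. A sketch under stand-in types (Node and Edge here are illustrative, not the real autograd classes):

    #include <algorithm>
    #include <cstdint>
    #include <limits>
    #include <vector>

    struct Node { uint64_t topological_nr; };  // stand-in for torch::autograd::Node
    struct Edge { Node* function; };           // stand-in for torch::autograd::Edge
    using edge_list = std::vector<Edge>;

    static uint64_t min_topological_nr_sketch(const edge_list& outputs) {
      if (outputs.empty()) {
        return 0;  // matches the early return shown in the diff
      }
      uint64_t min_topo_nr = std::numeric_limits<uint64_t>::max();
      for (const auto& output : outputs) {
        min_topo_nr = std::min(min_topo_nr, output.function->topological_nr);
      }
      return min_topo_nr;
    }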

View File

@@ -162,7 +162,7 @@ c10::intrusive_ptr<at::ivalue::Future> PythonEngine::execute_with_graph_task(
 PyObject* THPEngineClass = nullptr;
-inline static Edge parseGradientEdge(PyObject* obj, int64_t index) {
+static Edge parseGradientEdge(PyObject* obj, int64_t index) {
   PyObject* grad_fn = PyTuple_GetItem(obj, 0);
   auto output_nr = THPUtils_unpackLong(PyTuple_GetItem(obj, 1));
   std::shared_ptr<torch::autograd::Node> grad_fn_sp;

View File

@@ -18,7 +18,7 @@ static int active_dynamo_threads = 0;
 static Py_tss_t eval_frame_callback_key = Py_tss_NEEDS_INIT;
-inline static PyObject* eval_frame_callback_get(void) {
+static PyObject* eval_frame_callback_get(void) {
   void* result = PyThread_tss_get(&eval_frame_callback_key);
   if (unlikely(result == NULL)) {
     return (PyObject*)Py_None;
@@ -27,7 +27,7 @@ inline static PyObject* eval_frame_callback_get(void) {
   }
 }
-inline static void eval_frame_callback_set(PyObject* obj) {
+static void eval_frame_callback_set(PyObject* obj) {
   PyThread_tss_set(&eval_frame_callback_key, obj);
 }
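The getter/setter pair above is the standard CPython thread-specific-storage pattern; a minimal standalone sketch (illustrative names, error handling omitted):

    #include <Python.h>

    static Py_tss_t callback_key = Py_tss_NEEDS_INIT;

    // Must run once (e.g. during module init) before the key is used.
    static int callback_tss_init(void) {
      return PyThread_tss_create(&callback_key);  // 0 on success
    }

    static PyObject* callback_get(void) {
      void* result = PyThread_tss_get(&callback_key);
      // An unset per-thread slot reads back as NULL: report "no callback".
      return result == NULL ? Py_None : (PyObject*)result;
    }

    static void callback_set(PyObject* obj) {
      PyThread_tss_set(&callback_key, obj);  // the stored value is per-thread
    }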
@@ -186,7 +186,7 @@ static PyObject* dynamo_custom_eval_frame_shim(THP_EVAL_API_FRAME_OBJECT* frame,
 }
 #endif
-inline static PyObject* dynamo_eval_frame_default(
+static PyObject* dynamo_eval_frame_default(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     int throw_flag) {
@@ -205,7 +205,7 @@ inline static PyObject* dynamo_eval_frame_default(
 #endif
 }
-inline static void enable_eval_frame_shim(PyThreadState* tstate) {
+static void enable_eval_frame_shim(PyThreadState* tstate) {
 #if PY_VERSION_HEX >= 0x03090000
   if (_PyInterpreterState_GetEvalFrameFunc(tstate->interp) !=
       &dynamo_custom_eval_frame_shim) {
@@ -222,7 +222,7 @@ inline static void enable_eval_frame_shim(PyThreadState* tstate) {
 #endif
 }
-inline static void enable_eval_frame_default(PyThreadState* tstate) {
+static void enable_eval_frame_default(PyThreadState* tstate) {
 #if PY_VERSION_HEX >= 0x03090000
   if (_PyInterpreterState_GetEvalFrameFunc(tstate->interp) !=
       previous_eval_frame) {
@@ -240,13 +240,13 @@ inline static void enable_eval_frame_default(PyThreadState* tstate) {
 }
-inline static const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) {
+static const char* get_frame_name(THP_EVAL_API_FRAME_OBJECT* frame) {
   // Returns the C string name of the current frame.
   DEBUG_CHECK(PyUnicode_Check(F_CODE(frame)->co_name));
   return PyUnicode_AsUTF8(F_CODE(frame)->co_name);
 }
-static inline PyObject* dynamo_call_callback(
+static PyObject* dynamo_call_callback(
     PyObject* callable,
     THP_EVAL_API_FRAME_OBJECT* _frame,
     PyObject* locals,
@@ -277,7 +277,7 @@ static inline PyObject* dynamo_call_callback(
   return res;
 }
-static inline void clear_old_frame_if_python_312_plus(
+static void clear_old_frame_if_python_312_plus(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame) {
 #if IS_PYTHON_3_12_PLUS
@@ -288,7 +288,7 @@ static inline void clear_old_frame_if_python_312_plus(
 #endif
 }
-inline static PyObject* dynamo_eval_custom_code_impl(
+static PyObject* dynamo_eval_custom_code_impl(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     PyCodeObject* code,
@@ -467,7 +467,7 @@ inline static PyObject* dynamo_eval_custom_code_impl(
 }
 // This wrapper function adds a profiler event
-inline static PyObject* dynamo_eval_custom_code(
+static PyObject* dynamo_eval_custom_code(
     PyThreadState* tstate,
     THP_EVAL_API_FRAME_OBJECT* frame,
     PyCodeObject* code,
@@ -725,8 +725,8 @@ typedef struct THPPyInterpreterFrame {
   _PyInterpreterFrame* frame; // Borrowed reference
 } THPPyInterpreterFrame;
-inline static void enable_eval_frame_shim(PyThreadState* tstate) {}
-inline static void enable_eval_frame_default(PyThreadState* tstate) {}
+static void enable_eval_frame_shim(PyThreadState* tstate) {}
+static void enable_eval_frame_default(PyThreadState* tstate) {}
 static struct PyGetSetDef THPPyInterpreterFrame_properties[] = {NULL};

View File

@@ -756,7 +756,7 @@ static PyObject* assert_size_stride(PyObject* dummy, PyObject* args) {
 }
 template <typename T>
-inline static void unwrap_size_tuple(PyObject* obj, T& output) {
+static void unwrap_size_tuple(PyObject* obj, T& output) {
   TORCH_CHECK(PyTuple_CheckExact(obj));
   size_t len = PyTuple_GET_SIZE(obj);
   output.reserve(len);
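The loop that fills the container falls outside this hunk; a sketch of the whole helper under the same preconditions (assumed continuation, not the verbatim source):

    #include <Python.h>
    #include <vector>

    template <typename T>
    static void unwrap_size_tuple_sketch(PyObject* obj, T& output) {
      // Caller guarantees obj is an exact tuple of small non-negative ints.
      Py_ssize_t len = PyTuple_GET_SIZE(obj);
      output.reserve(static_cast<size_t>(len));
      for (Py_ssize_t i = 0; i < len; ++i) {
        Py_ssize_t value = PyLong_AsSsize_t(PyTuple_GET_ITEM(obj, i));
        output.push_back(value);
      }
    }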
@@ -768,7 +768,7 @@ inline static void unwrap_size_tuple(PyObject* obj, T& output) {
 }
 template <typename T>
-inline static void _parse_empty_strided_args(
+static void _parse_empty_strided_args(
     PyObject* args,
     T& sizes,
     T& strides,
@@ -783,7 +783,7 @@ inline static void _parse_empty_strided_args(
   dtype = reinterpret_cast<THPDtype*>(py_dtype)->scalar_type;
 }
-inline static PyObject* _empty_strided_device(
+static PyObject* _empty_strided_device(
     PyObject* dummy,
     PyObject* args,
     c10::DeviceType device_type) {