Simplify c10::guts::apply (#164566)

There is only one call site of `c10::guts::apply`, and it can be replaced by `std::apply` everywhere except on ROCm. This PR therefore simplifies the implementation of `c10::guts::apply`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/164566 Approved by: https://github.com/Aidyn-A, https://github.com/albanD
2025-12-06 12:20:52 +01:00 · 2025-10-22 00:47:43 +00:00 · 2025-10-22 00:47:43 +00:00 · 35153d0846
commit 35153d0846
parent 7773a22cdb
2 changed files with 9 additions and 13 deletions
--- a/aten/src/ATen/native/cuda/Loops.cuh
+++ b/aten/src/ATen/native/cuda/Loops.cuh
@ -1,18 +1,17 @@
 #pragma once

+#include <ATen/OpMathType.h>
+#include <ATen/cuda/detail/OffsetCalculator.cuh>
 #include <ATen/detail/FunctionTraits.h>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/native/TensorIteratorDynamicCasting.h>
-#include <ATen/cuda/detail/OffsetCalculator.cuh>
-#include <ATen/OpMathType.h>
 #include <ATen/native/cuda/thread_constants.h>
-
-#include <thrust/tuple.h>
-
 #include <ATen/native/cuda/MemoryAccess.cuh>

 #include <tuple>

+
+
 namespace at::native {

 template<int N>
@ -62,7 +61,11 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) {
  #pragma unroll
  for (int i = 0; i < elems_per_thread; i++) {
    if (policy.check_inbounds(i)) {
+#if defined(__HIP__)
      results[i] = c10::guts::apply(f, args[i]);
+#else
+      results[i] = std::apply(f, args[i]);
+#endif
    }
  }

--- a/c10/util/C++17.h
+++ b/c10/util/C++17.h
@ -45,14 +45,7 @@ constexpr bool is_pod_v = is_pod<T>::value;

 namespace guts {

-#if defined(__cpp_lib_apply) && !defined(__CUDA_ARCH__) && !defined(__HIP__)
-
-template <class F, class Tuple>
-C10_HOST_DEVICE inline constexpr decltype(auto) apply(F&& f, Tuple&& t) {
-  return std::apply(std::forward<F>(f), std::forward<Tuple>(t));
-}
-
-#else
+#if defined(__HIP__)

 // Implementation from http://en.cppreference.com/w/cpp/utility/apply (but
 // modified)