Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00
ATen Unary Ops (#6030)
Implements a few unary operations for which there are AVX intrinsics. The perf comparison script is here: https://paste.fedoraproject.org/paste/f1adcJhpGtzDNWImS34XzQ
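The kernels in this diff all follow the same shape: a Vec256-style wrapper that moves 256 bits at a time (AVX intrinsics when available, a plain C array fallback otherwise), a loop over full vectors, and a single masked "tail" step that goes through a small stack buffer so the last partial vector never reads or writes out of bounds. A minimal, self-contained sketch of that pattern follows; it is not the ATen code itself, the names Vec256Sketch, kWidth and unary_sqrt are invented for illustration, and the real kernels additionally parallelize with TBB and dispatch on detected CPU capability.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

#ifdef __AVX__
#include <immintrin.h>
#endif

constexpr size_t kWidth = 4;  // doubles per 256-bit vector

// 256-bit vector of doubles: AVX fast path, plain-array fallback.
struct Vec256Sketch {
#ifdef __AVX__
  __m256d values;
  void load(const double* ptr) { values = _mm256_loadu_pd(ptr); }
  void store(double* ptr) const { _mm256_storeu_pd(ptr, values); }
  Vec256Sketch sqrt() const {
    Vec256Sketch r;
    r.values = _mm256_sqrt_pd(values);
    return r;
  }
#else
  double values[kWidth];
  void load(const double* ptr) { std::memcpy(values, ptr, sizeof(values)); }
  void store(double* ptr) const { std::memcpy(ptr, values, sizeof(values)); }
  Vec256Sketch sqrt() const {
    Vec256Sketch r;
    for (size_t i = 0; i < kWidth; ++i) r.values[i] = std::sqrt(values[i]);
    return r;
  }
#endif
  // Partial load/store for the tail, staged through a small stack buffer.
  void load(const double* ptr, size_t count) {
    double tmp[kWidth] = {0};
    std::memcpy(tmp, ptr, count * sizeof(double));
    load(tmp);
  }
  void store(double* ptr, size_t count) const {
    double tmp[kWidth];
    store(tmp);
    std::memcpy(ptr, tmp, count * sizeof(double));
  }
};

// Element-wise sqrt over a contiguous buffer: full vectors, then one tail step.
void unary_sqrt(double* out, const double* in, size_t n) {
  Vec256Sketch v;
  size_t k = 0;
  size_t vec_end = n > kWidth ? n - kWidth : 0;
  for (; k < vec_end; k += kWidth) {
    v.load(in + k);
    v.sqrt().store(out + k);
  }
  size_t leftover = n - k;
  v.load(in + k, leftover);  // copies only `leftover` elements
  v.sqrt().store(out + k, leftover);
}

int main() {
  std::vector<double> in = {1, 4, 9, 16, 25, 36, 49};
  std::vector<double> out(in.size());
  unary_sqrt(out.data(), in.data(), in.size());
  for (double x : out) std::printf("%g ", x);  // 1 2 3 4 5 6 7
  std::printf("\n");
  return 0;
}

Built with -mavx, the sqrt call lowers to _mm256_sqrt_pd; without it the same code falls back to std::sqrt, which is roughly how the specialized Vec256<float>/Vec256<double> classes in this diff relate to the generic template.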
This commit is contained in:
parent ebc0194950
commit bde2f6b298
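The per-capability kernels in this change are selected through the function-pointer trick in CapabilityDispatch.h: the exported pointer initially refers to an init routine that queries cpuinfo, rebinds the pointer to the best implementation available on the running machine, and forwards the first call. Below is a rough stand-alone sketch of that idea, with the capability check stubbed out; cpu_has_avx, add_one_impl and the two kernels are illustrative names, not ATen symbols.

#include <cstdio>

using kernel_fn = void (*)(float*, const float*, int);

void kernel_default(float* out, const float* in, int n) {
  for (int i = 0; i < n; ++i) out[i] = in[i] + 1.0f;  // scalar baseline
}

void kernel_avx(float* out, const float* in, int n) {
  // Stand-in for an AVX-vectorized kernel; same semantics as the baseline.
  for (int i = 0; i < n; ++i) out[i] = in[i] + 1.0f;
}

// Stand-in for cpuinfo_initialize()/cpuinfo_has_x86_avx() (assumed here).
bool cpu_has_avx() { return true; }

extern kernel_fn add_one_impl;  // the pointer callers go through

// The very first call lands here: pick the best kernel, rebind, forward.
void init(float* out, const float* in, int n) {
  add_one_impl = cpu_has_avx() ? &kernel_avx : &kernel_default;
  add_one_impl(out, in, n);
}

kernel_fn add_one_impl = &init;

int main() {
  float in[4] = {1, 2, 3, 4};
  float out[4];
  add_one_impl(out, in, 4);  // first call: selects the kernel, then runs it
  add_one_impl(out, in, 4);  // later calls: straight to the chosen kernel
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 2 3 4 5
  return 0;
}

Steady-state calls pay only one indirect call through the pointer; the selection cost is incurred once, on first use.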
@ -1665,20 +1665,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: sqrt_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _sqrt
|
||||
cname: sqrt
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: sqrt
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
@ -1723,20 +1711,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: ceil_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _ceil
|
||||
cname: ceil
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: ceil
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
@ -1752,20 +1728,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: floor_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _floor
|
||||
cname: floor
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: floor
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
@ -1781,20 +1745,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: round_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _round
|
||||
cname: round
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: round
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
@ -1810,20 +1762,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: trunc_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _trunc
|
||||
cname: trunc
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: trunc
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
|
|||
|
|
@ -4,15 +4,15 @@
|
|||
#include <tbb/parallel_reduce.h>
|
||||
#include <tbb/partitioner.h>
|
||||
#include <tbb/tbb.h>
|
||||
#include <thread>
|
||||
#include <cassert>
|
||||
#include <thread>
|
||||
|
||||
namespace at {
|
||||
namespace internal {
|
||||
namespace at { namespace internal {
|
||||
|
||||
// thread_local variable with internal linkage
|
||||
// requires no guarding as its storage duration is defined to be per thread
|
||||
static thread_local tbb::task_scheduler_init tbbinit(tbb::task_scheduler_init::deferred);
|
||||
static thread_local tbb::task_scheduler_init tbbinit(
|
||||
tbb::task_scheduler_init::deferred);
|
||||
// Tracks the number of threads in use, which TBB doesn't track.
|
||||
static thread_local int num_threads_ = -1;
|
||||
|
||||
|
|
@ -22,9 +22,9 @@ void init_tbb_num_threads() {
|
|||
int num_threads = at::get_num_threads();
|
||||
// In order to have control over the number of threads this function
|
||||
// must be called first before any other tbb parallel construct is
|
||||
// exercised within a particular thread. Otherwise the default
|
||||
// scheduler will be created over which we do not have control.
|
||||
// The following code will and must throw an error if tbb has
|
||||
// already been initialized before this function was called.
|
||||
if (!tbbinit.is_active() && !first_call)
|
||||
throw std::runtime_error(
|
||||
|
|
@ -51,5 +51,4 @@ void init_tbb_num_threads() {
|
|||
num_threads_ = num_threads;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}} // namespace at::internal
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
#include <cstddef>
|
||||
#include <ATen/ATen.h>
|
||||
#include <tbb/tbb.h>
|
||||
#include <cstddef>
|
||||
|
||||
namespace at {
|
||||
namespace internal {
|
||||
|
|
@ -21,12 +22,15 @@ void init_tbb_num_threads();
|
|||
// no parallel algorithm (such as parallel_reduce) should split work into
|
||||
// smaller than GRAIN_SIZE chunks.
|
||||
constexpr size_t TBB_GRAIN_SIZE = 32768;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
template <class T, template <class> class OP>
|
||||
T parallel_reduce(T (*f)(const T *, size_t, size_t, T), const T *data,
|
||||
size_t start, size_t end, T init_) {
|
||||
|
||||
T parallel_reduce(
|
||||
T (*f)(const T*, size_t, size_t, T),
|
||||
const T* data,
|
||||
size_t start,
|
||||
size_t end,
|
||||
T init_) {
|
||||
internal::init_tbb_num_threads();
|
||||
|
||||
T result_;
|
||||
|
|
@ -35,19 +39,25 @@ T parallel_reduce(T (*f)(const T *, size_t, size_t, T), const T *data,
|
|||
result_ = f(data, start, end, init_);
|
||||
} else {
|
||||
result_ = tbb::parallel_reduce(
|
||||
tbb::blocked_range<size_t>(start, end, internal::TBB_GRAIN_SIZE), init_,
|
||||
tbb::blocked_range<size_t>(start, end, internal::TBB_GRAIN_SIZE),
|
||||
init_,
|
||||
[&data, &f](const tbb::blocked_range<size_t> r, T init) -> T {
|
||||
return f(data, r.begin(), r.end(), init);
|
||||
},
|
||||
OP<T>(), ap);
|
||||
OP<T>(),
|
||||
ap);
|
||||
}
|
||||
return result_;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void parallel_reduce_2d(void (*f)(const T *, T *, size_t, size_t), size_t num_rows,
|
||||
size_t num_cols, size_t numel, const T *arr_, T *outarr_) {
|
||||
|
||||
void parallel_reduce_2d(
|
||||
void (*f)(const T*, T*, size_t, size_t),
|
||||
size_t num_rows,
|
||||
size_t num_cols,
|
||||
size_t numel,
|
||||
const T* arr_,
|
||||
T* outarr_) {
|
||||
internal::init_tbb_num_threads();
|
||||
|
||||
static tbb::affinity_partitioner ap;
|
||||
|
|
@ -63,19 +73,44 @@ void parallel_reduce_2d(void (*f)(const T *, T *, size_t, size_t), size_t num_ro
|
|||
f(arr, outarr, num_rows, num_cols);
|
||||
}
|
||||
} else {
|
||||
tbb::parallel_for(tbb::blocked_range<size_t>(
|
||||
0, max_i_, 1),
|
||||
[&arr_, &outarr_, num_rows, num_cols,
|
||||
&f](const tbb::blocked_range<size_t> r) {
|
||||
for (size_t i_ = r.begin(); i_ < r.end(); i_++) {
|
||||
int64_t i = i_ * num_rows * num_cols;
|
||||
int64_t i_r = i_ * num_cols;
|
||||
const T *arr = arr_ + i;
|
||||
T *outarr = outarr_ + i_r;
|
||||
f(arr, outarr, num_rows, num_cols);
|
||||
}
|
||||
},
|
||||
ap);
|
||||
tbb::parallel_for(
|
||||
tbb::blocked_range<size_t>(0, max_i_, 1),
|
||||
[&arr_, &outarr_, num_rows, num_cols, &f](
|
||||
const tbb::blocked_range<size_t> r) {
|
||||
for (size_t i_ = r.begin(); i_ < r.end(); i_++) {
|
||||
int64_t i = i_ * num_rows * num_cols;
|
||||
int64_t i_r = i_ * num_cols;
|
||||
const T* arr = arr_ + i;
|
||||
T* outarr = outarr_ + i_r;
|
||||
f(arr, outarr, num_rows, num_cols);
|
||||
}
|
||||
},
|
||||
ap);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void parallel_for_1d(
|
||||
void (*f)(T*, const T*, size_t, size_t),
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
internal::init_tbb_num_threads();
|
||||
|
||||
static tbb::affinity_partitioner ap;
|
||||
|
||||
T* arr_out = result.data<T>();
|
||||
const T* arr_in = self.data<T>();
|
||||
size_t start = 0;
|
||||
size_t end = self.numel();
|
||||
if (end - start < internal::TBB_GRAIN_SIZE) {
|
||||
f(arr_out, arr_in, start, end);
|
||||
} else {
|
||||
tbb::parallel_for(
|
||||
tbb::blocked_range<size_t>(start, end, internal::TBB_GRAIN_SIZE),
|
||||
[&arr_out, &arr_in, &f](const tbb::blocked_range<size_t> r) {
|
||||
f(arr_out, arr_in, r.begin(), r.end());
|
||||
},
|
||||
ap);
|
||||
}
|
||||
}
|
||||
} // namespace at
|
||||
|
|
|
|||
|
|
@ -12,43 +12,46 @@
|
|||
|
||||
#include <map>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
namespace at { namespace native {
|
||||
|
||||
using reduce_type = void(Tensor &, const Tensor &, size_t, bool);
|
||||
reduce_type *sumImpl = &DispatchStub<reduce_type>::init<sumImplC, &sumImpl>;
|
||||
reduce_type *prodImpl = &DispatchStub<reduce_type>::init<prodImplC, &prodImpl>;
|
||||
using reduce_type = void(Tensor&, const Tensor&, size_t, bool);
|
||||
reduce_type* sumImpl = &DispatchStub<reduce_type>::init<sumImplC, &sumImpl>;
|
||||
reduce_type* prodImpl = &DispatchStub<reduce_type>::init<prodImplC, &prodImpl>;
|
||||
|
||||
// ALL REDUCE #################################################################
|
||||
|
||||
Tensor _reduce_cpu(reduce_type *f, const Tensor &self) {
|
||||
Tensor _reduce_cpu(reduce_type* f, const Tensor& self) {
|
||||
Tensor result = self.type().tensor({});
|
||||
f(result, self, 0, true);
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor _sum_cpu(const Tensor &self) {
|
||||
Tensor _sum_cpu(const Tensor& self) {
|
||||
if (self.is_contiguous())
|
||||
return _reduce_cpu(sumImpl, self);
|
||||
return self._sumall();
|
||||
}
|
||||
|
||||
Tensor _prod_cpu(const Tensor &self) {
|
||||
Tensor _prod_cpu(const Tensor& self) {
|
||||
if (self.is_contiguous())
|
||||
return _reduce_cpu(prodImpl, self);
|
||||
return self._prodall();
|
||||
}
|
||||
|
||||
Tensor _sum_cuda(const Tensor &self_) { return self_._sumall(); }
|
||||
Tensor _sum_cuda(const Tensor& self_) {
|
||||
return self_._sumall();
|
||||
}
|
||||
|
||||
Tensor _prod_cuda(const Tensor &self_) { return self_._prodall(); }
|
||||
Tensor _prod_cuda(const Tensor& self_) {
|
||||
return self_._prodall();
|
||||
}
|
||||
|
||||
// \ALL REDUCE ################################################################
|
||||
|
||||
// DIM REDUCE #################################################################
|
||||
|
||||
static bool _dimreduce_return_trivial(Tensor &result, const Tensor &self,
|
||||
int64_t ident) {
|
||||
static bool
|
||||
_dimreduce_return_trivial(Tensor& result, const Tensor& self, int64_t ident) {
|
||||
if (self.numel() == 1 && self.ndimension() == 0) {
|
||||
result.resize_({});
|
||||
result.fill_(self);
|
||||
|
|
@ -63,8 +66,8 @@ static bool _dimreduce_return_trivial(Tensor &result, const Tensor &self,
|
|||
return false;
|
||||
}
|
||||
|
||||
static Tensor &_dimreduce_setup(Tensor &result, const Tensor &self,
|
||||
int64_t dim) {
|
||||
static Tensor&
|
||||
_dimreduce_setup(Tensor& result, const Tensor& self, int64_t dim) {
|
||||
IntList self_sizes = self.sizes();
|
||||
std::vector<int64_t> result_sizes;
|
||||
result_sizes.insert(result_sizes.end(), self_sizes.begin(), self_sizes.end());
|
||||
|
|
@ -73,8 +76,12 @@ static Tensor &_dimreduce_setup(Tensor &result, const Tensor &self,
|
|||
return result;
|
||||
}
|
||||
|
||||
Tensor &_reduce_out_cpu(reduce_type *f, Tensor &result, const Tensor &self,
|
||||
int64_t dim, bool keepdim) {
|
||||
Tensor& _reduce_out_cpu(
|
||||
reduce_type* f,
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
int64_t dim,
|
||||
bool keepdim) {
|
||||
result = _dimreduce_setup(result, self, dim);
|
||||
f(result, self, dim, false);
|
||||
if (!keepdim)
|
||||
|
|
@ -82,8 +89,8 @@ Tensor &_reduce_out_cpu(reduce_type *f, Tensor &result, const Tensor &self,
|
|||
return result;
|
||||
}
|
||||
|
||||
Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
|
||||
bool keepdim) {
|
||||
Tensor&
|
||||
_sum_out_cpu(Tensor& result, const Tensor& self, int64_t dim_, bool keepdim) {
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
if (_dimreduce_return_trivial(result, self, 0))
|
||||
return result;
|
||||
|
|
@ -93,8 +100,8 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
|
|||
return at::_sum_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
|
||||
bool keepdim) {
|
||||
Tensor&
|
||||
_prod_out_cpu(Tensor& result, const Tensor& self, int64_t dim_, bool keepdim) {
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
if (_dimreduce_return_trivial(result, self, 1))
|
||||
return result;
|
||||
|
|
@ -104,28 +111,27 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
|
|||
return at::_prod_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor &_sum_out_cuda(Tensor &result, const Tensor &self, int64_t dim,
|
||||
bool keepdim) {
|
||||
Tensor&
|
||||
_sum_out_cuda(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) {
|
||||
return at::_sum_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor &_prod_out_cuda(Tensor &result, const Tensor &self, int64_t dim,
|
||||
bool keepdim) {
|
||||
Tensor&
|
||||
_prod_out_cuda(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) {
|
||||
return at::_prod_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor sum(const Tensor &self, int64_t dim_, bool keepdim) {
|
||||
Tensor sum(const Tensor& self, int64_t dim_, bool keepdim) {
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
Tensor result = self.type().tensor();
|
||||
return at::sum_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor prod(const Tensor &self, int64_t dim_, bool keepdim) {
|
||||
Tensor prod(const Tensor& self, int64_t dim_, bool keepdim) {
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
Tensor result = self.type().tensor();
|
||||
return at::prod_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
// \DIM REDUCE ################################################################
|
||||
}
|
||||
}
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
118 aten/src/ATen/native/UnaryOps.cpp Normal file
|
|
@ -0,0 +1,118 @@
|
|||
#include "ATen/ATen.h"
|
||||
#include "ATen/Dispatch.h"
|
||||
#include "ATen/ExpandUtils.h"
|
||||
#include "ATen/NativeFunctions.h"
|
||||
#include "ATen/WrapDimUtils.h"
|
||||
#include "cpu/UnaryOpsKernel.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
using unary_type = void(Tensor&, const Tensor&);
|
||||
unary_type* ceilImpl = DispatchStub<unary_type>::init<ceilImplC, &ceilImpl>;
|
||||
unary_type* floorImpl = DispatchStub<unary_type>::init<floorImplC, &floorImpl>;
|
||||
unary_type* roundImpl = DispatchStub<unary_type>::init<roundImplC, &roundImpl>;
|
||||
unary_type* truncImpl = DispatchStub<unary_type>::init<truncImplC, &truncImpl>;
|
||||
unary_type* sqrtImpl = DispatchStub<unary_type>::init<sqrtImplC, &sqrtImpl>;
|
||||
|
||||
// WRAP OPS #################################################################
|
||||
|
||||
Tensor ceil(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::ceil_out(result, self);
|
||||
}
|
||||
Tensor floor(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::floor_out(result, self);
|
||||
}
|
||||
Tensor round(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::round_out(result, self);
|
||||
}
|
||||
Tensor trunc(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::trunc_out(result, self);
|
||||
}
|
||||
Tensor sqrt(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::sqrt_out(result, self);
|
||||
}
|
||||
|
||||
Tensor& ceil_(Tensor& self) {
|
||||
return at::ceil_out(self, self);
|
||||
}
|
||||
Tensor& floor_(Tensor& self) {
|
||||
return at::floor_out(self, self);
|
||||
}
|
||||
Tensor& round_(Tensor& self) {
|
||||
return at::round_out(self, self);
|
||||
}
|
||||
Tensor& trunc_(Tensor& self) {
|
||||
return at::trunc_out(self, self);
|
||||
}
|
||||
Tensor& sqrt_(Tensor& self) {
|
||||
return at::sqrt_out(self, self);
|
||||
}
|
||||
|
||||
// \WRAP OPS #################################################################
|
||||
|
||||
bool _unops_out_cpu(unary_type* f, Tensor& result, const Tensor& self) {
|
||||
if (result.is_contiguous() && self.is_contiguous()) {
|
||||
result.resize_(self.sizes());
|
||||
f(result, self);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// CPU OPS ###################################################################
|
||||
|
||||
Tensor& _ceil_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(ceilImpl, result, self) ? result
|
||||
: at::_ceil_out(result, self);
|
||||
}
|
||||
Tensor& _floor_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(floorImpl, result, self) ? result
|
||||
: at::_floor_out(result, self);
|
||||
}
|
||||
Tensor& _round_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(roundImpl, result, self) ? result
|
||||
: at::_round_out(result, self);
|
||||
}
|
||||
Tensor& _trunc_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(truncImpl, result, self) ? result
|
||||
: at::_trunc_out(result, self);
|
||||
}
|
||||
Tensor& _sqrt_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(sqrtImpl, result, self) ? result
|
||||
: at::_sqrt_out(result, self);
|
||||
}
|
||||
|
||||
// \CPU OPS #################################################################
|
||||
|
||||
// CUDA OPS #################################################################
|
||||
|
||||
Tensor& _ceil_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_ceil_out(result, self);
|
||||
}
|
||||
Tensor& _floor_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_floor_out(result, self);
|
||||
}
|
||||
Tensor& _round_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_round_out(result, self);
|
||||
}
|
||||
Tensor& _trunc_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_trunc_out(result, self);
|
||||
}
|
||||
Tensor& _sqrt_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_sqrt_out(result, self);
|
||||
}
|
||||
|
||||
// \CUDA OPS ################################################################
|
||||
}} // namespace at::native
|
||||
|
|
@ -1,9 +1,8 @@
|
|||
#include "ATen/cpu/cpuinfo/include/cpuinfo.h"
|
||||
#include <type_traits>
|
||||
#include <iostream>
|
||||
#include <type_traits>
|
||||
#include "ATen/cpu/cpuinfo/include/cpuinfo.h"
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
namespace at { namespace native {
|
||||
|
||||
enum class CPUCapability { DEFAULT, AVX, AVX2 };
|
||||
|
||||
|
|
@ -15,19 +14,19 @@ constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::AVX;
|
|||
constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::AVX2;
|
||||
#endif
|
||||
|
||||
template <typename FnType> struct DispatchStub {};
|
||||
template <typename FnType>
|
||||
struct DispatchStub {};
|
||||
|
||||
template <typename... ArgTypes>
|
||||
struct DispatchStub<void(ArgTypes...)> {
|
||||
using FnType = void(ArgTypes...);
|
||||
|
||||
template <template <CPUCapability> class allImpl,
|
||||
FnType **dispatch_ptr>
|
||||
template <template <CPUCapability> class allImpl, FnType** dispatch_ptr>
|
||||
static void init(ArgTypes... args) {
|
||||
*dispatch_ptr = allImpl<CPUCapability::DEFAULT>::function;
|
||||
// Check if platform is supported
|
||||
if (cpuinfo_initialize()) {
|
||||
// Set function pointer to best implementation last
|
||||
#if defined(HAVE_AVX_CPU_DEFINITION)
|
||||
if (cpuinfo_has_x86_avx()) {
|
||||
*dispatch_ptr = allImpl<CPUCapability::AVX>::function;
|
||||
|
|
@ -43,5 +42,4 @@ struct DispatchStub<void(ArgTypes...)> {
|
|||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
|
|
@ -1,16 +1,16 @@
|
|||
#include "ATen/native/cpu/ReduceOpsKernel.h"
|
||||
#include "ATen/Dispatch.h"
|
||||
#include "ATen/Parallel.h"
|
||||
#include "ATen/native/cpu/Vec256.h"
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
namespace at { namespace native {
|
||||
|
||||
using namespace vec256;
|
||||
|
||||
// This adds the content of arr to sum
|
||||
template <class scalar_t, template <class> class OP, CPUCapability C>
|
||||
inline scalar_t allreduce_kernel_(const scalar_t *arr, size_t start, size_t end,
|
||||
scalar_t sum) {
|
||||
inline scalar_t
|
||||
allreduce_kernel_(const scalar_t* arr, size_t start, size_t end, scalar_t sum) {
|
||||
Vec256<scalar_t> part_sum;
|
||||
// Use all 16 registers.
|
||||
Vec256<scalar_t> tmp_sum[4], tmp_sum1, tmp_sum2, tmp_sum3;
|
||||
|
|
@ -38,7 +38,7 @@ inline scalar_t allreduce_kernel_(const scalar_t *arr, size_t start, size_t end,
|
|||
if (k > 0) {
|
||||
scalar_t sarr[32 / sizeof(scalar_t)];
|
||||
part_sum.store(sarr);
|
||||
for (size_t i = 0; i < part_sum.size(); i++) {
|
||||
for (size_t i = 0; i < part_sum.size; i++) {
|
||||
sum = OP<scalar_t>()(sum, sarr[i]);
|
||||
}
|
||||
}
|
||||
|
|
@ -51,8 +51,11 @@ inline scalar_t allreduce_kernel_(const scalar_t *arr, size_t start, size_t end,
|
|||
|
||||
// This overwrites the content of outarr
|
||||
template <class scalar_t, template <class> class OP, CPUCapability C>
|
||||
inline void dimreduce_kernel_(const scalar_t *arr, scalar_t *outarr,
|
||||
size_t num_rows, size_t num_cols) {
|
||||
inline void dimreduce_kernel_(
|
||||
const scalar_t* arr,
|
||||
scalar_t* outarr,
|
||||
size_t num_rows,
|
||||
size_t num_cols) {
|
||||
size_t width =
|
||||
256 / (sizeof(scalar_t)); // primitives per 256 bytes (two cache lines)
|
||||
Vec256<scalar_t> a[8];
|
||||
|
|
@ -88,31 +91,49 @@ inline void dimreduce_kernel_(const scalar_t *arr, scalar_t *outarr,
|
|||
}
|
||||
|
||||
template <template <class> class OP, CPUCapability C>
|
||||
inline void allImpl(Tensor & result, const Tensor & self, size_t dim, bool all, const char* name, int64_t init) {
|
||||
inline void allImpl(
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
size_t dim,
|
||||
bool all,
|
||||
const char* name,
|
||||
int64_t init) {
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), name, [&] {
|
||||
if (all) {
|
||||
result.fill_(at::parallel_reduce<scalar_t, OP>(
|
||||
&allreduce_kernel_<scalar_t, OP, CURRENT_CAPABILITY>, self.data<scalar_t>(),
|
||||
(size_t)0, (size_t)self.numel(), (scalar_t)init));
|
||||
&allreduce_kernel_<scalar_t, OP, CURRENT_CAPABILITY>,
|
||||
self.data<scalar_t>(),
|
||||
(size_t)0,
|
||||
(size_t)self.numel(),
|
||||
(scalar_t)init));
|
||||
} else {
|
||||
at::parallel_reduce_2d<scalar_t>(
|
||||
&dimreduce_kernel_<scalar_t, OP, CURRENT_CAPABILITY>,
|
||||
self.sizes()[dim], self.strides()[dim], self.numel(),
|
||||
self.data<scalar_t>(), result.data<scalar_t>());
|
||||
self.sizes()[dim],
|
||||
self.strides()[dim],
|
||||
self.numel(),
|
||||
self.data<scalar_t>(),
|
||||
result.data<scalar_t>());
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <>
|
||||
void sumImplC<CURRENT_CAPABILITY>::function(Tensor &result, const Tensor &self,
|
||||
size_t dim, bool all) {
|
||||
void sumImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
size_t dim,
|
||||
bool all) {
|
||||
allImpl<std::plus, CURRENT_CAPABILITY>(result, self, dim, all, "sum", 0);
|
||||
}
|
||||
|
||||
template <>
|
||||
void prodImplC<CURRENT_CAPABILITY>::function(Tensor &result, const Tensor &self,
|
||||
size_t dim, bool all) {
|
||||
allImpl<std::multiplies, CURRENT_CAPABILITY>(result, self, dim, all, "prod", 1);
|
||||
}
|
||||
}
|
||||
void prodImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
size_t dim,
|
||||
bool all) {
|
||||
allImpl<std::multiplies, CURRENT_CAPABILITY>(
|
||||
result, self, dim, all, "prod", 1);
|
||||
}
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
|
|
@ -1,23 +1,21 @@
|
|||
#pragma once
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include "CapabilityDispatch.h"
|
||||
#include "Vec256.h"
|
||||
#include <stdexcept>
|
||||
#include "CapabilityDispatch.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
|
||||
template <CPUCapability C> struct sumImplC {
|
||||
static void function(Tensor &result, const Tensor &self, size_t dim,
|
||||
bool all);
|
||||
template <CPUCapability C>
|
||||
struct sumImplC {
|
||||
static void
|
||||
function(Tensor& result, const Tensor& self, size_t dim, bool all);
|
||||
};
|
||||
|
||||
template <CPUCapability C> struct prodImplC {
|
||||
static void function(Tensor &result, const Tensor &self, size_t dim,
|
||||
bool all);
|
||||
template <CPUCapability C>
|
||||
struct prodImplC {
|
||||
static void
|
||||
function(Tensor& result, const Tensor& self, size_t dim, bool all);
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
121 aten/src/ATen/native/cpu/UnaryOpsKernel.cpp Normal file
|
|
@ -0,0 +1,121 @@
|
|||
#include "ATen/native/cpu/UnaryOpsKernel.h"
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include "ATen/Dispatch.h"
|
||||
#include "ATen/Parallel.h"
|
||||
#include "ATen/native/cpu/Vec256.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
using namespace vec256;
|
||||
|
||||
// This modifies arr in place with given OP
|
||||
template <class scalar_t, template <class> class VOP, CPUCapability C>
|
||||
inline void
|
||||
kernel_(scalar_t* arr_out, const scalar_t* arr_in, size_t start, size_t end) {
|
||||
Vec256<scalar_t> a;
|
||||
size_t epr = 32 / sizeof(scalar_t); // primitives per Vec256
|
||||
size_t k = start;
|
||||
size_t vec_end = end > epr ? end - epr : 0;
|
||||
for (; k < vec_end; k += epr) {
|
||||
a.load(arr_in + k);
|
||||
VOP<scalar_t>()(a).store(arr_out + k);
|
||||
}
|
||||
size_t leftover = std::min((end - k), a.size);
|
||||
a.load(arr_in + k, leftover);
|
||||
VOP<scalar_t>()(a).store(arr_out + k, leftover);
|
||||
}
|
||||
|
||||
template <template <class> class VOP, CPUCapability C>
|
||||
inline void allImpl(Tensor& result, const Tensor& self, const char* name) {
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), name, [&] {
|
||||
at::parallel_for_1d<scalar_t>(
|
||||
&kernel_<scalar_t, VOP, CURRENT_CAPABILITY>, result, self);
|
||||
});
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct ceilVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.ceil();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void ceilImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<ceilVOP, CURRENT_CAPABILITY>(result, self, "ceil");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct floorVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.floor();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void floorImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<floorVOP, CURRENT_CAPABILITY>(result, self, "floor");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct roundVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.round();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void roundImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<roundVOP, CURRENT_CAPABILITY>(result, self, "round");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct truncVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.trunc();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void truncImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<truncVOP, CURRENT_CAPABILITY>(result, self, "trunc");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct sqrtVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.sqrt();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void sqrtImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<sqrtVOP, CURRENT_CAPABILITY>(result, self, "sqrt");
|
||||
}
|
||||
}} // namespace at::native
|
||||
56 aten/src/ATen/native/cpu/UnaryOpsKernel.h Normal file
|
|
@ -0,0 +1,56 @@
|
|||
#pragma once
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <stdexcept>
|
||||
#include "CapabilityDispatch.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
template <CPUCapability C>
|
||||
struct ceilImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
template <CPUCapability C>
|
||||
struct floorImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
template <CPUCapability C>
|
||||
struct roundImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
template <CPUCapability C>
|
||||
struct truncImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
template <CPUCapability C>
|
||||
struct sqrtImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
|
||||
// Missing unary functions
|
||||
// TODO: Add generic apply function for contiguous and non-contiguous tensors
|
||||
// The goal here is to move more ops entirely into ATen and take advantage of
|
||||
// automatic vectorization with file-specific flags
|
||||
// acos
|
||||
// asin
|
||||
// atan
|
||||
// cos
|
||||
// cosh
|
||||
// digamma
|
||||
// erf
|
||||
// erfinv
|
||||
// exp
|
||||
// expm1
|
||||
// frac
|
||||
// lgamma
|
||||
// log1p
|
||||
// log
|
||||
// rsqrt
|
||||
// sigmoid
|
||||
// sin
|
||||
// sinh
|
||||
// tan
|
||||
// tanh
|
||||
// trunc
|
||||
|
||||
}} // namespace at::native
|
||||
|
|
@ -15,7 +15,7 @@
|
|||
#elif defined(__GNUC__) && defined(__IWMMXT__)
|
||||
/* GCC-compatible compiler, targeting ARM with WMMX */
|
||||
#include <mmintrin.h>
|
||||
#elif (defined(__GNUC__) || defined(__xlC__)) && \
|
||||
#elif (defined(__GNUC__) || defined(__xlC__)) && \
|
||||
(defined(__VEC__) || defined(__ALTIVEC__))
|
||||
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
|
||||
#include <altivec.h>
|
||||
|
|
@ -25,94 +25,212 @@
|
|||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
// NOTE:
|
||||
// If you specialize on a type, you must define all operations!
|
||||
// C arrays and intrinsic types don't mix
|
||||
namespace at {
|
||||
namespace native {
|
||||
namespace vec256 {
|
||||
namespace at { namespace native { namespace vec256 {
|
||||
|
||||
template <class T> class Vec256 {
|
||||
public:
|
||||
template <class T>
|
||||
class Vec256 {
|
||||
public:
|
||||
T values[32 / sizeof(T)]; // Mimics AVX behavior
|
||||
inline void load(const T *ptr) {
|
||||
std::memcpy(values, ptr, 32);
|
||||
inline void load(const T* ptr) {
|
||||
std::memcpy(values, ptr, 32);
|
||||
};
|
||||
inline void store(T *ptr) { std::memcpy(ptr, values, 32); }
|
||||
inline size_t size() { return 32 / sizeof(T); }
|
||||
inline void operator=(const Vec256<T> &b) {
|
||||
inline void store(T* ptr) const {
|
||||
std::memcpy(ptr, values, 32);
|
||||
}
|
||||
inline void load(const T* ptr, size_t count) {
|
||||
std::memcpy(values, ptr, 32 / sizeof(T) * count);
|
||||
};
|
||||
inline void store(T* ptr, size_t count) const {
|
||||
std::memcpy(ptr, values, 32 / sizeof(T) * count);
|
||||
}
|
||||
size_t size = 32 / sizeof(T);
|
||||
inline void operator=(const Vec256<T>& b) {
|
||||
std::memcpy(values, b.values, 32);
|
||||
}
|
||||
inline Vec256<T> map(T (*f)(T)) {
|
||||
Vec256<T> ret;
|
||||
for (size_t i = 0; i < size; i++)
|
||||
ret.values[i] = f(values[i]);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<T> ceil() {
|
||||
return map(std::ceil);
|
||||
}
|
||||
inline Vec256<T> floor() {
|
||||
return map(std::floor);
|
||||
}
|
||||
inline Vec256<T> round() {
|
||||
return map(std::round);
|
||||
}
|
||||
inline Vec256<T> trunc() {
|
||||
return map(std::trunc);
|
||||
}
|
||||
inline Vec256<T> sqrt() {
|
||||
return map(std::sqrt);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T> Vec256<T> operator+(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
template <class T>
|
||||
Vec256<T> operator+(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (size_t i = 0; i < c.size(); i++) {
|
||||
for (size_t i = 0; i < a.size; i++)
|
||||
c.values[i] = a.values[i] + b.values[i];
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
template <class T> Vec256<T> operator*(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
template <class T>
|
||||
Vec256<T> operator*(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (size_t i = 0; i < c.size(); i++) {
|
||||
for (size_t i = 0; i < a.size; i++)
|
||||
c.values[i] = a.values[i] * b.values[i];
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
#ifdef __AVX__
|
||||
template <> class Vec256<float> {
|
||||
public:
|
||||
template <>
|
||||
class Vec256<float> {
|
||||
public:
|
||||
__m256 values;
|
||||
Vec256<float>() {}
|
||||
inline void load(const float *ptr) { values = _mm256_loadu_ps(ptr); }
|
||||
inline void store(float *ptr) { _mm256_storeu_ps(ptr, values); }
|
||||
inline size_t size() { return 32 / sizeof(float); }
|
||||
inline void operator=(const Vec256<float> &b) { values = b.values; }
|
||||
};
|
||||
|
||||
template <> class Vec256<double> {
|
||||
public:
|
||||
__m256d values;
|
||||
Vec256<double>() {}
|
||||
inline void load(const double *ptr) { values = _mm256_loadu_pd(ptr); }
|
||||
inline void store(double *ptr) { _mm256_storeu_pd(ptr, values); }
|
||||
inline size_t size() { return 32 / sizeof(double); }
|
||||
inline void operator=(const Vec256<double> &b) { values = b.values; }
|
||||
inline void load(const float* ptr) {
|
||||
values = _mm256_loadu_ps(ptr);
|
||||
}
|
||||
inline void store(float* ptr) const {
|
||||
_mm256_storeu_ps(ptr, values);
|
||||
}
|
||||
inline void load(const float* ptr, size_t count) {
|
||||
float tmp_values[8];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(float));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(float* ptr, size_t count) const {
|
||||
float tmp_values[8];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(float));
|
||||
}
|
||||
size_t size = 8;
|
||||
inline void operator=(const Vec256<float>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
inline Vec256<float> ceil() {
|
||||
Vec256<float> ret;
|
||||
ret.values = _mm256_ceil_ps(values);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<float> floor() {
|
||||
Vec256<float> ret;
|
||||
ret.values = _mm256_floor_ps(values);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<float> round() {
|
||||
Vec256<float> ret;
|
||||
ret.values = _mm256_round_ps(
|
||||
values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<float> trunc() {
|
||||
Vec256<float> ret;
|
||||
ret.values =
|
||||
_mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<float> sqrt() {
|
||||
Vec256<float> ret;
|
||||
ret.values = _mm256_sqrt_ps(values);
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
Vec256<float> inline operator+(const Vec256<float> &a, const Vec256<float> &b) {
|
||||
class Vec256<double> {
|
||||
public:
|
||||
__m256d values;
|
||||
Vec256<double>() {}
|
||||
inline void load(const double* ptr) {
|
||||
values = _mm256_loadu_pd(ptr);
|
||||
}
|
||||
inline void store(double* ptr) const {
|
||||
_mm256_storeu_pd(ptr, values);
|
||||
}
|
||||
inline void load(const double* ptr, size_t count) {
|
||||
double tmp_values[4];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(double));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(double* ptr, size_t count) const {
|
||||
double tmp_values[4];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(double));
|
||||
}
|
||||
size_t size = 4;
|
||||
inline void operator=(const Vec256<double>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
inline Vec256<double> ceil() {
|
||||
Vec256<double> ret;
|
||||
ret.values = _mm256_ceil_pd(values);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<double> floor() {
|
||||
Vec256<double> ret;
|
||||
ret.values = _mm256_floor_pd(values);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<double> round() {
|
||||
Vec256<double> ret;
|
||||
ret.values = _mm256_round_pd(
|
||||
values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<double> trunc() {
|
||||
Vec256<double> ret;
|
||||
ret.values =
|
||||
_mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<double> sqrt() {
|
||||
Vec256<double> ret;
|
||||
ret.values = _mm256_sqrt_pd(values);
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
Vec256<float> inline operator+(const Vec256<float>& a, const Vec256<float>& b) {
|
||||
Vec256<float> c = Vec256<float>();
|
||||
c.values = _mm256_add_ps(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<float> inline operator*(const Vec256<float> &a, const Vec256<float> &b) {
|
||||
Vec256<float> inline operator*(const Vec256<float>& a, const Vec256<float>& b) {
|
||||
Vec256<float> c = Vec256<float>();
|
||||
c.values = _mm256_mul_ps(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<double> inline operator+(const Vec256<double> &a,
|
||||
const Vec256<double> &b) {
|
||||
Vec256<double> inline operator+(
|
||||
const Vec256<double>& a,
|
||||
const Vec256<double>& b) {
|
||||
Vec256<double> c = Vec256<double>();
|
||||
c.values = _mm256_add_pd(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<double> inline operator*(const Vec256<double> &a,
|
||||
const Vec256<double> &b) {
|
||||
Vec256<double> inline operator*(
|
||||
const Vec256<double>& a,
|
||||
const Vec256<double>& b) {
|
||||
Vec256<double> c = Vec256<double>();
|
||||
c.values = _mm256_mul_pd(a.values, b.values);
|
||||
return c;
|
||||
|
|
@ -120,67 +238,109 @@ Vec256<double> inline operator*(const Vec256<double> &a,
|
|||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
template <> class Vec256<int64_t> {
|
||||
public:
|
||||
template <>
|
||||
class Vec256<int64_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int64_t>() {}
|
||||
inline void load(const int64_t *ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
|
||||
inline void load(const int64_t* ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
inline void store(int64_t *ptr) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), values);
|
||||
inline void store(int64_t* ptr) const {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
}
|
||||
inline size_t size() { return 32 / sizeof(int64_t); }
|
||||
inline void operator=(const Vec256<int64_t> &b) { values = b.values; }
|
||||
};
|
||||
|
||||
template <> class Vec256<int32_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int32_t>() {}
|
||||
inline void load(const int32_t *ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
|
||||
inline void load(const int64_t* ptr, size_t count) {
|
||||
int64_t tmp_values[4];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(int32_t *ptr) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), values);
|
||||
inline void store(int64_t* ptr, size_t count) const {
|
||||
int64_t tmp_values[4];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
|
||||
}
|
||||
inline size_t size() { return 32 / sizeof(int32_t); }
|
||||
inline void operator=(const Vec256<int32_t> &b) { values = b.values; }
|
||||
};
|
||||
|
||||
template <> class Vec256<int16_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int16_t>() {}
|
||||
inline void load(const int16_t *ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
|
||||
size_t size = 4;
|
||||
inline void operator=(const Vec256<int64_t>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
inline void store(int16_t *ptr) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), values);
|
||||
}
|
||||
inline size_t size() { return 32 / sizeof(int16_t); }
|
||||
inline void operator=(const Vec256<int16_t> &b) { values = b.values; }
|
||||
};
|
||||
|
||||
template <>
|
||||
Vec256<int64_t> inline operator+(const Vec256<int64_t> &a,
|
||||
const Vec256<int64_t> &b) {
|
||||
class Vec256<int32_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int32_t>() {}
|
||||
inline void load(const int32_t* ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
inline void store(int32_t* ptr) const {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
}
|
||||
inline void load(const int32_t* ptr, size_t count) {
|
||||
int32_t tmp_values[8];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(int32_t* ptr, size_t count) const {
|
||||
int32_t tmp_values[8];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
|
||||
}
|
||||
size_t size = 8;
|
||||
inline void operator=(const Vec256<int32_t>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class Vec256<int16_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int16_t>() {}
|
||||
inline void load(const int16_t* ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
inline void store(int16_t* ptr) const {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
}
|
||||
inline void load(const int16_t* ptr, size_t count) {
|
||||
int16_t tmp_values[16];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(int16_t* ptr, size_t count) const {
|
||||
int16_t tmp_values[16];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
|
||||
}
|
||||
size_t size = 16;
|
||||
inline void operator=(const Vec256<int16_t>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
Vec256<int64_t> inline operator+(
|
||||
const Vec256<int64_t>& a,
|
||||
const Vec256<int64_t>& b) {
|
||||
Vec256<int64_t> c = Vec256<int64_t>();
|
||||
c.values = _mm256_add_epi64(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<int32_t> inline operator+(const Vec256<int32_t> &a,
|
||||
const Vec256<int32_t> &b) {
|
||||
Vec256<int32_t> inline operator+(
|
||||
const Vec256<int32_t>& a,
|
||||
const Vec256<int32_t>& b) {
|
||||
Vec256<int32_t> c = Vec256<int32_t>();
|
||||
c.values = _mm256_add_epi32(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<int16_t> inline operator+(const Vec256<int16_t> &a,
|
||||
const Vec256<int16_t> &b) {
|
||||
Vec256<int16_t> inline operator+(
|
||||
const Vec256<int16_t>& a,
|
||||
const Vec256<int16_t>& b) {
|
||||
Vec256<int16_t> c = Vec256<int16_t>();
|
||||
c.values = _mm256_add_epi16(a.values, b.values);
|
||||
return c;
|
||||
|
|
@ -191,8 +351,9 @@ Vec256<int16_t> inline operator+(const Vec256<int16_t> &a,
|
|||
// This is also technically avx compatible, but then we'll need AVX
|
||||
// code for add as well.
|
||||
template <>
|
||||
Vec256<int64_t> inline operator*(const Vec256<int64_t> &a,
|
||||
const Vec256<int64_t> &b) {
|
||||
Vec256<int64_t> inline operator*(
|
||||
const Vec256<int64_t>& a,
|
||||
const Vec256<int64_t>& b) {
|
||||
Vec256<int64_t> c = Vec256<int64_t>();
|
||||
|
||||
int64_t a0 = _mm256_extract_epi64(a.values, 0);
|
||||
|
|
@ -215,21 +376,21 @@ Vec256<int64_t> inline operator*(const Vec256<int64_t> &a,
|
|||
}
|
||||
|
||||
template <>
|
||||
Vec256<int32_t> inline operator*(const Vec256<int32_t> &a,
|
||||
const Vec256<int32_t> &b) {
|
||||
Vec256<int32_t> inline operator*(
|
||||
const Vec256<int32_t>& a,
|
||||
const Vec256<int32_t>& b) {
|
||||
Vec256<int32_t> c = Vec256<int32_t>();
|
||||
c.values = _mm256_mullo_epi32(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<int16_t> inline operator*(const Vec256<int16_t> &a,
|
||||
const Vec256<int16_t> &b) {
|
||||
Vec256<int16_t> inline operator*(
|
||||
const Vec256<int16_t>& a,
|
||||
const Vec256<int16_t>& b) {
|
||||
Vec256<int16_t> c = Vec256<int16_t>();
|
||||
c.values = _mm256_mullo_epi16(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}}} // namespace at::native::vec256
|
||||
|
|
|
|||
|
|
@ -29,6 +29,18 @@
|
|||
- func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor
|
||||
variants: function, method
|
||||
|
||||
- func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor grad_output, Tensor grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state, Tensor reserve, std::array<bool,4> output_mask) -> (Tensor, Tensor, Tensor, TensorList)
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_init_dropout_state(Type ty, double dropout, bool train, int64_t dropout_seed) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: adaptive_avg_pool1d(Tensor self, IntList[1] output_size) -> Tensor
|
||||
variants: function
|
||||
|
||||
|
|
@ -80,6 +92,16 @@
|
|||
- func: cat_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: ceil(Tensor self) -> Tensor
|
||||
|
||||
- func: ceil_(Tensor self) -> Tensor
|
||||
|
||||
- func: ceil_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _ceil_out_cpu
|
||||
CUDA: _ceil_out_cuda
|
||||
|
||||
- func: chunk(Tensor self, int64_t chunks, int64_t dim=0) -> TensorList
|
||||
|
||||
- func: cudnn_is_acceptable(Tensor self) -> bool
|
||||
|
|
@ -271,6 +293,16 @@
|
|||
CPU: eye_out_cpu
|
||||
CUDA: eye_out_cuda
|
||||
|
||||
- func: floor(Tensor self) -> Tensor
|
||||
|
||||
- func: floor_(Tensor self) -> Tensor
|
||||
|
||||
- func: floor_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _floor_out_cpu
|
||||
CUDA: _floor_out_cuda
|
||||
|
||||
- func: full(Type dtype, IntList size, Scalar fill_value) -> Tensor
|
||||
variants: function
|
||||
|
||||
|
|
@ -437,18 +469,28 @@
|
|||
CPU: RoiPooling2d_backward_cpu
|
||||
CUDA: RoiPooling2d_backward_cuda
|
||||
|
||||
- func: round(Tensor self) -> Tensor
|
||||
|
||||
- func: round_(Tensor self) -> Tensor
|
||||
|
||||
- func: round_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _round_out_cpu
|
||||
CUDA: _round_out_cuda
|
||||
|
||||
- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: rrelu_(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: select(Tensor self, int64_t dim, int64_t index) -> Tensor
|
||||
|
||||
- func: relu(Tensor self) -> Tensor
|
||||
|
||||
- func: relu_(Tensor self) -> Tensor
|
||||
|
||||
- func: select(Tensor self, int64_t dim, int64_t index) -> Tensor
|
||||
|
||||
- func: selu(Tensor self) -> Tensor
|
||||
variants: function
|
||||
|
||||
|
|
@ -512,6 +554,16 @@
|
|||
CPU: _sum_out_cpu
|
||||
CUDA: _sum_out_cuda
|
||||
|
||||
- func: sqrt(Tensor self) -> Tensor
|
||||
|
||||
- func: sqrt_(Tensor self) -> Tensor
|
||||
|
||||
- func: sqrt_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _sqrt_out_cpu
|
||||
CUDA: _sqrt_out_cuda
|
||||
|
||||
- func: prod(Tensor self) -> Tensor
|
||||
dispatch:
|
||||
CPU: _prod_cpu
|
||||
|
|
@ -525,14 +577,24 @@
|
|||
CPU: _prod_out_cpu
|
||||
CUDA: _prod_out_cuda
|
||||
|
||||
- func: t_(Tensor self) -> Tensor
|
||||
variants: method
|
||||
|
||||
- func: transpose_(Tensor self, int64_t dim0, int64_t dim1) -> Tensor
|
||||
variants: method
|
||||
|
||||
- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, double margin=1.0, double p=2, double eps=1e-6, bool swap=false, bool size_average=true, bool reduce=true) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: t_(Tensor self) -> Tensor
|
||||
variants: method
|
||||
- func: trunc(Tensor self) -> Tensor
|
||||
|
||||
- func: trunc_(Tensor self) -> Tensor
|
||||
|
||||
- func: trunc_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _trunc_out_cpu
|
||||
CUDA: _trunc_out_cuda
|
||||
|
||||
- func: type_as(Tensor self, Tensor other) -> Tensor
|
||||
variants: method
|
||||
|
|
@ -584,15 +646,3 @@
|
|||
dispatch:
|
||||
CPU: _s_poisson_cpu
|
||||
CUDA: _s_poisson_cuda
|
||||
|
||||
- func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor grad_output, Tensor grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state, Tensor reserve, std::array<bool,4> output_mask) -> (Tensor, Tensor, Tensor, TensorList)
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_init_dropout_state(Type ty, double dropout, bool train, int64_t dropout_seed) -> Tensor
|
||||
variants: function
|
||||
|
|
|
|||
|
|
@ -229,23 +229,27 @@ class TestTorch(TestCase):
|
|||
self.assertRaises(RuntimeError, lambda: torch.addr(m, v, s))
|
||||
self.assertRaises(RuntimeError, lambda: torch.addr(m, s, v))
|
||||
|
||||
def _testMath(self, torchfn, mathfn):
|
||||
size = (10, 5)
|
||||
# contiguous
|
||||
m1 = torch.randn(*size)
|
||||
res1 = torchfn(m1[4])
|
||||
res2 = res1.clone().zero_()
|
||||
for i, v in enumerate(m1[4]):
|
||||
res2[i] = mathfn(v.item())
|
||||
self.assertEqual(res1, res2)
|
||||
def _testMath(self, torchfn, mathfn, large=True):
|
||||
def _testMathSize(size, self, torchfn, mathfn):
|
||||
# contiguous
|
||||
m1 = torch.randn(*size)
|
||||
res1 = torchfn(m1[4])
|
||||
res2 = res1.clone().zero_()
|
||||
for i, v in enumerate(m1[4]):
|
||||
res2[i] = mathfn(v.item())
|
||||
self.assertEqual(res1, res2)
|
||||
|
||||
# non-contiguous
|
||||
m1 = torch.randn(*size)
|
||||
res1 = torchfn(m1[:, 4])
|
||||
res2 = res1.clone().zero_()
|
||||
for i, v in enumerate(m1[:, 4]):
|
||||
res2[i] = mathfn(v.item())
|
||||
self.assertEqual(res1, res2)
|
||||
# non-contiguous
|
||||
m1 = torch.randn(*size)
|
||||
res1 = torchfn(m1[:, 4])
|
||||
res2 = res1.clone().zero_()
|
||||
for i, v in enumerate(m1[:, 4]):
|
||||
res2[i] = mathfn(v.item())
|
||||
self.assertEqual(res1, res2)
|
||||
_testMathSize((10, 5), self, torchfn, mathfn)
|
||||
if large:
|
||||
# Trigger parallelism
|
||||
_testMathSize((10, 50000), self, torchfn, mathfn)
|
||||
|
||||
def _testMathByName(self, function_name):
|
||||
torchfn = getattr(torch, function_name)
|
||||
|
|
@ -269,8 +273,9 @@ class TestTorch(TestCase):
|
|||
@unittest.skipIf(not TEST_SCIPY, "Scipy not found")
|
||||
def test_polygamma(self):
|
||||
from scipy.special import polygamma
|
||||
# This test won't work when using many samples
|
||||
for n in [0, 1]:
|
||||
self._testMath(lambda x: torch.polygamma(n, x), lambda x: polygamma(n, x)[()])
|
||||
self._testMath(lambda x: torch.polygamma(n, x), lambda x: polygamma(n, x)[()], large=False)
|
||||
|
||||
def test_asin(self):
|
||||
self._testMath(torch.asin, lambda x: math.asin(x) if abs(x) <= 1 else float('nan'))
|
||||
|
|
|
|||