ATen Unary Ops (#6030)

Implements a few unary operations (ceil, floor, round, trunc and sqrt) for which there are AVX intrinsics.

The perf comparison script is here:
https://paste.fedoraproject.org/paste/f1adcJhpGtzDNWImS34XzQ
cpuhrsch 2018-03-27 20:39:28 -04:00 committed by Sam Gross
parent ebc0194950
commit bde2f6b298
13 changed files with 797 additions and 289 deletions
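
Each new CPU kernel in this commit follows the same shape: walk a contiguous buffer eight floats (or four doubles) at a time with the matching AVX intrinsic, then route the remaining tail through a small staging buffer so the final partial vector never reads or writes past the end of the tensor. A minimal standalone sketch of that pattern for ceil (illustrative only, not the ATen code; assumes an AVX-capable build, e.g. -mavx):

#include <immintrin.h>
#include <cstddef>
#include <cstring>

// Ceil every element of `in` into `out`: full 8-float AVX lanes first,
// then the tail through a staging buffer and memcpy.
static void ceil_avx(float* out, const float* in, size_t n) {
  size_t k = 0;
  for (; k + 8 <= n; k += 8) {
    __m256 v = _mm256_loadu_ps(in + k);
    _mm256_storeu_ps(out + k, _mm256_ceil_ps(v));
  }
  if (k < n) {
    float tmp[8] = {0};
    std::memcpy(tmp, in + k, (n - k) * sizeof(float));
    __m256 v = _mm256_loadu_ps(tmp);
    _mm256_storeu_ps(tmp, _mm256_ceil_ps(v));
    std::memcpy(out + k, tmp, (n - k) * sizeof(float));
  }
}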

View File

@ -1665,20 +1665,8 @@
- THTensor* self
]]
[[
name: sqrt_
types:
- floating_point
backends:
- CPU
- CUDA
name: _sqrt
cname: sqrt
return: self
arguments:
- THTensor* self
- THTensor* self
]]
[[
name: sqrt
types:
- floating_point
backends:
@ -1723,20 +1711,8 @@
- THTensor* self
]]
[[
name: ceil_
types:
- floating_point
backends:
- CPU
- CUDA
name: _ceil
cname: ceil
return: self
arguments:
- THTensor* self
- THTensor* self
]]
[[
name: ceil
types:
- floating_point
backends:
@ -1752,20 +1728,8 @@
- THTensor* self
]]
[[
name: floor_
types:
- floating_point
backends:
- CPU
- CUDA
name: _floor
cname: floor
return: self
arguments:
- THTensor* self
- THTensor* self
]]
[[
name: floor
types:
- floating_point
backends:
@ -1781,20 +1745,8 @@
- THTensor* self
]]
[[
name: round_
types:
- floating_point
backends:
- CPU
- CUDA
name: _round
cname: round
return: self
arguments:
- THTensor* self
- THTensor* self
]]
[[
name: round
types:
- floating_point
backends:
@ -1810,20 +1762,8 @@
- THTensor* self
]]
[[
name: trunc_
types:
- floating_point
backends:
- CPU
- CUDA
name: _trunc
cname: trunc
return: self
arguments:
- THTensor* self
- THTensor* self
]]
[[
name: trunc
types:
- floating_point
backends:

View File

@ -4,15 +4,15 @@
#include <tbb/parallel_reduce.h>
#include <tbb/partitioner.h>
#include <tbb/tbb.h>
#include <thread>
#include <cassert>
#include <thread>
namespace at {
namespace internal {
namespace at { namespace internal {
// thread_local variable with internal linkage
// requires no guarding as its storage duration is defined to be per thread
static thread_local tbb::task_scheduler_init tbbinit(tbb::task_scheduler_init::deferred);
static thread_local tbb::task_scheduler_init tbbinit(
tbb::task_scheduler_init::deferred);
// Tracks the number of threads in use, which TBB doesn't track.
static thread_local int num_threads_ = -1;
@ -22,9 +22,9 @@ void init_tbb_num_threads() {
int num_threads = at::get_num_threads();
// In order to have control over the number of threads this function
// must be called first before any other tbb parallel construct is
// exercised within a particular thread. Otherwise the default
// scheduler will be created over which we do not have control.
// The following code will and must throw an error if tbb has
// exercised within a particular thread. Otherwise the default
// scheduler will be created over which we do not have control.
// The following code will and must throw an error if tbb has
// already been initialized before this function was called.
if (!tbbinit.is_active() && !first_call)
throw std::runtime_error(
@ -51,5 +51,4 @@ void init_tbb_num_threads() {
num_threads_ = num_threads;
}
}
}
}
}} // namespace at::internal

View File

@ -1,6 +1,7 @@
#pragma once
#include <cstddef>
#include <ATen/ATen.h>
#include <tbb/tbb.h>
#include <cstddef>
namespace at {
namespace internal {
@ -21,12 +22,15 @@ void init_tbb_num_threads();
// no parallel algorithm (such as parallel_reduce) should split work into
// smaller than GRAIN_SIZE chunks.
constexpr size_t TBB_GRAIN_SIZE = 32768;
}
} // namespace internal
template <class T, template <class> class OP>
T parallel_reduce(T (*f)(const T *, size_t, size_t, T), const T *data,
size_t start, size_t end, T init_) {
T parallel_reduce(
T (*f)(const T*, size_t, size_t, T),
const T* data,
size_t start,
size_t end,
T init_) {
internal::init_tbb_num_threads();
T result_;
@ -35,19 +39,25 @@ T parallel_reduce(T (*f)(const T *, size_t, size_t, T), const T *data,
result_ = f(data, start, end, init_);
} else {
result_ = tbb::parallel_reduce(
tbb::blocked_range<size_t>(start, end, internal::TBB_GRAIN_SIZE), init_,
tbb::blocked_range<size_t>(start, end, internal::TBB_GRAIN_SIZE),
init_,
[&data, &f](const tbb::blocked_range<size_t> r, T init) -> T {
return f(data, r.begin(), r.end(), init);
},
OP<T>(), ap);
OP<T>(),
ap);
}
return result_;
}
template <class T>
void parallel_reduce_2d(void (*f)(const T *, T *, size_t, size_t), size_t num_rows,
size_t num_cols, size_t numel, const T *arr_, T *outarr_) {
void parallel_reduce_2d(
void (*f)(const T*, T*, size_t, size_t),
size_t num_rows,
size_t num_cols,
size_t numel,
const T* arr_,
T* outarr_) {
internal::init_tbb_num_threads();
static tbb::affinity_partitioner ap;
@ -63,19 +73,44 @@ void parallel_reduce_2d(void (*f)(const T *, T *, size_t, size_t), size_t num_ro
f(arr, outarr, num_rows, num_cols);
}
} else {
tbb::parallel_for(tbb::blocked_range<size_t>(
0, max_i_, 1),
[&arr_, &outarr_, num_rows, num_cols,
&f](const tbb::blocked_range<size_t> r) {
for (size_t i_ = r.begin(); i_ < r.end(); i_++) {
int64_t i = i_ * num_rows * num_cols;
int64_t i_r = i_ * num_cols;
const T *arr = arr_ + i;
T *outarr = outarr_ + i_r;
f(arr, outarr, num_rows, num_cols);
}
},
ap);
tbb::parallel_for(
tbb::blocked_range<size_t>(0, max_i_, 1),
[&arr_, &outarr_, num_rows, num_cols, &f](
const tbb::blocked_range<size_t> r) {
for (size_t i_ = r.begin(); i_ < r.end(); i_++) {
int64_t i = i_ * num_rows * num_cols;
int64_t i_r = i_ * num_cols;
const T* arr = arr_ + i;
T* outarr = outarr_ + i_r;
f(arr, outarr, num_rows, num_cols);
}
},
ap);
}
}
template <class T>
void parallel_for_1d(
void (*f)(T*, const T*, size_t, size_t),
Tensor& result,
const Tensor& self) {
internal::init_tbb_num_threads();
static tbb::affinity_partitioner ap;
T* arr_out = result.data<T>();
const T* arr_in = self.data<T>();
size_t start = 0;
size_t end = self.numel();
if (end - start < internal::TBB_GRAIN_SIZE) {
f(arr_out, arr_in, start, end);
} else {
tbb::parallel_for(
tbb::blocked_range<size_t>(start, end, internal::TBB_GRAIN_SIZE),
[&arr_out, &arr_in, &f](const tbb::blocked_range<size_t> r) {
f(arr_out, arr_in, r.begin(), r.end());
},
ap);
}
}
} // namespace at
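
Kernels plug into parallel_for_1d above through a plain function pointer: the callee gets an output pointer, an input pointer, and a half-open [start, end) range over the flattened tensor, and must touch only that range so disjoint chunks can run on different TBB workers. A hedged sketch of a conforming kernel (the commented dispatch call is illustrative, not taken from this commit):

#include <cmath>
#include <cstddef>

// Signature expected by parallel_for_1d: fill out[i] from in[i]
// for i in [start, end) only.
template <class T>
static void sqrt_range_kernel(T* out, const T* in, size_t start, size_t end) {
  for (size_t i = start; i < end; ++i)
    out[i] = std::sqrt(in[i]);
}

// With the header above, this would be handed over roughly as
//   at::parallel_for_1d<float>(&sqrt_range_kernel<float>, result, self);
// running serially when numel() is below TBB_GRAIN_SIZE and via
// tbb::parallel_for with the affinity_partitioner otherwise.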

View File

@ -12,43 +12,46 @@
#include <map>
namespace at {
namespace native {
namespace at { namespace native {
using reduce_type = void(Tensor &, const Tensor &, size_t, bool);
reduce_type *sumImpl = &DispatchStub<reduce_type>::init<sumImplC, &sumImpl>;
reduce_type *prodImpl = &DispatchStub<reduce_type>::init<prodImplC, &prodImpl>;
using reduce_type = void(Tensor&, const Tensor&, size_t, bool);
reduce_type* sumImpl = &DispatchStub<reduce_type>::init<sumImplC, &sumImpl>;
reduce_type* prodImpl = &DispatchStub<reduce_type>::init<prodImplC, &prodImpl>;
// ALL REDUCE #################################################################
Tensor _reduce_cpu(reduce_type *f, const Tensor &self) {
Tensor _reduce_cpu(reduce_type* f, const Tensor& self) {
Tensor result = self.type().tensor({});
f(result, self, 0, true);
return result;
}
Tensor _sum_cpu(const Tensor &self) {
Tensor _sum_cpu(const Tensor& self) {
if (self.is_contiguous())
return _reduce_cpu(sumImpl, self);
return self._sumall();
}
Tensor _prod_cpu(const Tensor &self) {
Tensor _prod_cpu(const Tensor& self) {
if (self.is_contiguous())
return _reduce_cpu(prodImpl, self);
return self._prodall();
}
Tensor _sum_cuda(const Tensor &self_) { return self_._sumall(); }
Tensor _sum_cuda(const Tensor& self_) {
return self_._sumall();
}
Tensor _prod_cuda(const Tensor &self_) { return self_._prodall(); }
Tensor _prod_cuda(const Tensor& self_) {
return self_._prodall();
}
// \ALL REDUCE ################################################################
// DIM REDUCE #################################################################
static bool _dimreduce_return_trivial(Tensor &result, const Tensor &self,
int64_t ident) {
static bool
_dimreduce_return_trivial(Tensor& result, const Tensor& self, int64_t ident) {
if (self.numel() == 1 && self.ndimension() == 0) {
result.resize_({});
result.fill_(self);
@ -63,8 +66,8 @@ static bool _dimreduce_return_trivial(Tensor &result, const Tensor &self,
return false;
}
static Tensor &_dimreduce_setup(Tensor &result, const Tensor &self,
int64_t dim) {
static Tensor&
_dimreduce_setup(Tensor& result, const Tensor& self, int64_t dim) {
IntList self_sizes = self.sizes();
std::vector<int64_t> result_sizes;
result_sizes.insert(result_sizes.end(), self_sizes.begin(), self_sizes.end());
@ -73,8 +76,12 @@ static Tensor &_dimreduce_setup(Tensor &result, const Tensor &self,
return result;
}
Tensor &_reduce_out_cpu(reduce_type *f, Tensor &result, const Tensor &self,
int64_t dim, bool keepdim) {
Tensor& _reduce_out_cpu(
reduce_type* f,
Tensor& result,
const Tensor& self,
int64_t dim,
bool keepdim) {
result = _dimreduce_setup(result, self, dim);
f(result, self, dim, false);
if (!keepdim)
@ -82,8 +89,8 @@ Tensor &_reduce_out_cpu(reduce_type *f, Tensor &result, const Tensor &self,
return result;
}
Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
bool keepdim) {
Tensor&
_sum_out_cpu(Tensor& result, const Tensor& self, int64_t dim_, bool keepdim) {
int64_t dim = maybe_wrap_dim(dim_, self.dim());
if (_dimreduce_return_trivial(result, self, 0))
return result;
@ -93,8 +100,8 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
return at::_sum_out(result, self, dim, keepdim);
}
Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
bool keepdim) {
Tensor&
_prod_out_cpu(Tensor& result, const Tensor& self, int64_t dim_, bool keepdim) {
int64_t dim = maybe_wrap_dim(dim_, self.dim());
if (_dimreduce_return_trivial(result, self, 1))
return result;
@ -104,28 +111,27 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
return at::_prod_out(result, self, dim, keepdim);
}
Tensor &_sum_out_cuda(Tensor &result, const Tensor &self, int64_t dim,
bool keepdim) {
Tensor&
_sum_out_cuda(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) {
return at::_sum_out(result, self, dim, keepdim);
}
Tensor &_prod_out_cuda(Tensor &result, const Tensor &self, int64_t dim,
bool keepdim) {
Tensor&
_prod_out_cuda(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) {
return at::_prod_out(result, self, dim, keepdim);
}
Tensor sum(const Tensor &self, int64_t dim_, bool keepdim) {
Tensor sum(const Tensor& self, int64_t dim_, bool keepdim) {
int64_t dim = maybe_wrap_dim(dim_, self.dim());
Tensor result = self.type().tensor();
return at::sum_out(result, self, dim, keepdim);
}
Tensor prod(const Tensor &self, int64_t dim_, bool keepdim) {
Tensor prod(const Tensor& self, int64_t dim_, bool keepdim) {
int64_t dim = maybe_wrap_dim(dim_, self.dim());
Tensor result = self.type().tensor();
return at::prod_out(result, self, dim, keepdim);
}
// \DIM REDUCE ################################################################
}
}
}} // namespace at::native

View File

@ -0,0 +1,118 @@
#include "ATen/ATen.h"
#include "ATen/Dispatch.h"
#include "ATen/ExpandUtils.h"
#include "ATen/NativeFunctions.h"
#include "ATen/WrapDimUtils.h"
#include "cpu/UnaryOpsKernel.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <vector>
#include <map>
namespace at { namespace native {
using unary_type = void(Tensor&, const Tensor&);
unary_type* ceilImpl = DispatchStub<unary_type>::init<ceilImplC, &ceilImpl>;
unary_type* floorImpl = DispatchStub<unary_type>::init<floorImplC, &floorImpl>;
unary_type* roundImpl = DispatchStub<unary_type>::init<roundImplC, &roundImpl>;
unary_type* truncImpl = DispatchStub<unary_type>::init<truncImplC, &truncImpl>;
unary_type* sqrtImpl = DispatchStub<unary_type>::init<sqrtImplC, &sqrtImpl>;
// WRAP OPS #################################################################
Tensor ceil(const Tensor& self) {
Tensor result = self.type().tensor();
return at::ceil_out(result, self);
}
Tensor floor(const Tensor& self) {
Tensor result = self.type().tensor();
return at::floor_out(result, self);
}
Tensor round(const Tensor& self) {
Tensor result = self.type().tensor();
return at::round_out(result, self);
}
Tensor trunc(const Tensor& self) {
Tensor result = self.type().tensor();
return at::trunc_out(result, self);
}
Tensor sqrt(const Tensor& self) {
Tensor result = self.type().tensor();
return at::sqrt_out(result, self);
}
Tensor& ceil_(Tensor& self) {
return at::ceil_out(self, self);
}
Tensor& floor_(Tensor& self) {
return at::floor_out(self, self);
}
Tensor& round_(Tensor& self) {
return at::round_out(self, self);
}
Tensor& trunc_(Tensor& self) {
return at::trunc_out(self, self);
}
Tensor& sqrt_(Tensor& self) {
return at::sqrt_out(self, self);
}
// \WRAP OPS #################################################################
bool _unops_out_cpu(unary_type* f, Tensor& result, const Tensor& self) {
if (result.is_contiguous() && self.is_contiguous()) {
result.resize_(self.sizes());
f(result, self);
return true;
}
return false;
}
// CPU OPS ###################################################################
Tensor& _ceil_out_cpu(Tensor& result, const Tensor& self) {
return _unops_out_cpu(ceilImpl, result, self) ? result
: at::_ceil_out(result, self);
}
Tensor& _floor_out_cpu(Tensor& result, const Tensor& self) {
return _unops_out_cpu(floorImpl, result, self) ? result
: at::_floor_out(result, self);
}
Tensor& _round_out_cpu(Tensor& result, const Tensor& self) {
return _unops_out_cpu(roundImpl, result, self) ? result
: at::_round_out(result, self);
}
Tensor& _trunc_out_cpu(Tensor& result, const Tensor& self) {
return _unops_out_cpu(truncImpl, result, self) ? result
: at::_trunc_out(result, self);
}
Tensor& _sqrt_out_cpu(Tensor& result, const Tensor& self) {
return _unops_out_cpu(sqrtImpl, result, self) ? result
: at::_sqrt_out(result, self);
}
// \CPU OPS #################################################################
// CUDA OPS #################################################################
Tensor& _ceil_out_cuda(Tensor& result, const Tensor& self) {
return at::_ceil_out(result, self);
}
Tensor& _floor_out_cuda(Tensor& result, const Tensor& self) {
return at::_floor_out(result, self);
}
Tensor& _round_out_cuda(Tensor& result, const Tensor& self) {
return at::_round_out(result, self);
}
Tensor& _trunc_out_cuda(Tensor& result, const Tensor& self) {
return at::_trunc_out(result, self);
}
Tensor& _sqrt_out_cuda(Tensor& result, const Tensor& self) {
return at::_sqrt_out(result, self);
}
// \CUDA OPS ################################################################
}} // namespace at::native

View File

@ -1,9 +1,8 @@
#include "ATen/cpu/cpuinfo/include/cpuinfo.h"
#include <type_traits>
#include <iostream>
#include <type_traits>
#include "ATen/cpu/cpuinfo/include/cpuinfo.h"
namespace at {
namespace native {
namespace at { namespace native {
enum class CPUCapability { DEFAULT, AVX, AVX2 };
@ -15,19 +14,19 @@ constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::AVX;
constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::AVX2;
#endif
template <typename FnType> struct DispatchStub {};
template <typename FnType>
struct DispatchStub {};
template <typename... ArgTypes>
struct DispatchStub<void(ArgTypes...)> {
using FnType = void(ArgTypes...);
template <template <CPUCapability> class allImpl,
FnType **dispatch_ptr>
template <template <CPUCapability> class allImpl, FnType** dispatch_ptr>
static void init(ArgTypes... args) {
*dispatch_ptr = allImpl<CPUCapability::DEFAULT>::function;
// Check if platform is supported
if (cpuinfo_initialize()) {
// Set function pointer to best implementation last
// Set function pointer to best implementation last
#if defined(HAVE_AVX_CPU_DEFINITION)
if (cpuinfo_has_x86_avx()) {
*dispatch_ptr = allImpl<CPUCapability::AVX>::function;
@ -43,5 +42,4 @@ struct DispatchStub<void(ArgTypes...)> {
}
};
}
}
}} // namespace at::native
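
The DispatchStub above is a self-replacing function pointer: each op's pointer is pre-seeded with init, and the first call queries cpuinfo, rewrites the pointer to the best specialization compiled into the binary, and forwards the arguments so even that first call runs a real kernel. A standalone illustration of the same mechanism, with a hypothetical has_avx() standing in for cpuinfo_has_x86_avx():

#include <cstdio>

static bool has_avx() { return false; }  // stand-in for cpuinfo, hypothetical

template <typename FnType> struct Stub {};

template <typename... ArgTypes>
struct Stub<void(ArgTypes...)> {
  // First call: pick an implementation, overwrite the dispatch pointer,
  // then forward so the caller still gets a real result.
  template <template <int> class Impl, void (**dispatch_ptr)(ArgTypes...)>
  static void init(ArgTypes... args) {
    *dispatch_ptr = Impl<0>::function;   // scalar fallback
    if (has_avx())
      *dispatch_ptr = Impl<1>::function; // vectorized variant
    (*dispatch_ptr)(args...);
  }
};

template <int capability>
struct printImpl {
  static void function(int x) {
    std::printf("capability %d, arg %d\n", capability, x);
  }
};

// The op is just a mutable function pointer, initially aimed at init.
void (*printOp)(int) = &Stub<void(int)>::init<printImpl, &printOp>;

int main() {
  printOp(1);  // resolves the capability and runs the chosen implementation
  printOp(2);  // later calls go straight to the resolved function
}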

View File

@ -1,16 +1,16 @@
#include "ATen/native/cpu/ReduceOpsKernel.h"
#include "ATen/Dispatch.h"
#include "ATen/Parallel.h"
#include "ATen/native/cpu/Vec256.h"
namespace at {
namespace native {
namespace at { namespace native {
using namespace vec256;
// This adds the content of arr to sum
template <class scalar_t, template <class> class OP, CPUCapability C>
inline scalar_t allreduce_kernel_(const scalar_t *arr, size_t start, size_t end,
scalar_t sum) {
inline scalar_t
allreduce_kernel_(const scalar_t* arr, size_t start, size_t end, scalar_t sum) {
Vec256<scalar_t> part_sum;
// Use all 16 registers.
Vec256<scalar_t> tmp_sum[4], tmp_sum1, tmp_sum2, tmp_sum3;
@ -38,7 +38,7 @@ inline scalar_t allreduce_kernel_(const scalar_t *arr, size_t start, size_t end,
if (k > 0) {
scalar_t sarr[32 / sizeof(scalar_t)];
part_sum.store(sarr);
for (size_t i = 0; i < part_sum.size(); i++) {
for (size_t i = 0; i < part_sum.size; i++) {
sum = OP<scalar_t>()(sum, sarr[i]);
}
}
@ -51,8 +51,11 @@ inline scalar_t allreduce_kernel_(const scalar_t *arr, size_t start, size_t end,
// This overwrites the content of outarr
template <class scalar_t, template <class> class OP, CPUCapability C>
inline void dimreduce_kernel_(const scalar_t *arr, scalar_t *outarr,
size_t num_rows, size_t num_cols) {
inline void dimreduce_kernel_(
const scalar_t* arr,
scalar_t* outarr,
size_t num_rows,
size_t num_cols) {
size_t width =
256 / (sizeof(scalar_t)); // primitives per 256 bytes (two cache lines)
Vec256<scalar_t> a[8];
@ -88,31 +91,49 @@ inline void dimreduce_kernel_(const scalar_t *arr, scalar_t *outarr,
}
template <template <class> class OP, CPUCapability C>
inline void allImpl(Tensor & result, const Tensor & self, size_t dim, bool all, const char* name, int64_t init) {
inline void allImpl(
Tensor& result,
const Tensor& self,
size_t dim,
bool all,
const char* name,
int64_t init) {
AT_DISPATCH_ALL_TYPES(self.type(), name, [&] {
if (all) {
result.fill_(at::parallel_reduce<scalar_t, OP>(
&allreduce_kernel_<scalar_t, OP, CURRENT_CAPABILITY>, self.data<scalar_t>(),
(size_t)0, (size_t)self.numel(), (scalar_t)init));
&allreduce_kernel_<scalar_t, OP, CURRENT_CAPABILITY>,
self.data<scalar_t>(),
(size_t)0,
(size_t)self.numel(),
(scalar_t)init));
} else {
at::parallel_reduce_2d<scalar_t>(
&dimreduce_kernel_<scalar_t, OP, CURRENT_CAPABILITY>,
self.sizes()[dim], self.strides()[dim], self.numel(),
self.data<scalar_t>(), result.data<scalar_t>());
self.sizes()[dim],
self.strides()[dim],
self.numel(),
self.data<scalar_t>(),
result.data<scalar_t>());
}
});
});
}
template <>
void sumImplC<CURRENT_CAPABILITY>::function(Tensor &result, const Tensor &self,
size_t dim, bool all) {
void sumImplC<CURRENT_CAPABILITY>::function(
Tensor& result,
const Tensor& self,
size_t dim,
bool all) {
allImpl<std::plus, CURRENT_CAPABILITY>(result, self, dim, all, "sum", 0);
}
template <>
void prodImplC<CURRENT_CAPABILITY>::function(Tensor &result, const Tensor &self,
size_t dim, bool all) {
allImpl<std::multiplies, CURRENT_CAPABILITY>(result, self, dim, all, "prod", 1);
}
}
void prodImplC<CURRENT_CAPABILITY>::function(
Tensor& result,
const Tensor& self,
size_t dim,
bool all) {
allImpl<std::multiplies, CURRENT_CAPABILITY>(
result, self, dim, all, "prod", 1);
}
}} // namespace at::native

View File

@ -1,23 +1,21 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include "CapabilityDispatch.h"
#include "Vec256.h"
#include <stdexcept>
#include "CapabilityDispatch.h"
namespace at { namespace native {
namespace at {
namespace native {
template <CPUCapability C> struct sumImplC {
static void function(Tensor &result, const Tensor &self, size_t dim,
bool all);
template <CPUCapability C>
struct sumImplC {
static void
function(Tensor& result, const Tensor& self, size_t dim, bool all);
};
template <CPUCapability C> struct prodImplC {
static void function(Tensor &result, const Tensor &self, size_t dim,
bool all);
template <CPUCapability C>
struct prodImplC {
static void
function(Tensor& result, const Tensor& self, size_t dim, bool all);
};
}
}
}} // namespace at::native

View File

@ -0,0 +1,121 @@
#include "ATen/native/cpu/UnaryOpsKernel.h"
#include <cmath>
#include <iostream>
#include "ATen/Dispatch.h"
#include "ATen/Parallel.h"
#include "ATen/native/cpu/Vec256.h"
namespace at { namespace native {
using namespace vec256;
// Applies the given VOP elementwise, reading from arr_in and writing to arr_out
template <class scalar_t, template <class> class VOP, CPUCapability C>
inline void
kernel_(scalar_t* arr_out, const scalar_t* arr_in, size_t start, size_t end) {
Vec256<scalar_t> a;
size_t epr = 32 / sizeof(scalar_t); // primitives per Vec256
size_t k = start;
size_t vec_end = end > epr ? end - epr : 0;
for (; k < vec_end; k += epr) {
a.load(arr_in + k);
VOP<scalar_t>()(a).store(arr_out + k);
}
size_t leftover = std::min((end - k), a.size);
a.load(arr_in + k, leftover);
VOP<scalar_t>()(a).store(arr_out + k, leftover);
}
template <template <class> class VOP, CPUCapability C>
inline void allImpl(Tensor& result, const Tensor& self, const char* name) {
AT_DISPATCH_FLOATING_TYPES(self.type(), name, [&] {
at::parallel_for_1d<scalar_t>(
&kernel_<scalar_t, VOP, CURRENT_CAPABILITY>, result, self);
});
}
namespace {
template <typename T>
struct ceilVOP {
Vec256<T> operator()(Vec256<T>& x) const {
return x.ceil();
}
};
} // namespace
template <>
void ceilImplC<CURRENT_CAPABILITY>::function(
Tensor& result,
const Tensor& self) {
allImpl<ceilVOP, CURRENT_CAPABILITY>(result, self, "ceil");
}
namespace {
template <typename T>
struct floorVOP {
Vec256<T> operator()(Vec256<T>& x) const {
return x.floor();
}
};
} // namespace
template <>
void floorImplC<CURRENT_CAPABILITY>::function(
Tensor& result,
const Tensor& self) {
allImpl<floorVOP, CURRENT_CAPABILITY>(result, self, "floor");
}
namespace {
template <typename T>
struct roundVOP {
Vec256<T> operator()(Vec256<T>& x) const {
return x.round();
}
};
} // namespace
template <>
void roundImplC<CURRENT_CAPABILITY>::function(
Tensor& result,
const Tensor& self) {
allImpl<roundVOP, CURRENT_CAPABILITY>(result, self, "round");
}
namespace {
template <typename T>
struct truncVOP {
Vec256<T> operator()(Vec256<T>& x) const {
return x.trunc();
}
};
} // namespace
template <>
void truncImplC<CURRENT_CAPABILITY>::function(
Tensor& result,
const Tensor& self) {
allImpl<truncVOP, CURRENT_CAPABILITY>(result, self, "trunc");
}
namespace {
template <typename T>
struct sqrtVOP {
Vec256<T> operator()(Vec256<T>& x) const {
return x.sqrt();
}
};
} // namespace
template <>
void sqrtImplC<CURRENT_CAPABILITY>::function(
Tensor& result,
const Tensor& self) {
allImpl<sqrtVOP, CURRENT_CAPABILITY>(result, self, "sqrt");
}
}} // namespace at::native

View File

@ -0,0 +1,56 @@
#pragma once
#include <ATen/ATen.h>
#include <ATen/Parallel.h>
#include <stdexcept>
#include "CapabilityDispatch.h"
namespace at { namespace native {
template <CPUCapability C>
struct ceilImplC {
static void function(Tensor& result, const Tensor& self);
};
template <CPUCapability C>
struct floorImplC {
static void function(Tensor& result, const Tensor& self);
};
template <CPUCapability C>
struct roundImplC {
static void function(Tensor& result, const Tensor& self);
};
template <CPUCapability C>
struct truncImplC {
static void function(Tensor& result, const Tensor& self);
};
template <CPUCapability C>
struct sqrtImplC {
static void function(Tensor& result, const Tensor& self);
};
// Missing unary functions
// TODO: Add generic apply function for contiguous and non-contiguous tensors
// The goal here is to move more ops entirely into ATen and take advantage of
// automatic vectorization with file-specific flags
// acos
// asin
// atan
// cos
// cosh
// digamma
// erf
// erfinv
// exp
// expm1
// frac
// lgamma
// log1p
// log
// rsqrt
// sigmoid
// sin
// sinh
// tan
// tanh
// trunc
}} // namespace at::native
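
For a sense of what extending the list above involves: an op without a dedicated AVX intrinsic (exp, for instance) would presumably fall back to the generic Vec256<T>::map path shown later in this commit, applying the scalar function lane by lane. A minimal self-contained sketch of that fallback shape, with SmallVec as a hypothetical stand-in for the real Vec256 (none of these names are part of this commit):

#include <cmath>
#include <cstddef>
#include <cstring>

// Scalar fallback in the spirit of Vec256<T>::map: 32 bytes of lanes in a
// plain C array, with elementwise application of a function pointer.
template <class T>
struct SmallVec {
  static constexpr size_t size = 32 / sizeof(T);
  T values[size];
  SmallVec<T> map(T (*f)(T)) const {
    SmallVec<T> ret;
    for (size_t i = 0; i < size; ++i)
      ret.values[i] = f(values[i]);
    return ret;
  }
  SmallVec<T> exp() const { return map(std::exp); }
};

// Elementwise exp over a buffer: full "vectors" first, scalar tail after,
// mirroring the loop structure of kernel_ in UnaryOpsKernel.cpp.
template <class T>
void exp_fallback(T* out, const T* in, size_t n) {
  SmallVec<T> v;
  size_t k = 0;
  for (; k + SmallVec<T>::size <= n; k += SmallVec<T>::size) {
    std::memcpy(v.values, in + k, sizeof(v.values));
    SmallVec<T> r = v.exp();
    std::memcpy(out + k, r.values, sizeof(r.values));
  }
  for (; k < n; ++k)
    out[k] = std::exp(in[k]);
}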

View File

@ -15,7 +15,7 @@
#elif defined(__GNUC__) && defined(__IWMMXT__)
/* GCC-compatible compiler, targeting ARM with WMMX */
#include <mmintrin.h>
#elif (defined(__GNUC__) || defined(__xlC__)) && \
#elif (defined(__GNUC__) || defined(__xlC__)) && \
(defined(__VEC__) || defined(__ALTIVEC__))
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
#include <altivec.h>
@ -25,94 +25,212 @@
#endif
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
// NOTE:
// If you specialize on a type, you must define all operations!
// C arrays and intrinsic types don't mix
namespace at {
namespace native {
namespace vec256 {
namespace at { namespace native { namespace vec256 {
template <class T> class Vec256 {
public:
template <class T>
class Vec256 {
public:
T values[32 / sizeof(T)]; // Mimics AVX behavior
inline void load(const T *ptr) {
std::memcpy(values, ptr, 32);
inline void load(const T* ptr) {
std::memcpy(values, ptr, 32);
};
inline void store(T *ptr) { std::memcpy(ptr, values, 32); }
inline size_t size() { return 32 / sizeof(T); }
inline void operator=(const Vec256<T> &b) {
inline void store(T* ptr) const {
std::memcpy(ptr, values, 32);
}
inline void load(const T* ptr, size_t count) {
std::memcpy(values, ptr, 32 / sizeof(T) * count);
};
inline void store(T* ptr, size_t count) const {
std::memcpy(ptr, values, 32 / sizeof(T) * count);
}
size_t size = 32 / sizeof(T);
inline void operator=(const Vec256<T>& b) {
std::memcpy(values, b.values, 32);
}
inline Vec256<T> map(T (*f)(T)) {
Vec256<T> ret;
for (size_t i = 0; i < size; i++)
ret.values[i] = f(values[i]);
return ret;
}
inline Vec256<T> ceil() {
return map(std::ceil);
}
inline Vec256<T> floor() {
return map(std::floor);
}
inline Vec256<T> round() {
return map(std::round);
}
inline Vec256<T> trunc() {
return map(std::trunc);
}
inline Vec256<T> sqrt() {
return map(std::sqrt);
}
};
template <class T> Vec256<T> operator+(const Vec256<T> &a, const Vec256<T> &b) {
template <class T>
Vec256<T> operator+(const Vec256<T>& a, const Vec256<T>& b) {
Vec256<T> c = Vec256<T>();
for (size_t i = 0; i < c.size(); i++) {
for (size_t i = 0; i < a.size; i++)
c.values[i] = a.values[i] + b.values[i];
}
return c;
}
template <class T> Vec256<T> operator*(const Vec256<T> &a, const Vec256<T> &b) {
template <class T>
Vec256<T> operator*(const Vec256<T>& a, const Vec256<T>& b) {
Vec256<T> c = Vec256<T>();
for (size_t i = 0; i < c.size(); i++) {
for (size_t i = 0; i < a.size; i++)
c.values[i] = a.values[i] * b.values[i];
}
return c;
}
#ifdef __AVX__
template <> class Vec256<float> {
public:
template <>
class Vec256<float> {
public:
__m256 values;
Vec256<float>() {}
inline void load(const float *ptr) { values = _mm256_loadu_ps(ptr); }
inline void store(float *ptr) { _mm256_storeu_ps(ptr, values); }
inline size_t size() { return 32 / sizeof(float); }
inline void operator=(const Vec256<float> &b) { values = b.values; }
};
template <> class Vec256<double> {
public:
__m256d values;
Vec256<double>() {}
inline void load(const double *ptr) { values = _mm256_loadu_pd(ptr); }
inline void store(double *ptr) { _mm256_storeu_pd(ptr, values); }
inline size_t size() { return 32 / sizeof(double); }
inline void operator=(const Vec256<double> &b) { values = b.values; }
inline void load(const float* ptr) {
values = _mm256_loadu_ps(ptr);
}
inline void store(float* ptr) const {
_mm256_storeu_ps(ptr, values);
}
inline void load(const float* ptr, size_t count) {
float tmp_values[8];
std::memcpy(tmp_values, ptr, count * sizeof(float));
load(tmp_values);
}
inline void store(float* ptr, size_t count) const {
float tmp_values[8];
store(tmp_values);
std::memcpy(ptr, tmp_values, count * sizeof(float));
}
size_t size = 8;
inline void operator=(const Vec256<float>& b) {
values = b.values;
}
inline Vec256<float> ceil() {
Vec256<float> ret;
ret.values = _mm256_ceil_ps(values);
return ret;
}
inline Vec256<float> floor() {
Vec256<float> ret;
ret.values = _mm256_floor_ps(values);
return ret;
}
inline Vec256<float> round() {
Vec256<float> ret;
ret.values = _mm256_round_ps(
values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
return ret;
}
inline Vec256<float> trunc() {
Vec256<float> ret;
ret.values =
_mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
return ret;
}
inline Vec256<float> sqrt() {
Vec256<float> ret;
ret.values = _mm256_sqrt_ps(values);
return ret;
}
};
template <>
Vec256<float> inline operator+(const Vec256<float> &a, const Vec256<float> &b) {
class Vec256<double> {
public:
__m256d values;
Vec256<double>() {}
inline void load(const double* ptr) {
values = _mm256_loadu_pd(ptr);
}
inline void store(double* ptr) const {
_mm256_storeu_pd(ptr, values);
}
inline void load(const double* ptr, size_t count) {
double tmp_values[4];
std::memcpy(tmp_values, ptr, count * sizeof(double));
load(tmp_values);
}
inline void store(double* ptr, size_t count) const {
double tmp_values[4];
store(tmp_values);
std::memcpy(ptr, tmp_values, count * sizeof(double));
}
size_t size = 4;
inline void operator=(const Vec256<double>& b) {
values = b.values;
}
inline Vec256<double> ceil() {
Vec256<double> ret;
ret.values = _mm256_ceil_pd(values);
return ret;
}
inline Vec256<double> floor() {
Vec256<double> ret;
ret.values = _mm256_floor_pd(values);
return ret;
}
inline Vec256<double> round() {
Vec256<double> ret;
ret.values = _mm256_round_pd(
values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
return ret;
}
inline Vec256<double> trunc() {
Vec256<double> ret;
ret.values =
_mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
return ret;
}
inline Vec256<double> sqrt() {
Vec256<double> ret;
ret.values = _mm256_sqrt_pd(values);
return ret;
}
};
template <>
Vec256<float> inline operator+(const Vec256<float>& a, const Vec256<float>& b) {
Vec256<float> c = Vec256<float>();
c.values = _mm256_add_ps(a.values, b.values);
return c;
}
template <>
Vec256<float> inline operator*(const Vec256<float> &a, const Vec256<float> &b) {
Vec256<float> inline operator*(const Vec256<float>& a, const Vec256<float>& b) {
Vec256<float> c = Vec256<float>();
c.values = _mm256_mul_ps(a.values, b.values);
return c;
}
template <>
Vec256<double> inline operator+(const Vec256<double> &a,
const Vec256<double> &b) {
Vec256<double> inline operator+(
const Vec256<double>& a,
const Vec256<double>& b) {
Vec256<double> c = Vec256<double>();
c.values = _mm256_add_pd(a.values, b.values);
return c;
}
template <>
Vec256<double> inline operator*(const Vec256<double> &a,
const Vec256<double> &b) {
Vec256<double> inline operator*(
const Vec256<double>& a,
const Vec256<double>& b) {
Vec256<double> c = Vec256<double>();
c.values = _mm256_mul_pd(a.values, b.values);
return c;
@ -120,67 +238,109 @@ Vec256<double> inline operator*(const Vec256<double> &a,
#endif
#ifdef __AVX2__
template <> class Vec256<int64_t> {
public:
template <>
class Vec256<int64_t> {
public:
__m256i values;
Vec256<int64_t>() {}
inline void load(const int64_t *ptr) {
values = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
inline void load(const int64_t* ptr) {
values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
}
inline void store(int64_t *ptr) {
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), values);
inline void store(int64_t* ptr) const {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
}
inline size_t size() { return 32 / sizeof(int64_t); }
inline void operator=(const Vec256<int64_t> &b) { values = b.values; }
};
template <> class Vec256<int32_t> {
public:
__m256i values;
Vec256<int32_t>() {}
inline void load(const int32_t *ptr) {
values = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
inline void load(const int64_t* ptr, size_t count) {
int64_t tmp_values[4];
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
load(tmp_values);
}
inline void store(int32_t *ptr) {
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), values);
inline void store(int64_t* ptr, size_t count) const {
int64_t tmp_values[4];
store(tmp_values);
std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
}
inline size_t size() { return 32 / sizeof(int32_t); }
inline void operator=(const Vec256<int32_t> &b) { values = b.values; }
};
template <> class Vec256<int16_t> {
public:
__m256i values;
Vec256<int16_t>() {}
inline void load(const int16_t *ptr) {
values = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
size_t size = 4;
inline void operator=(const Vec256<int64_t>& b) {
values = b.values;
}
inline void store(int16_t *ptr) {
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), values);
}
inline size_t size() { return 32 / sizeof(int16_t); }
inline void operator=(const Vec256<int16_t> &b) { values = b.values; }
};
template <>
Vec256<int64_t> inline operator+(const Vec256<int64_t> &a,
const Vec256<int64_t> &b) {
class Vec256<int32_t> {
public:
__m256i values;
Vec256<int32_t>() {}
inline void load(const int32_t* ptr) {
values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
}
inline void store(int32_t* ptr) const {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
}
inline void load(const int32_t* ptr, size_t count) {
int32_t tmp_values[8];
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
load(tmp_values);
}
inline void store(int32_t* ptr, size_t count) const {
int32_t tmp_values[8];
store(tmp_values);
std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
}
size_t size = 8;
inline void operator=(const Vec256<int32_t>& b) {
values = b.values;
}
};
template <>
class Vec256<int16_t> {
public:
__m256i values;
Vec256<int16_t>() {}
inline void load(const int16_t* ptr) {
values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
}
inline void store(int16_t* ptr) const {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
}
inline void load(const int16_t* ptr, size_t count) {
int16_t tmp_values[16];
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
load(tmp_values);
}
inline void store(int16_t* ptr, size_t count) const {
int16_t tmp_values[16];
store(tmp_values);
std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
}
size_t size = 16;
inline void operator=(const Vec256<int16_t>& b) {
values = b.values;
}
};
template <>
Vec256<int64_t> inline operator+(
const Vec256<int64_t>& a,
const Vec256<int64_t>& b) {
Vec256<int64_t> c = Vec256<int64_t>();
c.values = _mm256_add_epi64(a.values, b.values);
return c;
}
template <>
Vec256<int32_t> inline operator+(const Vec256<int32_t> &a,
const Vec256<int32_t> &b) {
Vec256<int32_t> inline operator+(
const Vec256<int32_t>& a,
const Vec256<int32_t>& b) {
Vec256<int32_t> c = Vec256<int32_t>();
c.values = _mm256_add_epi32(a.values, b.values);
return c;
}
template <>
Vec256<int16_t> inline operator+(const Vec256<int16_t> &a,
const Vec256<int16_t> &b) {
Vec256<int16_t> inline operator+(
const Vec256<int16_t>& a,
const Vec256<int16_t>& b) {
Vec256<int16_t> c = Vec256<int16_t>();
c.values = _mm256_add_epi16(a.values, b.values);
return c;
@ -191,8 +351,9 @@ Vec256<int16_t> inline operator+(const Vec256<int16_t> &a,
// This is also technically avx compatible, but then we'll need AVX
// code for add as well.
template <>
Vec256<int64_t> inline operator*(const Vec256<int64_t> &a,
const Vec256<int64_t> &b) {
Vec256<int64_t> inline operator*(
const Vec256<int64_t>& a,
const Vec256<int64_t>& b) {
Vec256<int64_t> c = Vec256<int64_t>();
int64_t a0 = _mm256_extract_epi64(a.values, 0);
@ -215,21 +376,21 @@ Vec256<int64_t> inline operator*(const Vec256<int64_t> &a,
}
template <>
Vec256<int32_t> inline operator*(const Vec256<int32_t> &a,
const Vec256<int32_t> &b) {
Vec256<int32_t> inline operator*(
const Vec256<int32_t>& a,
const Vec256<int32_t>& b) {
Vec256<int32_t> c = Vec256<int32_t>();
c.values = _mm256_mullo_epi32(a.values, b.values);
return c;
}
template <>
Vec256<int16_t> inline operator*(const Vec256<int16_t> &a,
const Vec256<int16_t> &b) {
Vec256<int16_t> inline operator*(
const Vec256<int16_t>& a,
const Vec256<int16_t>& b) {
Vec256<int16_t> c = Vec256<int16_t>();
c.values = _mm256_mullo_epi16(a.values, b.values);
return c;
}
#endif
}
}
}
}}} // namespace at::native::vec256

View File

@ -29,6 +29,18 @@
- func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor
variants: function, method
- func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor
variants: function
- func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
variants: function
- func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor grad_output, Tensor grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state, Tensor reserve, std::array<bool,4> output_mask) -> (Tensor, Tensor, Tensor, TensorList)
variants: function
- func: _cudnn_init_dropout_state(Type ty, double dropout, bool train, int64_t dropout_seed) -> Tensor
variants: function
- func: adaptive_avg_pool1d(Tensor self, IntList[1] output_size) -> Tensor
variants: function
@ -80,6 +92,16 @@
- func: cat_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor
variants: function
- func: ceil(Tensor self) -> Tensor
- func: ceil_(Tensor self) -> Tensor
- func: ceil_out(Tensor result, Tensor self) -> Tensor
variants: function
dispatch:
CPU: _ceil_out_cpu
CUDA: _ceil_out_cuda
- func: chunk(Tensor self, int64_t chunks, int64_t dim=0) -> TensorList
- func: cudnn_is_acceptable(Tensor self) -> bool
@ -271,6 +293,16 @@
CPU: eye_out_cpu
CUDA: eye_out_cuda
- func: floor(Tensor self) -> Tensor
- func: floor_(Tensor self) -> Tensor
- func: floor_out(Tensor result, Tensor self) -> Tensor
variants: function
dispatch:
CPU: _floor_out_cpu
CUDA: _floor_out_cuda
- func: full(Type dtype, IntList size, Scalar fill_value) -> Tensor
variants: function
@ -437,18 +469,28 @@
CPU: RoiPooling2d_backward_cpu
CUDA: RoiPooling2d_backward_cuda
- func: round(Tensor self) -> Tensor
- func: round_(Tensor self) -> Tensor
- func: round_out(Tensor result, Tensor self) -> Tensor
variants: function
dispatch:
CPU: _round_out_cpu
CUDA: _round_out_cuda
- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor
variants: function
- func: rrelu_(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor
variants: function
- func: select(Tensor self, int64_t dim, int64_t index) -> Tensor
- func: relu(Tensor self) -> Tensor
- func: relu_(Tensor self) -> Tensor
- func: select(Tensor self, int64_t dim, int64_t index) -> Tensor
- func: selu(Tensor self) -> Tensor
variants: function
@ -512,6 +554,16 @@
CPU: _sum_out_cpu
CUDA: _sum_out_cuda
- func: sqrt(Tensor self) -> Tensor
- func: sqrt_(Tensor self) -> Tensor
- func: sqrt_out(Tensor result, Tensor self) -> Tensor
variants: function
dispatch:
CPU: _sqrt_out_cpu
CUDA: _sqrt_out_cuda
- func: prod(Tensor self) -> Tensor
dispatch:
CPU: _prod_cpu
@ -525,14 +577,24 @@
CPU: _prod_out_cpu
CUDA: _prod_out_cuda
- func: t_(Tensor self) -> Tensor
variants: method
- func: transpose_(Tensor self, int64_t dim0, int64_t dim1) -> Tensor
variants: method
- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, double margin=1.0, double p=2, double eps=1e-6, bool swap=false, bool size_average=true, bool reduce=true) -> Tensor
variants: function
- func: t_(Tensor self) -> Tensor
variants: method
- func: trunc(Tensor self) -> Tensor
- func: trunc_(Tensor self) -> Tensor
- func: trunc_out(Tensor result, Tensor self) -> Tensor
variants: function
dispatch:
CPU: _trunc_out_cpu
CUDA: _trunc_out_cuda
- func: type_as(Tensor self, Tensor other) -> Tensor
variants: method
@ -584,15 +646,3 @@
dispatch:
CPU: _s_poisson_cpu
CUDA: _s_poisson_cuda
- func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor
variants: function
- func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
variants: function
- func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor grad_output, Tensor grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state, Tensor reserve, std::array<bool,4> output_mask) -> (Tensor, Tensor, Tensor, TensorList)
variants: function
- func: _cudnn_init_dropout_state(Type ty, double dropout, bool train, int64_t dropout_seed) -> Tensor
variants: function

View File

@ -229,23 +229,27 @@ class TestTorch(TestCase):
self.assertRaises(RuntimeError, lambda: torch.addr(m, v, s))
self.assertRaises(RuntimeError, lambda: torch.addr(m, s, v))
def _testMath(self, torchfn, mathfn):
size = (10, 5)
# contiguous
m1 = torch.randn(*size)
res1 = torchfn(m1[4])
res2 = res1.clone().zero_()
for i, v in enumerate(m1[4]):
res2[i] = mathfn(v.item())
self.assertEqual(res1, res2)
def _testMath(self, torchfn, mathfn, large=True):
def _testMathSize(size, self, torchfn, mathfn):
# contiguous
m1 = torch.randn(*size)
res1 = torchfn(m1[4])
res2 = res1.clone().zero_()
for i, v in enumerate(m1[4]):
res2[i] = mathfn(v.item())
self.assertEqual(res1, res2)
# non-contiguous
m1 = torch.randn(*size)
res1 = torchfn(m1[:, 4])
res2 = res1.clone().zero_()
for i, v in enumerate(m1[:, 4]):
res2[i] = mathfn(v.item())
self.assertEqual(res1, res2)
# non-contiguous
m1 = torch.randn(*size)
res1 = torchfn(m1[:, 4])
res2 = res1.clone().zero_()
for i, v in enumerate(m1[:, 4]):
res2[i] = mathfn(v.item())
self.assertEqual(res1, res2)
_testMathSize((10, 5), self, torchfn, mathfn)
if large:
# Trigger parallelism
_testMathSize((10, 50000), self, torchfn, mathfn)
def _testMathByName(self, function_name):
torchfn = getattr(torch, function_name)
@ -269,8 +273,9 @@ class TestTorch(TestCase):
@unittest.skipIf(not TEST_SCIPY, "Scipy not found")
def test_polygamma(self):
from scipy.special import polygamma
# This test won't work when using many samples
for n in [0, 1]:
self._testMath(lambda x: torch.polygamma(n, x), lambda x: polygamma(n, x)[()])
self._testMath(lambda x: torch.polygamma(n, x), lambda x: polygamma(n, x)[()], large=False)
def test_asin(self):
self._testMath(torch.asin, lambda x: math.asin(x) if abs(x) <= 1 else float('nan'))