Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-06 12:20:52 +01:00
ATen Unary Ops (#6030)
Implements a few unary operations for which there are AVX intrinsics. The perf comparison script is here: https://paste.fedoraproject.org/paste/f1adcJhpGtzDNWImS34XzQ
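The kernels in this diff all follow the same shape: a Vec256-style wrapper that moves 256 bits at a time (AVX intrinsics when available, a plain C array fallback otherwise), a loop over full vectors, and a single masked "tail" step that goes through a small stack buffer so the last partial vector never reads or writes out of bounds. A minimal, self-contained sketch of that pattern follows; it is not the ATen code itself, the names Vec256Sketch, kWidth and unary_sqrt are invented for illustration, and the real kernels additionally parallelize with TBB and dispatch on detected CPU capability.

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

#ifdef __AVX__
#include <immintrin.h>
#endif

constexpr size_t kWidth = 4;  // doubles per 256-bit vector

// 256-bit vector of doubles: AVX fast path, plain-array fallback.
struct Vec256Sketch {
#ifdef __AVX__
  __m256d values;
  void load(const double* ptr) { values = _mm256_loadu_pd(ptr); }
  void store(double* ptr) const { _mm256_storeu_pd(ptr, values); }
  Vec256Sketch sqrt() const {
    Vec256Sketch r;
    r.values = _mm256_sqrt_pd(values);
    return r;
  }
#else
  double values[kWidth];
  void load(const double* ptr) { std::memcpy(values, ptr, sizeof(values)); }
  void store(double* ptr) const { std::memcpy(ptr, values, sizeof(values)); }
  Vec256Sketch sqrt() const {
    Vec256Sketch r;
    for (size_t i = 0; i < kWidth; ++i) r.values[i] = std::sqrt(values[i]);
    return r;
  }
#endif
  // Partial load/store for the tail, staged through a small stack buffer.
  void load(const double* ptr, size_t count) {
    double tmp[kWidth] = {0};
    std::memcpy(tmp, ptr, count * sizeof(double));
    load(tmp);
  }
  void store(double* ptr, size_t count) const {
    double tmp[kWidth];
    store(tmp);
    std::memcpy(ptr, tmp, count * sizeof(double));
  }
};

// Element-wise sqrt over a contiguous buffer: full vectors, then one tail step.
void unary_sqrt(double* out, const double* in, size_t n) {
  Vec256Sketch v;
  size_t k = 0;
  size_t vec_end = n > kWidth ? n - kWidth : 0;
  for (; k < vec_end; k += kWidth) {
    v.load(in + k);
    v.sqrt().store(out + k);
  }
  size_t leftover = n - k;
  v.load(in + k, leftover);  // copies only `leftover` elements
  v.sqrt().store(out + k, leftover);
}

int main() {
  std::vector<double> in = {1, 4, 9, 16, 25, 36, 49};
  std::vector<double> out(in.size());
  unary_sqrt(out.data(), in.data(), in.size());
  for (double x : out) std::printf("%g ", x);  // 1 2 3 4 5 6 7
  std::printf("\n");
  return 0;
}

Built with -mavx, the sqrt call lowers to _mm256_sqrt_pd; without it the same code falls back to std::sqrt, which is roughly how the specialized Vec256<float>/Vec256<double> classes in this diff relate to the generic template.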
This commit is contained in:
parent ebc0194950
commit bde2f6b298
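The per-capability kernels in this change are selected through the function-pointer trick in CapabilityDispatch.h: the exported pointer initially refers to an init routine that queries cpuinfo, rebinds the pointer to the best implementation available on the running machine, and forwards the first call. Below is a rough stand-alone sketch of that idea, with the capability check stubbed out; cpu_has_avx, add_one_impl and the two kernels are illustrative names, not ATen symbols.

#include <cstdio>

using kernel_fn = void (*)(float*, const float*, int);

void kernel_default(float* out, const float* in, int n) {
  for (int i = 0; i < n; ++i) out[i] = in[i] + 1.0f;  // scalar baseline
}

void kernel_avx(float* out, const float* in, int n) {
  // Stand-in for an AVX-vectorized kernel; same semantics as the baseline.
  for (int i = 0; i < n; ++i) out[i] = in[i] + 1.0f;
}

// Stand-in for cpuinfo_initialize()/cpuinfo_has_x86_avx() (assumed here).
bool cpu_has_avx() { return true; }

extern kernel_fn add_one_impl;  // the pointer callers go through

// The very first call lands here: pick the best kernel, rebind, forward.
void init(float* out, const float* in, int n) {
  add_one_impl = cpu_has_avx() ? &kernel_avx : &kernel_default;
  add_one_impl(out, in, n);
}

kernel_fn add_one_impl = &init;

int main() {
  float in[4] = {1, 2, 3, 4};
  float out[4];
  add_one_impl(out, in, 4);  // first call: selects the kernel, then runs it
  add_one_impl(out, in, 4);  // later calls: straight to the chosen kernel
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 2 3 4 5
  return 0;
}

Steady-state calls pay only one indirect call through the pointer; the selection cost is incurred once, on first use.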
@ -1665,20 +1665,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: sqrt_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _sqrt
|
||||
cname: sqrt
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: sqrt
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
@ -1723,20 +1711,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: ceil_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _ceil
|
||||
cname: ceil
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: ceil
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
@ -1752,20 +1728,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: floor_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _floor
|
||||
cname: floor
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: floor
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
@ -1781,20 +1745,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: round_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _round
|
||||
cname: round
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: round
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
@ -1810,20 +1762,8 @@
|
|||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: trunc_
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
- CPU
|
||||
- CUDA
|
||||
name: _trunc
|
||||
cname: trunc
|
||||
return: self
|
||||
arguments:
|
||||
- THTensor* self
|
||||
- THTensor* self
|
||||
]]
|
||||
[[
|
||||
name: trunc
|
||||
types:
|
||||
- floating_point
|
||||
backends:
|
||||
|
|
|
|||
|
|
@ -4,15 +4,15 @@
|
|||
#include <tbb/parallel_reduce.h>
|
||||
#include <tbb/partitioner.h>
|
||||
#include <tbb/tbb.h>
|
||||
#include <thread>
|
||||
#include <cassert>
|
||||
#include <thread>
|
||||
|
||||
namespace at {
|
||||
namespace internal {
|
||||
namespace at { namespace internal {
|
||||
|
||||
// thread_local variable with internal linkage
|
||||
// requires no guarding as its storage duration is defined to be per thread
|
||||
static thread_local tbb::task_scheduler_init tbbinit(tbb::task_scheduler_init::deferred);
|
||||
static thread_local tbb::task_scheduler_init tbbinit(
|
||||
tbb::task_scheduler_init::deferred);
|
||||
// Tracks the number of threads in use, which TBB doesn't track.
|
||||
static thread_local int num_threads_ = -1;
|
||||
|
||||
|
|
@ -22,9 +22,9 @@ void init_tbb_num_threads() {
|
|||
int num_threads = at::get_num_threads();
|
||||
// In order to have control over the number of threads this function
|
||||
// must be called first before any other tbb parallel construct is
|
||||
// exercised within a particular thread. Otherwise the default
|
||||
// scheduler will be created over which we do not have control.
|
||||
// The following code will and must throw an error if tbb has
|
||||
// already been initialized before this function was called.
|
||||
if (!tbbinit.is_active() && !first_call)
|
||||
throw std::runtime_error(
|
||||
|
|
@ -51,5 +51,4 @@ void init_tbb_num_threads() {
|
|||
num_threads_ = num_threads;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}} // namespace at::internal
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
#include <cstddef>
|
||||
#include <ATen/ATen.h>
|
||||
#include <tbb/tbb.h>
|
||||
#include <cstddef>
|
||||
|
||||
namespace at {
|
||||
namespace internal {
|
||||
|
|
@ -21,12 +22,15 @@ void init_tbb_num_threads();
|
|||
// no parallel algorithm (such as parallel_reduce) should split work into
|
||||
// smaller than GRAIN_SIZE chunks.
|
||||
constexpr size_t TBB_GRAIN_SIZE = 32768;
|
||||
}
|
||||
} // namespace internal
|
||||
|
||||
template <class T, template <class> class OP>
|
||||
T parallel_reduce(T (*f)(const T *, size_t, size_t, T), const T *data,
|
||||
size_t start, size_t end, T init_) {
|
||||
|
||||
T parallel_reduce(
|
||||
T (*f)(const T*, size_t, size_t, T),
|
||||
const T* data,
|
||||
size_t start,
|
||||
size_t end,
|
||||
T init_) {
|
||||
internal::init_tbb_num_threads();
|
||||
|
||||
T result_;
|
||||
|
|
@ -35,19 +39,25 @@ T parallel_reduce(T (*f)(const T *, size_t, size_t, T), const T *data,
|
|||
result_ = f(data, start, end, init_);
|
||||
} else {
|
||||
result_ = tbb::parallel_reduce(
|
||||
tbb::blocked_range<size_t>(start, end, internal::TBB_GRAIN_SIZE), init_,
|
||||
tbb::blocked_range<size_t>(start, end, internal::TBB_GRAIN_SIZE),
|
||||
init_,
|
||||
[&data, &f](const tbb::blocked_range<size_t> r, T init) -> T {
|
||||
return f(data, r.begin(), r.end(), init);
|
||||
},
|
||||
OP<T>(), ap);
|
||||
OP<T>(),
|
||||
ap);
|
||||
}
|
||||
return result_;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void parallel_reduce_2d(void (*f)(const T *, T *, size_t, size_t), size_t num_rows,
|
||||
size_t num_cols, size_t numel, const T *arr_, T *outarr_) {
|
||||
|
||||
void parallel_reduce_2d(
|
||||
void (*f)(const T*, T*, size_t, size_t),
|
||||
size_t num_rows,
|
||||
size_t num_cols,
|
||||
size_t numel,
|
||||
const T* arr_,
|
||||
T* outarr_) {
|
||||
internal::init_tbb_num_threads();
|
||||
|
||||
static tbb::affinity_partitioner ap;
|
||||
|
|
@ -63,19 +73,44 @@ void parallel_reduce_2d(void (*f)(const T *, T *, size_t, size_t), size_t num_ro
|
|||
f(arr, outarr, num_rows, num_cols);
|
||||
}
|
||||
} else {
|
||||
tbb::parallel_for(tbb::blocked_range<size_t>(
|
||||
0, max_i_, 1),
|
||||
[&arr_, &outarr_, num_rows, num_cols,
|
||||
&f](const tbb::blocked_range<size_t> r) {
|
||||
for (size_t i_ = r.begin(); i_ < r.end(); i_++) {
|
||||
int64_t i = i_ * num_rows * num_cols;
|
||||
int64_t i_r = i_ * num_cols;
|
||||
const T *arr = arr_ + i;
|
||||
T *outarr = outarr_ + i_r;
|
||||
f(arr, outarr, num_rows, num_cols);
|
||||
}
|
||||
},
|
||||
ap);
|
||||
tbb::parallel_for(
|
||||
tbb::blocked_range<size_t>(0, max_i_, 1),
|
||||
[&arr_, &outarr_, num_rows, num_cols, &f](
|
||||
const tbb::blocked_range<size_t> r) {
|
||||
for (size_t i_ = r.begin(); i_ < r.end(); i_++) {
|
||||
int64_t i = i_ * num_rows * num_cols;
|
||||
int64_t i_r = i_ * num_cols;
|
||||
const T* arr = arr_ + i;
|
||||
T* outarr = outarr_ + i_r;
|
||||
f(arr, outarr, num_rows, num_cols);
|
||||
}
|
||||
},
|
||||
ap);
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
void parallel_for_1d(
|
||||
void (*f)(T*, const T*, size_t, size_t),
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
internal::init_tbb_num_threads();
|
||||
|
||||
static tbb::affinity_partitioner ap;
|
||||
|
||||
T* arr_out = result.data<T>();
|
||||
const T* arr_in = self.data<T>();
|
||||
size_t start = 0;
|
||||
size_t end = self.numel();
|
||||
if (end - start < internal::TBB_GRAIN_SIZE) {
|
||||
f(arr_out, arr_in, start, end);
|
||||
} else {
|
||||
tbb::parallel_for(
|
||||
tbb::blocked_range<size_t>(start, end, internal::TBB_GRAIN_SIZE),
|
||||
[&arr_out, &arr_in, &f](const tbb::blocked_range<size_t> r) {
|
||||
f(arr_out, arr_in, r.begin(), r.end());
|
||||
},
|
||||
ap);
|
||||
}
|
||||
}
|
||||
} // namespace at
|
||||
|
|
|
|||
|
|
@ -12,43 +12,46 @@
|
|||
|
||||
#include <map>
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
namespace at { namespace native {
|
||||
|
||||
using reduce_type = void(Tensor &, const Tensor &, size_t, bool);
|
||||
reduce_type *sumImpl = &DispatchStub<reduce_type>::init<sumImplC, &sumImpl>;
|
||||
reduce_type *prodImpl = &DispatchStub<reduce_type>::init<prodImplC, &prodImpl>;
|
||||
using reduce_type = void(Tensor&, const Tensor&, size_t, bool);
|
||||
reduce_type* sumImpl = &DispatchStub<reduce_type>::init<sumImplC, &sumImpl>;
|
||||
reduce_type* prodImpl = &DispatchStub<reduce_type>::init<prodImplC, &prodImpl>;
|
||||
|
||||
// ALL REDUCE #################################################################
|
||||
|
||||
Tensor _reduce_cpu(reduce_type *f, const Tensor &self) {
|
||||
Tensor _reduce_cpu(reduce_type* f, const Tensor& self) {
|
||||
Tensor result = self.type().tensor({});
|
||||
f(result, self, 0, true);
|
||||
return result;
|
||||
}
|
||||
|
||||
Tensor _sum_cpu(const Tensor &self) {
|
||||
Tensor _sum_cpu(const Tensor& self) {
|
||||
if (self.is_contiguous())
|
||||
return _reduce_cpu(sumImpl, self);
|
||||
return self._sumall();
|
||||
}
|
||||
|
||||
Tensor _prod_cpu(const Tensor &self) {
|
||||
Tensor _prod_cpu(const Tensor& self) {
|
||||
if (self.is_contiguous())
|
||||
return _reduce_cpu(prodImpl, self);
|
||||
return self._prodall();
|
||||
}
|
||||
|
||||
Tensor _sum_cuda(const Tensor &self_) { return self_._sumall(); }
|
||||
Tensor _sum_cuda(const Tensor& self_) {
|
||||
return self_._sumall();
|
||||
}
|
||||
|
||||
Tensor _prod_cuda(const Tensor &self_) { return self_._prodall(); }
|
||||
Tensor _prod_cuda(const Tensor& self_) {
|
||||
return self_._prodall();
|
||||
}
|
||||
|
||||
// \ALL REDUCE ################################################################
|
||||
|
||||
// DIM REDUCE #################################################################
|
||||
|
||||
static bool _dimreduce_return_trivial(Tensor &result, const Tensor &self,
|
||||
int64_t ident) {
|
||||
static bool
|
||||
_dimreduce_return_trivial(Tensor& result, const Tensor& self, int64_t ident) {
|
||||
if (self.numel() == 1 && self.ndimension() == 0) {
|
||||
result.resize_({});
|
||||
result.fill_(self);
|
||||
|
|
@ -63,8 +66,8 @@ static bool _dimreduce_return_trivial(Tensor &result, const Tensor &self,
|
|||
return false;
|
||||
}
|
||||
|
||||
static Tensor &_dimreduce_setup(Tensor &result, const Tensor &self,
|
||||
int64_t dim) {
|
||||
static Tensor&
|
||||
_dimreduce_setup(Tensor& result, const Tensor& self, int64_t dim) {
|
||||
IntList self_sizes = self.sizes();
|
||||
std::vector<int64_t> result_sizes;
|
||||
result_sizes.insert(result_sizes.end(), self_sizes.begin(), self_sizes.end());
|
||||
|
|
@ -73,8 +76,12 @@ static Tensor &_dimreduce_setup(Tensor &result, const Tensor &self,
|
|||
return result;
|
||||
}
|
||||
|
||||
Tensor &_reduce_out_cpu(reduce_type *f, Tensor &result, const Tensor &self,
|
||||
int64_t dim, bool keepdim) {
|
||||
Tensor& _reduce_out_cpu(
|
||||
reduce_type* f,
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
int64_t dim,
|
||||
bool keepdim) {
|
||||
result = _dimreduce_setup(result, self, dim);
|
||||
f(result, self, dim, false);
|
||||
if (!keepdim)
|
||||
|
|
@ -82,8 +89,8 @@ Tensor &_reduce_out_cpu(reduce_type *f, Tensor &result, const Tensor &self,
|
|||
return result;
|
||||
}
|
||||
|
||||
Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
|
||||
bool keepdim) {
|
||||
Tensor&
|
||||
_sum_out_cpu(Tensor& result, const Tensor& self, int64_t dim_, bool keepdim) {
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
if (_dimreduce_return_trivial(result, self, 0))
|
||||
return result;
|
||||
|
|
@ -93,8 +100,8 @@ Tensor &_sum_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
|
|||
return at::_sum_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
|
||||
bool keepdim) {
|
||||
Tensor&
|
||||
_prod_out_cpu(Tensor& result, const Tensor& self, int64_t dim_, bool keepdim) {
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
if (_dimreduce_return_trivial(result, self, 1))
|
||||
return result;
|
||||
|
|
@ -104,28 +111,27 @@ Tensor &_prod_out_cpu(Tensor &result, const Tensor &self, int64_t dim_,
|
|||
return at::_prod_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor &_sum_out_cuda(Tensor &result, const Tensor &self, int64_t dim,
|
||||
bool keepdim) {
|
||||
Tensor&
|
||||
_sum_out_cuda(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) {
|
||||
return at::_sum_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor &_prod_out_cuda(Tensor &result, const Tensor &self, int64_t dim,
|
||||
bool keepdim) {
|
||||
Tensor&
|
||||
_prod_out_cuda(Tensor& result, const Tensor& self, int64_t dim, bool keepdim) {
|
||||
return at::_prod_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor sum(const Tensor &self, int64_t dim_, bool keepdim) {
|
||||
Tensor sum(const Tensor& self, int64_t dim_, bool keepdim) {
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
Tensor result = self.type().tensor();
|
||||
return at::sum_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
Tensor prod(const Tensor &self, int64_t dim_, bool keepdim) {
|
||||
Tensor prod(const Tensor& self, int64_t dim_, bool keepdim) {
|
||||
int64_t dim = maybe_wrap_dim(dim_, self.dim());
|
||||
Tensor result = self.type().tensor();
|
||||
return at::prod_out(result, self, dim, keepdim);
|
||||
}
|
||||
|
||||
// \DIM REDUCE ################################################################
|
||||
}
|
||||
}
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
118 aten/src/ATen/native/UnaryOps.cpp Normal file
|
|
@ -0,0 +1,118 @@
|
|||
#include "ATen/ATen.h"
|
||||
#include "ATen/Dispatch.h"
|
||||
#include "ATen/ExpandUtils.h"
|
||||
#include "ATen/NativeFunctions.h"
|
||||
#include "ATen/WrapDimUtils.h"
|
||||
#include "cpu/UnaryOpsKernel.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <numeric>
|
||||
#include <vector>
|
||||
|
||||
#include <map>
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
using unary_type = void(Tensor&, const Tensor&);
|
||||
unary_type* ceilImpl = DispatchStub<unary_type>::init<ceilImplC, &ceilImpl>;
|
||||
unary_type* floorImpl = DispatchStub<unary_type>::init<floorImplC, &floorImpl>;
|
||||
unary_type* roundImpl = DispatchStub<unary_type>::init<roundImplC, &roundImpl>;
|
||||
unary_type* truncImpl = DispatchStub<unary_type>::init<truncImplC, &truncImpl>;
|
||||
unary_type* sqrtImpl = DispatchStub<unary_type>::init<sqrtImplC, &sqrtImpl>;
|
||||
|
||||
// WRAP OPS #################################################################
|
||||
|
||||
Tensor ceil(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::ceil_out(result, self);
|
||||
}
|
||||
Tensor floor(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::floor_out(result, self);
|
||||
}
|
||||
Tensor round(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::round_out(result, self);
|
||||
}
|
||||
Tensor trunc(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::trunc_out(result, self);
|
||||
}
|
||||
Tensor sqrt(const Tensor& self) {
|
||||
Tensor result = self.type().tensor();
|
||||
return at::sqrt_out(result, self);
|
||||
}
|
||||
|
||||
Tensor& ceil_(Tensor& self) {
|
||||
return at::ceil_out(self, self);
|
||||
}
|
||||
Tensor& floor_(Tensor& self) {
|
||||
return at::floor_out(self, self);
|
||||
}
|
||||
Tensor& round_(Tensor& self) {
|
||||
return at::round_out(self, self);
|
||||
}
|
||||
Tensor& trunc_(Tensor& self) {
|
||||
return at::trunc_out(self, self);
|
||||
}
|
||||
Tensor& sqrt_(Tensor& self) {
|
||||
return at::sqrt_out(self, self);
|
||||
}
|
||||
|
||||
// \WRAP OPS #################################################################
|
||||
|
||||
bool _unops_out_cpu(unary_type* f, Tensor& result, const Tensor& self) {
|
||||
if (result.is_contiguous() && self.is_contiguous()) {
|
||||
result.resize_(self.sizes());
|
||||
f(result, self);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// CPU OPS ###################################################################
|
||||
|
||||
Tensor& _ceil_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(ceilImpl, result, self) ? result
|
||||
: at::_ceil_out(result, self);
|
||||
}
|
||||
Tensor& _floor_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(floorImpl, result, self) ? result
|
||||
: at::_floor_out(result, self);
|
||||
}
|
||||
Tensor& _round_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(roundImpl, result, self) ? result
|
||||
: at::_round_out(result, self);
|
||||
}
|
||||
Tensor& _trunc_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(truncImpl, result, self) ? result
|
||||
: at::_trunc_out(result, self);
|
||||
}
|
||||
Tensor& _sqrt_out_cpu(Tensor& result, const Tensor& self) {
|
||||
return _unops_out_cpu(sqrtImpl, result, self) ? result
|
||||
: at::_sqrt_out(result, self);
|
||||
}
|
||||
|
||||
// \CPU OPS #################################################################
|
||||
|
||||
// CUDA OPS #################################################################
|
||||
|
||||
Tensor& _ceil_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_ceil_out(result, self);
|
||||
}
|
||||
Tensor& _floor_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_floor_out(result, self);
|
||||
}
|
||||
Tensor& _round_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_round_out(result, self);
|
||||
}
|
||||
Tensor& _trunc_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_trunc_out(result, self);
|
||||
}
|
||||
Tensor& _sqrt_out_cuda(Tensor& result, const Tensor& self) {
|
||||
return at::_sqrt_out(result, self);
|
||||
}
|
||||
|
||||
// \CUDA OPS ################################################################
|
||||
}} // namespace at::native
|
||||
|
|
@ -1,9 +1,8 @@
|
|||
#include "ATen/cpu/cpuinfo/include/cpuinfo.h"
|
||||
#include <type_traits>
|
||||
#include <iostream>
|
||||
#include <type_traits>
|
||||
#include "ATen/cpu/cpuinfo/include/cpuinfo.h"
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
namespace at { namespace native {
|
||||
|
||||
enum class CPUCapability { DEFAULT, AVX, AVX2 };
|
||||
|
||||
|
|
@ -15,19 +14,19 @@ constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::AVX;
|
|||
constexpr CPUCapability CURRENT_CAPABILITY = CPUCapability::AVX2;
|
||||
#endif
|
||||
|
||||
template <typename FnType> struct DispatchStub {};
|
||||
template <typename FnType>
|
||||
struct DispatchStub {};
|
||||
|
||||
template <typename... ArgTypes>
|
||||
struct DispatchStub<void(ArgTypes...)> {
|
||||
using FnType = void(ArgTypes...);
|
||||
|
||||
template <template <CPUCapability> class allImpl,
|
||||
FnType **dispatch_ptr>
|
||||
template <template <CPUCapability> class allImpl, FnType** dispatch_ptr>
|
||||
static void init(ArgTypes... args) {
|
||||
*dispatch_ptr = allImpl<CPUCapability::DEFAULT>::function;
|
||||
// Check if platform is supported
|
||||
if (cpuinfo_initialize()) {
|
||||
// Set function pointer to best implementation last
|
||||
#if defined(HAVE_AVX_CPU_DEFINITION)
|
||||
if (cpuinfo_has_x86_avx()) {
|
||||
*dispatch_ptr = allImpl<CPUCapability::AVX>::function;
|
||||
|
|
@ -43,5 +42,4 @@ struct DispatchStub<void(ArgTypes...)> {
|
|||
}
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
|
|
@ -1,16 +1,16 @@
|
|||
#include "ATen/native/cpu/ReduceOpsKernel.h"
|
||||
#include "ATen/Dispatch.h"
|
||||
#include "ATen/Parallel.h"
|
||||
#include "ATen/native/cpu/Vec256.h"
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
namespace at { namespace native {
|
||||
|
||||
using namespace vec256;
|
||||
|
||||
// This adds the content of arr to sum
|
||||
template <class scalar_t, template <class> class OP, CPUCapability C>
|
||||
inline scalar_t allreduce_kernel_(const scalar_t *arr, size_t start, size_t end,
|
||||
scalar_t sum) {
|
||||
inline scalar_t
|
||||
allreduce_kernel_(const scalar_t* arr, size_t start, size_t end, scalar_t sum) {
|
||||
Vec256<scalar_t> part_sum;
|
||||
// Use all 16 registers.
|
||||
Vec256<scalar_t> tmp_sum[4], tmp_sum1, tmp_sum2, tmp_sum3;
|
||||
|
|
@ -38,7 +38,7 @@ inline scalar_t allreduce_kernel_(const scalar_t *arr, size_t start, size_t end,
|
|||
if (k > 0) {
|
||||
scalar_t sarr[32 / sizeof(scalar_t)];
|
||||
part_sum.store(sarr);
|
||||
for (size_t i = 0; i < part_sum.size(); i++) {
|
||||
for (size_t i = 0; i < part_sum.size; i++) {
|
||||
sum = OP<scalar_t>()(sum, sarr[i]);
|
||||
}
|
||||
}
|
||||
|
|
@ -51,8 +51,11 @@ inline scalar_t allreduce_kernel_(const scalar_t *arr, size_t start, size_t end,
|
|||
|
||||
// This overwrites the content of outarr
|
||||
template <class scalar_t, template <class> class OP, CPUCapability C>
|
||||
inline void dimreduce_kernel_(const scalar_t *arr, scalar_t *outarr,
|
||||
size_t num_rows, size_t num_cols) {
|
||||
inline void dimreduce_kernel_(
|
||||
const scalar_t* arr,
|
||||
scalar_t* outarr,
|
||||
size_t num_rows,
|
||||
size_t num_cols) {
|
||||
size_t width =
|
||||
256 / (sizeof(scalar_t)); // primitives per 256 bytes (two cache lines)
|
||||
Vec256<scalar_t> a[8];
|
||||
|
|
@ -88,31 +91,49 @@ inline void dimreduce_kernel_(const scalar_t *arr, scalar_t *outarr,
|
|||
}
|
||||
|
||||
template <template <class> class OP, CPUCapability C>
|
||||
inline void allImpl(Tensor & result, const Tensor & self, size_t dim, bool all, const char* name, int64_t init) {
|
||||
inline void allImpl(
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
size_t dim,
|
||||
bool all,
|
||||
const char* name,
|
||||
int64_t init) {
|
||||
AT_DISPATCH_ALL_TYPES(self.type(), name, [&] {
|
||||
if (all) {
|
||||
result.fill_(at::parallel_reduce<scalar_t, OP>(
|
||||
&allreduce_kernel_<scalar_t, OP, CURRENT_CAPABILITY>, self.data<scalar_t>(),
|
||||
(size_t)0, (size_t)self.numel(), (scalar_t)init));
|
||||
&allreduce_kernel_<scalar_t, OP, CURRENT_CAPABILITY>,
|
||||
self.data<scalar_t>(),
|
||||
(size_t)0,
|
||||
(size_t)self.numel(),
|
||||
(scalar_t)init));
|
||||
} else {
|
||||
at::parallel_reduce_2d<scalar_t>(
|
||||
&dimreduce_kernel_<scalar_t, OP, CURRENT_CAPABILITY>,
|
||||
self.sizes()[dim], self.strides()[dim], self.numel(),
|
||||
self.data<scalar_t>(), result.data<scalar_t>());
|
||||
self.sizes()[dim],
|
||||
self.strides()[dim],
|
||||
self.numel(),
|
||||
self.data<scalar_t>(),
|
||||
result.data<scalar_t>());
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <>
|
||||
void sumImplC<CURRENT_CAPABILITY>::function(Tensor &result, const Tensor &self,
|
||||
size_t dim, bool all) {
|
||||
void sumImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
size_t dim,
|
||||
bool all) {
|
||||
allImpl<std::plus, CURRENT_CAPABILITY>(result, self, dim, all, "sum", 0);
|
||||
}
|
||||
|
||||
template <>
|
||||
void prodImplC<CURRENT_CAPABILITY>::function(Tensor &result, const Tensor &self,
|
||||
size_t dim, bool all) {
|
||||
allImpl<std::multiplies, CURRENT_CAPABILITY>(result, self, dim, all, "prod", 1);
|
||||
}
|
||||
}
|
||||
void prodImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self,
|
||||
size_t dim,
|
||||
bool all) {
|
||||
allImpl<std::multiplies, CURRENT_CAPABILITY>(
|
||||
result, self, dim, all, "prod", 1);
|
||||
}
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
|
|
@ -1,23 +1,21 @@
|
|||
#pragma once
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include "CapabilityDispatch.h"
|
||||
#include "Vec256.h"
|
||||
#include <stdexcept>
|
||||
#include "CapabilityDispatch.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
namespace at {
|
||||
namespace native {
|
||||
|
||||
template <CPUCapability C> struct sumImplC {
|
||||
static void function(Tensor &result, const Tensor &self, size_t dim,
|
||||
bool all);
|
||||
template <CPUCapability C>
|
||||
struct sumImplC {
|
||||
static void
|
||||
function(Tensor& result, const Tensor& self, size_t dim, bool all);
|
||||
};
|
||||
|
||||
template <CPUCapability C> struct prodImplC {
|
||||
static void function(Tensor &result, const Tensor &self, size_t dim,
|
||||
bool all);
|
||||
template <CPUCapability C>
|
||||
struct prodImplC {
|
||||
static void
|
||||
function(Tensor& result, const Tensor& self, size_t dim, bool all);
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}} // namespace at::native
|
||||
|
|
|
|||
121 aten/src/ATen/native/cpu/UnaryOpsKernel.cpp Normal file
|
|
@ -0,0 +1,121 @@
|
|||
#include "ATen/native/cpu/UnaryOpsKernel.h"
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include "ATen/Dispatch.h"
|
||||
#include "ATen/Parallel.h"
|
||||
#include "ATen/native/cpu/Vec256.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
using namespace vec256;
|
||||
|
||||
// This modifies arr in place with given OP
|
||||
template <class scalar_t, template <class> class VOP, CPUCapability C>
|
||||
inline void
|
||||
kernel_(scalar_t* arr_out, const scalar_t* arr_in, size_t start, size_t end) {
|
||||
Vec256<scalar_t> a;
|
||||
size_t epr = 32 / sizeof(scalar_t); // primitives per Vec256
|
||||
size_t k = start;
|
||||
size_t vec_end = end > epr ? end - epr : 0;
|
||||
for (; k < vec_end; k += epr) {
|
||||
a.load(arr_in + k);
|
||||
VOP<scalar_t>()(a).store(arr_out + k);
|
||||
}
|
||||
size_t leftover = std::min((end - k), a.size);
|
||||
a.load(arr_in + k, leftover);
|
||||
VOP<scalar_t>()(a).store(arr_out + k, leftover);
|
||||
}
|
||||
|
||||
template <template <class> class VOP, CPUCapability C>
|
||||
inline void allImpl(Tensor& result, const Tensor& self, const char* name) {
|
||||
AT_DISPATCH_FLOATING_TYPES(self.type(), name, [&] {
|
||||
at::parallel_for_1d<scalar_t>(
|
||||
&kernel_<scalar_t, VOP, CURRENT_CAPABILITY>, result, self);
|
||||
});
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct ceilVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.ceil();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void ceilImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<ceilVOP, CURRENT_CAPABILITY>(result, self, "ceil");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct floorVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.floor();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void floorImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<floorVOP, CURRENT_CAPABILITY>(result, self, "floor");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct roundVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.round();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void roundImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<roundVOP, CURRENT_CAPABILITY>(result, self, "round");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct truncVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.trunc();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void truncImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<truncVOP, CURRENT_CAPABILITY>(result, self, "trunc");
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct sqrtVOP {
|
||||
Vec256<T> operator()(Vec256<T>& x) const {
|
||||
return x.sqrt();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
void sqrtImplC<CURRENT_CAPABILITY>::function(
|
||||
Tensor& result,
|
||||
const Tensor& self) {
|
||||
allImpl<sqrtVOP, CURRENT_CAPABILITY>(result, self, "sqrt");
|
||||
}
|
||||
}} // namespace at::native
|
||||
56 aten/src/ATen/native/cpu/UnaryOpsKernel.h Normal file
|
|
@ -0,0 +1,56 @@
|
|||
#pragma once
|
||||
#include <ATen/ATen.h>
|
||||
#include <ATen/Parallel.h>
|
||||
#include <stdexcept>
|
||||
#include "CapabilityDispatch.h"
|
||||
|
||||
namespace at { namespace native {
|
||||
|
||||
template <CPUCapability C>
|
||||
struct ceilImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
template <CPUCapability C>
|
||||
struct floorImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
template <CPUCapability C>
|
||||
struct roundImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
template <CPUCapability C>
|
||||
struct truncImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
template <CPUCapability C>
|
||||
struct sqrtImplC {
|
||||
static void function(Tensor& result, const Tensor& self);
|
||||
};
|
||||
|
||||
// Missing unary functions
|
||||
// TODO: Add generic apply function for contiguous and non-contiguous tensors
|
||||
// The goal here is to move more ops entirely into ATen and take advantage of
|
||||
// automatic vectorization with file-specific flags
|
||||
// acos
|
||||
// asin
|
||||
// atan
|
||||
// cos
|
||||
// cosh
|
||||
// digamma
|
||||
// erf
|
||||
// erfinv
|
||||
// exp
|
||||
// expm1
|
||||
// frac
|
||||
// lgamma
|
||||
// log1p
|
||||
// log
|
||||
// rsqrt
|
||||
// sigmoid
|
||||
// sin
|
||||
// sinh
|
||||
// tan
|
||||
// tanh
|
||||
// trunc
|
||||
|
||||
}} // namespace at::native
|
||||
|
|
@ -15,7 +15,7 @@
|
|||
#elif defined(__GNUC__) && defined(__IWMMXT__)
|
||||
/* GCC-compatible compiler, targeting ARM with WMMX */
|
||||
#include <mmintrin.h>
|
||||
#elif (defined(__GNUC__) || defined(__xlC__)) && \
|
||||
#elif (defined(__GNUC__) || defined(__xlC__)) && \
|
||||
(defined(__VEC__) || defined(__ALTIVEC__))
|
||||
/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */
|
||||
#include <altivec.h>
|
||||
|
|
@ -25,94 +25,212 @@
|
|||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
// NOTE:
|
||||
// If you specialize on a type, you must define all operations!
|
||||
// C arrays and intrinsic types don't mix
|
||||
namespace at {
|
||||
namespace native {
|
||||
namespace vec256 {
|
||||
namespace at { namespace native { namespace vec256 {
|
||||
|
||||
template <class T> class Vec256 {
|
||||
public:
|
||||
template <class T>
|
||||
class Vec256 {
|
||||
public:
|
||||
T values[32 / sizeof(T)]; // Mimics AVX behavior
|
||||
inline void load(const T *ptr) {
|
||||
std::memcpy(values, ptr, 32);
|
||||
inline void load(const T* ptr) {
|
||||
std::memcpy(values, ptr, 32);
|
||||
};
|
||||
inline void store(T *ptr) { std::memcpy(ptr, values, 32); }
|
||||
inline size_t size() { return 32 / sizeof(T); }
|
||||
inline void operator=(const Vec256<T> &b) {
|
||||
inline void store(T* ptr) const {
|
||||
std::memcpy(ptr, values, 32);
|
||||
}
|
||||
inline void load(const T* ptr, size_t count) {
|
||||
std::memcpy(values, ptr, 32 / sizeof(T) * count);
|
||||
};
|
||||
inline void store(T* ptr, size_t count) const {
|
||||
std::memcpy(ptr, values, 32 / sizeof(T) * count);
|
||||
}
|
||||
size_t size = 32 / sizeof(T);
|
||||
inline void operator=(const Vec256<T>& b) {
|
||||
std::memcpy(values, b.values, 32);
|
||||
}
|
||||
inline Vec256<T> map(T (*f)(T)) {
|
||||
Vec256<T> ret;
|
||||
for (size_t i = 0; i < size; i++)
|
||||
ret.values[i] = f(values[i]);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<T> ceil() {
|
||||
return map(std::ceil);
|
||||
}
|
||||
inline Vec256<T> floor() {
|
||||
return map(std::floor);
|
||||
}
|
||||
inline Vec256<T> round() {
|
||||
return map(std::round);
|
||||
}
|
||||
inline Vec256<T> trunc() {
|
||||
return map(std::trunc);
|
||||
}
|
||||
inline Vec256<T> sqrt() {
|
||||
return map(std::sqrt);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T> Vec256<T> operator+(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
template <class T>
|
||||
Vec256<T> operator+(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (size_t i = 0; i < c.size(); i++) {
|
||||
for (size_t i = 0; i < a.size; i++)
|
||||
c.values[i] = a.values[i] + b.values[i];
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
template <class T> Vec256<T> operator*(const Vec256<T> &a, const Vec256<T> &b) {
|
||||
template <class T>
|
||||
Vec256<T> operator*(const Vec256<T>& a, const Vec256<T>& b) {
|
||||
Vec256<T> c = Vec256<T>();
|
||||
for (size_t i = 0; i < c.size(); i++) {
|
||||
for (size_t i = 0; i < a.size; i++)
|
||||
c.values[i] = a.values[i] * b.values[i];
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
#ifdef __AVX__
|
||||
template <> class Vec256<float> {
|
||||
public:
|
||||
template <>
|
||||
class Vec256<float> {
|
||||
public:
|
||||
__m256 values;
|
||||
Vec256<float>() {}
|
||||
inline void load(const float *ptr) { values = _mm256_loadu_ps(ptr); }
|
||||
inline void store(float *ptr) { _mm256_storeu_ps(ptr, values); }
|
||||
inline size_t size() { return 32 / sizeof(float); }
|
||||
inline void operator=(const Vec256<float> &b) { values = b.values; }
|
||||
};
|
||||
|
||||
template <> class Vec256<double> {
|
||||
public:
|
||||
__m256d values;
|
||||
Vec256<double>() {}
|
||||
inline void load(const double *ptr) { values = _mm256_loadu_pd(ptr); }
|
||||
inline void store(double *ptr) { _mm256_storeu_pd(ptr, values); }
|
||||
inline size_t size() { return 32 / sizeof(double); }
|
||||
inline void operator=(const Vec256<double> &b) { values = b.values; }
|
||||
inline void load(const float* ptr) {
|
||||
values = _mm256_loadu_ps(ptr);
|
||||
}
|
||||
inline void store(float* ptr) const {
|
||||
_mm256_storeu_ps(ptr, values);
|
||||
}
|
||||
inline void load(const float* ptr, size_t count) {
|
||||
float tmp_values[8];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(float));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(float* ptr, size_t count) const {
|
||||
float tmp_values[8];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(float));
|
||||
}
|
||||
size_t size = 8;
|
||||
inline void operator=(const Vec256<float>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
inline Vec256<float> ceil() {
|
||||
Vec256<float> ret;
|
||||
ret.values = _mm256_ceil_ps(values);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<float> floor() {
|
||||
Vec256<float> ret;
|
||||
ret.values = _mm256_floor_ps(values);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<float> round() {
|
||||
Vec256<float> ret;
|
||||
ret.values = _mm256_round_ps(
|
||||
values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<float> trunc() {
|
||||
Vec256<float> ret;
|
||||
ret.values =
|
||||
_mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<float> sqrt() {
|
||||
Vec256<float> ret;
|
||||
ret.values = _mm256_sqrt_ps(values);
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
Vec256<float> inline operator+(const Vec256<float> &a, const Vec256<float> &b) {
|
||||
class Vec256<double> {
|
||||
public:
|
||||
__m256d values;
|
||||
Vec256<double>() {}
|
||||
inline void load(const double* ptr) {
|
||||
values = _mm256_loadu_pd(ptr);
|
||||
}
|
||||
inline void store(double* ptr) const {
|
||||
_mm256_storeu_pd(ptr, values);
|
||||
}
|
||||
inline void load(const double* ptr, size_t count) {
|
||||
double tmp_values[4];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(double));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(double* ptr, size_t count) const {
|
||||
double tmp_values[4];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(double));
|
||||
}
|
||||
size_t size = 4;
|
||||
inline void operator=(const Vec256<double>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
inline Vec256<double> ceil() {
|
||||
Vec256<double> ret;
|
||||
ret.values = _mm256_ceil_pd(values);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<double> floor() {
|
||||
Vec256<double> ret;
|
||||
ret.values = _mm256_floor_pd(values);
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<double> round() {
|
||||
Vec256<double> ret;
|
||||
ret.values = _mm256_round_pd(
|
||||
values, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<double> trunc() {
|
||||
Vec256<double> ret;
|
||||
ret.values =
|
||||
_mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
|
||||
return ret;
|
||||
}
|
||||
inline Vec256<double> sqrt() {
|
||||
Vec256<double> ret;
|
||||
ret.values = _mm256_sqrt_pd(values);
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
Vec256<float> inline operator+(const Vec256<float>& a, const Vec256<float>& b) {
|
||||
Vec256<float> c = Vec256<float>();
|
||||
c.values = _mm256_add_ps(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<float> inline operator*(const Vec256<float> &a, const Vec256<float> &b) {
|
||||
Vec256<float> inline operator*(const Vec256<float>& a, const Vec256<float>& b) {
|
||||
Vec256<float> c = Vec256<float>();
|
||||
c.values = _mm256_mul_ps(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<double> inline operator+(const Vec256<double> &a,
|
||||
const Vec256<double> &b) {
|
||||
Vec256<double> inline operator+(
|
||||
const Vec256<double>& a,
|
||||
const Vec256<double>& b) {
|
||||
Vec256<double> c = Vec256<double>();
|
||||
c.values = _mm256_add_pd(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<double> inline operator*(const Vec256<double> &a,
|
||||
const Vec256<double> &b) {
|
||||
Vec256<double> inline operator*(
|
||||
const Vec256<double>& a,
|
||||
const Vec256<double>& b) {
|
||||
Vec256<double> c = Vec256<double>();
|
||||
c.values = _mm256_mul_pd(a.values, b.values);
|
||||
return c;
|
||||
|
|
@ -120,67 +238,109 @@ Vec256<double> inline operator*(const Vec256<double> &a,
|
|||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
template <> class Vec256<int64_t> {
|
||||
public:
|
||||
template <>
|
||||
class Vec256<int64_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int64_t>() {}
|
||||
inline void load(const int64_t *ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
|
||||
inline void load(const int64_t* ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
inline void store(int64_t *ptr) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), values);
|
||||
inline void store(int64_t* ptr) const {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
}
|
||||
inline size_t size() { return 32 / sizeof(int64_t); }
|
||||
inline void operator=(const Vec256<int64_t> &b) { values = b.values; }
|
||||
};
|
||||
|
||||
template <> class Vec256<int32_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int32_t>() {}
|
||||
inline void load(const int32_t *ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
|
||||
inline void load(const int64_t* ptr, size_t count) {
|
||||
int64_t tmp_values[4];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int64_t));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(int32_t *ptr) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), values);
|
||||
inline void store(int64_t* ptr, size_t count) const {
|
||||
int64_t tmp_values[4];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int64_t));
|
||||
}
|
||||
inline size_t size() { return 32 / sizeof(int32_t); }
|
||||
inline void operator=(const Vec256<int32_t> &b) { values = b.values; }
|
||||
};
|
||||
|
||||
template <> class Vec256<int16_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int16_t>() {}
|
||||
inline void load(const int16_t *ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
|
||||
size_t size = 4;
|
||||
inline void operator=(const Vec256<int64_t>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
inline void store(int16_t *ptr) {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), values);
|
||||
}
|
||||
inline size_t size() { return 32 / sizeof(int16_t); }
|
||||
inline void operator=(const Vec256<int16_t> &b) { values = b.values; }
|
||||
};
|
||||
|
||||
template <>
|
||||
Vec256<int64_t> inline operator+(const Vec256<int64_t> &a,
|
||||
const Vec256<int64_t> &b) {
|
||||
class Vec256<int32_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int32_t>() {}
|
||||
inline void load(const int32_t* ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
inline void store(int32_t* ptr) const {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
}
|
||||
inline void load(const int32_t* ptr, size_t count) {
|
||||
int32_t tmp_values[8];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int32_t));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(int32_t* ptr, size_t count) const {
|
||||
int32_t tmp_values[8];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int32_t));
|
||||
}
|
||||
size_t size = 8;
|
||||
inline void operator=(const Vec256<int32_t>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class Vec256<int16_t> {
|
||||
public:
|
||||
__m256i values;
|
||||
Vec256<int16_t>() {}
|
||||
inline void load(const int16_t* ptr) {
|
||||
values = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
|
||||
}
|
||||
inline void store(int16_t* ptr) const {
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), values);
|
||||
}
|
||||
inline void load(const int16_t* ptr, size_t count) {
|
||||
int16_t tmp_values[16];
|
||||
std::memcpy(tmp_values, ptr, count * sizeof(int16_t));
|
||||
load(tmp_values);
|
||||
}
|
||||
inline void store(int16_t* ptr, size_t count) const {
|
||||
int16_t tmp_values[16];
|
||||
store(tmp_values);
|
||||
std::memcpy(ptr, tmp_values, count * sizeof(int16_t));
|
||||
}
|
||||
size_t size = 16;
|
||||
inline void operator=(const Vec256<int16_t>& b) {
|
||||
values = b.values;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
Vec256<int64_t> inline operator+(
|
||||
const Vec256<int64_t>& a,
|
||||
const Vec256<int64_t>& b) {
|
||||
Vec256<int64_t> c = Vec256<int64_t>();
|
||||
c.values = _mm256_add_epi64(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<int32_t> inline operator+(const Vec256<int32_t> &a,
|
||||
const Vec256<int32_t> &b) {
|
||||
Vec256<int32_t> inline operator+(
|
||||
const Vec256<int32_t>& a,
|
||||
const Vec256<int32_t>& b) {
|
||||
Vec256<int32_t> c = Vec256<int32_t>();
|
||||
c.values = _mm256_add_epi32(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<int16_t> inline operator+(const Vec256<int16_t> &a,
|
||||
const Vec256<int16_t> &b) {
|
||||
Vec256<int16_t> inline operator+(
|
||||
const Vec256<int16_t>& a,
|
||||
const Vec256<int16_t>& b) {
|
||||
Vec256<int16_t> c = Vec256<int16_t>();
|
||||
c.values = _mm256_add_epi16(a.values, b.values);
|
||||
return c;
|
||||
|
|
@ -191,8 +351,9 @@ Vec256<int16_t> inline operator+(const Vec256<int16_t> &a,
|
|||
// This is also technically avx compatible, but then we'll need AVX
|
||||
// code for add as well.
|
||||
template <>
|
||||
Vec256<int64_t> inline operator*(const Vec256<int64_t> &a,
|
||||
const Vec256<int64_t> &b) {
|
||||
Vec256<int64_t> inline operator*(
|
||||
const Vec256<int64_t>& a,
|
||||
const Vec256<int64_t>& b) {
|
||||
Vec256<int64_t> c = Vec256<int64_t>();
|
||||
|
||||
int64_t a0 = _mm256_extract_epi64(a.values, 0);
|
||||
|
|
@ -215,21 +376,21 @@ Vec256<int64_t> inline operator*(const Vec256<int64_t> &a,
|
|||
}
|
||||
|
||||
template <>
|
||||
Vec256<int32_t> inline operator*(const Vec256<int32_t> &a,
|
||||
const Vec256<int32_t> &b) {
|
||||
Vec256<int32_t> inline operator*(
|
||||
const Vec256<int32_t>& a,
|
||||
const Vec256<int32_t>& b) {
|
||||
Vec256<int32_t> c = Vec256<int32_t>();
|
||||
c.values = _mm256_mullo_epi32(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vec256<int16_t> inline operator*(const Vec256<int16_t> &a,
|
||||
const Vec256<int16_t> &b) {
|
||||
Vec256<int16_t> inline operator*(
|
||||
const Vec256<int16_t>& a,
|
||||
const Vec256<int16_t>& b) {
|
||||
Vec256<int16_t> c = Vec256<int16_t>();
|
||||
c.values = _mm256_mullo_epi16(a.values, b.values);
|
||||
return c;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
}}} // namespace at::native::vec256
|
||||
|
|
|
|||
|
|
@ -29,6 +29,18 @@
|
|||
- func: _cast_Half(Tensor self, bool non_blocking=false) -> Tensor
|
||||
variants: function, method
|
||||
|
||||
- func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor grad_output, Tensor grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state, Tensor reserve, std::array<bool,4> output_mask) -> (Tensor, Tensor, Tensor, TensorList)
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_init_dropout_state(Type ty, double dropout, bool train, int64_t dropout_seed) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: adaptive_avg_pool1d(Tensor self, IntList[1] output_size) -> Tensor
|
||||
variants: function
|
||||
|
||||
|
|
@ -80,6 +92,16 @@
|
|||
- func: cat_out(Tensor result, TensorList tensors, int64_t dim=0) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: ceil(Tensor self) -> Tensor
|
||||
|
||||
- func: ceil_(Tensor self) -> Tensor
|
||||
|
||||
- func: ceil_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _ceil_out_cpu
|
||||
CUDA: _ceil_out_cuda
|
||||
|
||||
- func: chunk(Tensor self, int64_t chunks, int64_t dim=0) -> TensorList
|
||||
|
||||
- func: cudnn_is_acceptable(Tensor self) -> bool
|
||||
|
|
@ -271,6 +293,16 @@
|
|||
CPU: eye_out_cpu
|
||||
CUDA: eye_out_cuda
|
||||
|
||||
- func: floor(Tensor self) -> Tensor
|
||||
|
||||
- func: floor_(Tensor self) -> Tensor
|
||||
|
||||
- func: floor_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _floor_out_cpu
|
||||
CUDA: _floor_out_cuda
|
||||
|
||||
- func: full(Type dtype, IntList size, Scalar fill_value) -> Tensor
|
||||
variants: function
|
||||
|
||||
|
|
@ -437,18 +469,28 @@
|
|||
CPU: RoiPooling2d_backward_cpu
|
||||
CUDA: RoiPooling2d_backward_cuda
|
||||
|
||||
- func: round(Tensor self) -> Tensor
|
||||
|
||||
- func: round_(Tensor self) -> Tensor
|
||||
|
||||
- func: round_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _round_out_cpu
|
||||
CUDA: _round_out_cuda
|
||||
|
||||
- func: rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: rrelu_(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=false, Generator* generator=nullptr) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: select(Tensor self, int64_t dim, int64_t index) -> Tensor
|
||||
|
||||
- func: relu(Tensor self) -> Tensor
|
||||
|
||||
- func: relu_(Tensor self) -> Tensor
|
||||
|
||||
- func: select(Tensor self, int64_t dim, int64_t index) -> Tensor
|
||||
|
||||
- func: selu(Tensor self) -> Tensor
|
||||
variants: function
|
||||
|
||||
|
|
@ -512,6 +554,16 @@
|
|||
CPU: _sum_out_cpu
|
||||
CUDA: _sum_out_cuda
|
||||
|
||||
- func: sqrt(Tensor self) -> Tensor
|
||||
|
||||
- func: sqrt_(Tensor self) -> Tensor
|
||||
|
||||
- func: sqrt_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _sqrt_out_cpu
|
||||
CUDA: _sqrt_out_cuda
|
||||
|
||||
- func: prod(Tensor self) -> Tensor
|
||||
dispatch:
|
||||
CPU: _prod_cpu
|
||||
|
|
@ -525,14 +577,24 @@
|
|||
CPU: _prod_out_cpu
|
||||
CUDA: _prod_out_cuda
|
||||
|
||||
- func: t_(Tensor self) -> Tensor
|
||||
variants: method
|
||||
|
||||
- func: transpose_(Tensor self, int64_t dim0, int64_t dim1) -> Tensor
|
||||
variants: method
|
||||
|
||||
- func: triplet_margin_loss(Tensor anchor, Tensor positive, Tensor negative, double margin=1.0, double p=2, double eps=1e-6, bool swap=false, bool size_average=true, bool reduce=true) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: t_(Tensor self) -> Tensor
|
||||
variants: method
|
||||
- func: trunc(Tensor self) -> Tensor
|
||||
|
||||
- func: trunc_(Tensor self) -> Tensor
|
||||
|
||||
- func: trunc_out(Tensor result, Tensor self) -> Tensor
|
||||
variants: function
|
||||
dispatch:
|
||||
CPU: _trunc_out_cpu
|
||||
CUDA: _trunc_out_cuda
|
||||
|
||||
- func: type_as(Tensor self, Tensor other) -> Tensor
|
||||
variants: method
|
||||
|
|
@ -584,15 +646,3 @@
|
|||
dispatch:
|
||||
CPU: _s_poisson_cpu
|
||||
CUDA: _s_poisson_cuda
|
||||
|
||||
- func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_rnn(Tensor input, TensorList weight, int64_t weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_rnn_backward(Tensor input, TensorList weight, int64_t weight_stride0, Tensor weight_buf, Tensor hx, Tensor? cx, Tensor output, Tensor grad_output, Tensor grad_hy, Tensor? grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, IntList batch_sizes, BoolTensor? dropout_state, Tensor reserve, std::array<bool,4> output_mask) -> (Tensor, Tensor, Tensor, TensorList)
|
||||
variants: function
|
||||
|
||||
- func: _cudnn_init_dropout_state(Type ty, double dropout, bool train, int64_t dropout_seed) -> Tensor
|
||||
variants: function
|
||||
|
|
|
|||
|
|
@ -229,23 +229,27 @@ class TestTorch(TestCase):
|
|||
self.assertRaises(RuntimeError, lambda: torch.addr(m, v, s))
|
||||
self.assertRaises(RuntimeError, lambda: torch.addr(m, s, v))
|
||||
|
||||
def _testMath(self, torchfn, mathfn):
|
||||
size = (10, 5)
|
||||
# contiguous
|
||||
m1 = torch.randn(*size)
|
||||
res1 = torchfn(m1[4])
|
||||
res2 = res1.clone().zero_()
|
||||
for i, v in enumerate(m1[4]):
|
||||
res2[i] = mathfn(v.item())
|
||||
self.assertEqual(res1, res2)
|
||||
def _testMath(self, torchfn, mathfn, large=True):
|
||||
def _testMathSize(size, self, torchfn, mathfn):
|
||||
# contiguous
|
||||
m1 = torch.randn(*size)
|
||||
res1 = torchfn(m1[4])
|
||||
res2 = res1.clone().zero_()
|
||||
for i, v in enumerate(m1[4]):
|
||||
res2[i] = mathfn(v.item())
|
||||
self.assertEqual(res1, res2)
|
||||
|
||||
# non-contiguous
|
||||
m1 = torch.randn(*size)
|
||||
res1 = torchfn(m1[:, 4])
|
||||
res2 = res1.clone().zero_()
|
||||
for i, v in enumerate(m1[:, 4]):
|
||||
res2[i] = mathfn(v.item())
|
||||
self.assertEqual(res1, res2)
|
||||
# non-contiguous
|
||||
m1 = torch.randn(*size)
|
||||
res1 = torchfn(m1[:, 4])
|
||||
res2 = res1.clone().zero_()
|
||||
for i, v in enumerate(m1[:, 4]):
|
||||
res2[i] = mathfn(v.item())
|
||||
self.assertEqual(res1, res2)
|
||||
_testMathSize((10, 5), self, torchfn, mathfn)
|
||||
if large:
|
||||
# Trigger parallelism
|
||||
_testMathSize((10, 50000), self, torchfn, mathfn)
|
||||
|
||||
def _testMathByName(self, function_name):
|
||||
torchfn = getattr(torch, function_name)
|
||||
|
|
@ -269,8 +273,9 @@ class TestTorch(TestCase):
|
|||
@unittest.skipIf(not TEST_SCIPY, "Scipy not found")
|
||||
def test_polygamma(self):
|
||||
from scipy.special import polygamma
|
||||
# This test won't work when using many samples
|
||||
for n in [0, 1]:
|
||||
self._testMath(lambda x: torch.polygamma(n, x), lambda x: polygamma(n, x)[()])
|
||||
self._testMath(lambda x: torch.polygamma(n, x), lambda x: polygamma(n, x)[()], large=False)
|
||||
|
||||
def test_asin(self):
|
||||
self._testMath(torch.asin, lambda x: math.asin(x) if abs(x) <= 1 else float('nan'))
|
||||
|
|
|
|||