Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)

[1/N][Fix] Fix typo in aten folder (#166126)

Fix typo in aten folder.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166126
Approved by: https://github.com/cyyever, https://github.com/slayton58

This commit is contained in:
parent f2c81635c8
commit f89a7e9fe8
@@ -38,7 +38,7 @@ set_bool(AT_HIPSPARSELT_ENABLED CAFFE2_USE_HIPSPARSELT)
 
 configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
 # TODO: Do not generate CUDAConfig.h for ROCm BUILDS
-# At the moment, `jit_macors.h` include CUDAConfig.h for both CUDA and HIP builds
+# At the moment, `jit_macros.h` include CUDAConfig.h for both CUDA and HIP builds
 if(USE_CUDA OR USE_ROCM)
 configure_file(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h")
 endif()
@@ -122,7 +122,7 @@ void FunctionalTensorWrapper::freeze_storage() const {
 // | have their own storages, but backends like functorch |
 // \/ are allowed to re-alias underneath the pass \/
 // . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - .
-// | underyling_storage | | underyling_storage |
+// | underlying_storage | | underlying_storage |
 // . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - .
 //
 // This constructor is only used by view ops.
@@ -1534,7 +1534,7 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {
 
 // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer.
 // Nothing beyond this point is important for meta functions, so it's fine to exit early here.
-// Extend the condition to MAIA tesnors as MAIA tensors also don't have storage.
+// Extend the condition to MAIA tensors as MAIA tensors also don't have storage.
 if (privateuse1_without_storage ||
 common_device_.type() == DeviceType::XLA ||
 common_device_.type() == DeviceType::IPU ||
@@ -94,11 +94,11 @@ struct PinnedReserveSegment {
 struct TORCH_API HostStats {
 // COUNT: total allocations (active)
 Stat active_requests;
-// SUM: bytes allocated/reserved by this memory alocator. (active)
+// SUM: bytes allocated/reserved by this memory allocator. (active)
 Stat active_bytes;
 // COUNT: total allocations (active + free)
 Stat allocations;
-// SUM: bytes allocated/reserved by this memory alocator. This accounts
+// SUM: bytes allocated/reserved by this memory allocator. This accounts
 // for both free and in-use blocks.
 Stat allocated_bytes;
 
@@ -127,7 +127,7 @@ struct alignas(hardware_destructive_interference_size) HostStatsStaged {
 // COUNT: total allocations (active + free)
 // LOCK: access to this stat is protected by the allocator's blocks_mutex_
 Stat allocations;
-// SUM: bytes allocated/reserved by this memory alocator. This accounts
+// SUM: bytes allocated/reserved by this memory allocator. This accounts
 // for both free and in-use blocks.
 Stat allocated_bytes;
 // COUNT: number of allocations per bucket (active)
@@ -455,7 +455,7 @@ struct CachingHostAllocatorImpl {
 }
 
 void resetAccumulatedStats() {
-// Reseting accumulated memory stats requires concurrently holding both the
+// Resetting accumulated memory stats requires concurrently holding both the
 // free list mutexes and the blocks mutex. Previously, this was only done in
 // empty_cache function.
 for (size_t i = 0; i < free_list_.size(); ++i) {
@@ -482,7 +482,7 @@ struct CachingHostAllocatorImpl {
 }
 
 void resetPeakStats() {
-// Reseting peak memory stats requires concurrently holding both the
+// Resetting peak memory stats requires concurrently holding both the
 // free list mutexes and the blocks mutex. Previously, this was only done in
 // empty_cache function.
 for (size_t i = 0; i < free_list_.size(); ++i) {
@@ -148,7 +148,7 @@ struct TORCH_API ClassType : public NamedType {
 
 void checkNotExist(const std::string& name, const std::string& what) const;
 
-// Attributes are stored in a specific slot at runtime for effiency.
+// Attributes are stored in a specific slot at runtime for efficiency.
 // When emitting instructions we specify the slot so that attribute access is
 // a constant lookup
 std::optional<size_t> findAttributeSlot(const std::string& name) const {
@@ -412,7 +412,7 @@ struct TORCH_API ClassType : public NamedType {
 // Holds method attributes
 std::weak_ptr<CompilationUnit> compilation_unit_;
 
-// Holds all atrributes, attribute details are found on ClassAttribute
+// Holds all attributes, attribute details are found on ClassAttribute
 std::vector<ClassAttribute> attributes_;
 // Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef.
 // Never fill this without using the appropriate provideNewClassAttribute method
@@ -537,7 +537,7 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK
 
 // Note: this records a sequence number for both Autograd keys, and for
 // non-Autograd keys where the dispatchKeySet still contains an autograd key.
-// This means that we might collect the same sequence nubmer two different
+// This means that we might collect the same sequence number two different
 // events if they all occurred above Autograd and still had the Autograd
 // dispatch key in the dispatch key set.
 // However, this usually doesn't happen: normally the first call will
@@ -585,7 +585,7 @@ class TORCH_API OperatorHandle {
 
 // We need to store this iterator in order to make
 // Dispatcher::cleanup() fast -- it runs a lot on program
-// termination (and presuambly library unloading).
+// termination (and presumably library unloading).
 std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
 };
 
@@ -365,7 +365,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
 // For autograd keys, we only use kernel from CompositeImplicitAutograd when there's no direct registration
 // to its corresponding backend key or CompositeExplicitAutograd. See Note [CompositeExplicitAutograd and CompositeImplicitAutograd].
 // For AutogradOther, we eagerly return ambiguousAutogradOtherKernel() if there's registration to any of
-// its backends and ask backend extender to request a decicated Autograd key for the backend.
+// its backends and ask backend extender to request a dedicated Autograd key for the backend.
 // See Note [Ambiguity in AutogradOther kernel] for more details.
 // A CompositeExplicitAutograd kernel prevents CompositeImplicitAutograd kernel being used for Autograd keys, but it doesn't
 // cause confusion for AutogradOther. It's pretty straightforward to use Autograd (if available)
@@ -261,7 +261,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
 //
 // There are 2 cases
 // 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'.
-// without the extra parenthesis, the c++ schem parser can not parse it.
+// without the extra parenthesis, the c++ scheme parser can not parse it.
 // 2. something like '-> ((str, str))'. Need extra parenthesis so the return
 // type is a single tuple rather than two strings.
 // PR (https://github.com/pytorch/pytorch/pull/23204) has more context about
@@ -1176,7 +1176,7 @@ struct TORCH_API IValue final {
 using HashIdentityIValueMap =
 std::unordered_map<IValue, IValue, HashIdentityIValue, CompIdentityIValues>;
 
-// Chechs if this and rhs has a subvalues in common.
+// Checks if this and rhs has a subvalues in common.
 // [t1,t2] and [t2, t3] returns true.
 bool overlaps(const IValue& rhs) const;
 
@@ -1501,7 +1501,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
 // However, the CompilationUnit holds ownership of the type's graphs, so
 // inserting a constant object into a Graph would create a reference cycle if
 // that constant object held a shared_ptr to its CU. For these objects we
-// instatiate them with non-owning references to its CU
+// instantiate them with non-owning references to its CU
 Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) {
 slots_.resize(numSlots);
 }
@@ -373,7 +373,7 @@ struct TORCH_API SymbolicShape {
 // Unranked shape constructor.
 SymbolicShape() : dims_(std::nullopt) {}
 
-// Known rank but unknown dimentions.
+// Known rank but unknown dimensions.
 SymbolicShape(std::optional<size_t> rank) : dims_(std::nullopt) {
 if(!rank) {
 return;
@@ -884,9 +884,9 @@ struct TORCH_API ListType
 
 // global singleton
 // Given an inner type T and an identifier,
-// this function wil return the global singleton type pointer
+// this function will return the global singleton type pointer
 // the type List<T>.
-// The extra "identifier" argument is needed beccause we have multiple container types
+// The extra "identifier" argument is needed because we have multiple container types
 // that all re-use this function (List<T>, array<T, N>, etc.)
 static TypePtr get(const std::string& identifier, TypePtr inner);
 
@@ -21,7 +21,7 @@ namespace c10 {
 
 namespace detail {
 // The first argument of the schema might be of type DispatchKeySet, in which case we remove it.
-// We do this because every argument in a function schema is expected to be convertable
+// We do this because every argument in a function schema is expected to be convertible
 // to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of.
 // See Note [Plumbing Keys Through The Dispatcher]
 template<class KernelFunctor>
@@ -251,7 +251,7 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox
 callOpUnboxedWithPrecomputedDispatchKeySet<void, Tensor>(*op, c10::DispatchKeySet(c10::DispatchKey::CPU), dummyTensor(c10::DispatchKey::CUDA));
 EXPECT_TRUE(called_kernel_cpu);
 
-// Ensure that disptach key from tensor is not used here.
+// Ensure that dispatch key from tensor is not used here.
 called_kernel_cpu = false;
 expectThrows<c10::Error>([&] {
 callOpUnboxedWithPrecomputedDispatchKeySet<void, Tensor>(*op, c10::DispatchKeySet(c10::DispatchKey::CUDA), dummyTensor(c10::DispatchKey::CPU));
@@ -172,7 +172,7 @@ VaryingShape<Stride> TensorType::computeStrideProps(
 // The logic below follows what TensorIterator uses in its logic:
 // 1. Fast_set_up is the short-cut to identify a. channels_last and
 // b. contiguous format, which is what we have in the below logic.
-// 2. In more generla cases, it does best effort to preserve permutatoin.
+// 2. In more general cases, it does best effort to preserve permutatoin.
 if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) {
 // case 1.a. short cut channels last
 std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2);
@@ -234,7 +234,7 @@ class Vectorized<c10::Half> : public Vectorized16<
 vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift);
 return vaddvq_u16(bits_vec);
 #else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-// use known working implmentation.
+// use known working implementation.
 __at_align__ value_type tmp[size()];
 store(tmp);
 int mask = 0;
@@ -1740,7 +1740,7 @@ Vectorized<int16_t> inline shift_256_16(
 
 // Control masks for shuffle operation, treating 256 bits as an
 // array of 16-bit elements, and considering pairs of neighboring
-// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
+// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
 // M!=N) is set so that shuffle will move element with index M from
 // input pair into element with index N in output pair, and element
 // with index M in output pair will be set to all 0s.
@@ -1875,7 +1875,7 @@ Vectorized<T> inline shift_256_8(
 
 // Control masks for shuffle operation, treating 256 bits as an
 // array of 8-bit elements, and considering quadruples of
-// neighboring elements. Specifially, a mask named "ctl_M_N" (M,N
+// neighboring elements. Specifically, a mask named "ctl_M_N" (M,N
 // in [0,1,2,3], and M!=N) is set so that shuffle will move element
 // with index M from input quadruple into element with index N in
 // output quadruple, and other elements in output quadruple will be
@@ -143,7 +143,7 @@ class Vectorized<double> {
 const Vectorized<double>& a,
 const Vectorized<double>& b,
 const Vectorized<double>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256
 
 return {
 vec_sel(a._vec0, b._vec0, mask._vecb0),
@@ -142,7 +142,7 @@ class Vectorized<float> {
 const Vectorized<float>& a,
 const Vectorized<float>& b,
 const Vectorized<float>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256
 // assuming this we can use the same mask directly with vec_sel
 return {
 vec_sel(a._vec0, b._vec0, mask._vecb0),
@@ -202,7 +202,7 @@ class Vectorized<int16_t> {
 const Vectorized<int16_t>& a,
 const Vectorized<int16_t>& b,
 const Vectorized<int16_t>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256
 // assuming this we can use the same mask directly with vec_sel
 // warning intel style mask will not work properly
 return {
@@ -155,7 +155,7 @@ class Vectorized<int32_t> {
 const Vectorized<int32_t>& a,
 const Vectorized<int32_t>& b,
 const Vectorized<int32_t>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256
 // assuming this we can use the same mask directly with vec_sel
 // warning intel style mask will not work properly
 return {
@@ -119,7 +119,7 @@ class Vectorized<int64_t> {
 const Vectorized<int64_t>& a,
 const Vectorized<int64_t>& b,
 const Vectorized<int64_t>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256
 
 return {
 vec_sel(a._vec0, b._vec0, mask._vecb0),
@@ -397,7 +397,7 @@ inline Vectorized<bool> operator&&(
 const __m512i* other_ = reinterpret_cast<const __m512i*>(other.as_bytes());
 __m512i out = _mm512_and_si512(*self_, *other_);
 Vectorized<bool> ret;
-// We do not have a constructer that takes __m512i, so we need to memcpy
+// We do not have a constructor that takes __m512i, so we need to memcpy
 std::memcpy(ret, &out, ret.size() * sizeof(bool));
 return ret;
 }
@@ -1852,7 +1852,7 @@ Vectorized<T> inline shift_512_8(
 
 // Control masks for shuffle operation, treating 512 bits as an
 // array of 8-bit elements, and considering pairs of neighboring
-// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
+// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
 // M!=N) is set so that shuffle will move element with index M from
 // input pair into element with index N in output pair, and element
 // with index M in output pair will be set to all 0s.
@@ -634,7 +634,7 @@ struct Vectorized {
 }
 Vectorized<T> neg() const {
 // NB: the trailing return type is needed because we need to coerce the
-// return value back to T in the case of unary operator- incuring a
+// return value back to T in the case of unary operator- incurring a
 // promotion
 return map([](T x) -> T { return -x; });
 }
@@ -1958,7 +1958,7 @@ void scaled_gemm(
 ScalarType result_dtype,
 bool use_fast_accum,
 const std::optional<Tensor>& alpha) {
-// Note: see `cublasCommonArgs` for various non-intuitive manupulations
+// Note: see `cublasCommonArgs` for various non-intuitive manipulations
 // of input arguments to this function.
 const auto computeType = CUBLAS_COMPUTE_32F;
 const auto scaleType = CUDA_R_32F;
@@ -307,7 +307,7 @@ CUDAGraph::~CUDAGraph() {
 // There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory.
 // They wait for next sync point in order to free the memory, this is to ensure that all
 // hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2.
-// We need to ensure all async opreations finish before deleting the object.
+// We need to ensure all async operations finish before deleting the object.
 #if (defined(USE_ROCM) && ROCM_VERSION >= 60200)
 if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id
 {
@@ -137,7 +137,7 @@ struct CUDACachingHostAllocatorImpl
 void free_block_slowpath(Block* block) {
 auto start = std::chrono::steady_clock::now();
 // Users may change the allocator config at will. torch unit tests do this.
-// However, allocations using cudaHostRegister should use corresonding
+// However, allocations using cudaHostRegister should use corresponding
 // cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost.
 void* ptr = block->ptr_;
 bool use_register = false;
@@ -4,7 +4,7 @@
 #include <ATen/cuda/CUDAConfig.h>
 
 // NOTE: These templates are intentionally not defined in this header,
-// which aviods re-compiling them for each translation unit. If you get
+// which avoids re-compiling them for each translation unit. If you get
 // a link error, you need to add an explicit instantiation for your
 // types in cub.cu
 
@@ -38,7 +38,7 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262
 GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033
 ```
 
-Note the "Validator" lines. If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect
+Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect
 this and reject the tunings file because the prior tunings are likely affected by other software changes.
 
 The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of
@@ -235,7 +235,7 @@ class TunableOp {
 // numeric check option is controlled by non-static env var, so check it once per tuned operator
 bool do_numerics_check = ctx->IsNumericsCheckEnabled();
 
-// calcaulte a reference answer for numerical check
+// calculate a reference answer for numerical check
 if (do_numerics_check) {
 reference_params = params->DeepCopy(false);
 TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);
@@ -12,7 +12,7 @@ namespace at {
 
 // AcceleratorHooksInterface is a shared interface provided by all
 // accelerators to allow generic code.
-// This inferface is hook-based as it corresponds to all the functions
+// This interface is hook-based as it corresponds to all the functions
 // that are going to be called in a generic way from the CPU code.
 
 struct TORCH_API AcceleratorHooksInterface {
@@ -38,7 +38,7 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
 
 Generator getNewGenerator(
 [[maybe_unused]] DeviceIndex device_index = -1) const override {
-// TODO(FFFrog): Perserved for BC and will be removed in the future.
+// TODO(FFFrog): Preserved for BC and will be removed in the future.
 if (at::GetGeneratorPrivate().has_value())
 return at::GetGeneratorForPrivateuse1(device_index);
 
@@ -283,7 +283,7 @@ inline void boxed_existing_bdim_all_batch_rule(
 // Use when all tensors arguments accept one (normal) batch dim.
 // This batching rule expands the batch dim on all Tensors, reshapes it into
 // dim 0, calls the op, and then reshapes the batch dim out of dim 0.
-// This is not the most efficient thing; if there are alternatives, plese try
+// This is not the most efficient thing; if there are alternatives, please try
 // to use them. Use this only as a last resort.
 #define EXISTING_BDIM_ALL_BOXED(op) \
 m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>());
@@ -384,7 +384,7 @@ fourOutputs solve_ex_batch_rule(
 
 // NOTE [ solve_ex Batch Rule Contiguity ]
 // A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on
-// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior
+// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior
 // differs based on whether or not the optimized path was taken
 const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous();
 if (batched_A_was_contiguous && !A.is_complex()) {
@@ -282,7 +282,7 @@ static std::tuple<Tensor, std::optional<int64_t>> _softmax_backward_batch_rule(
 
 dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim);
 
-// Not sure why output_ needs to be marked as .contiguous(). Someting must
+// Not sure why output_ needs to be marked as .contiguous(). Something must
 // have changed in PyTorch (and output of softmax is probably always contiguous)
 return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0);
 }
@@ -224,7 +224,7 @@ static Tensor safeStack(TensorList tensors) {
 // is possible for the backward function to return an undefined grad for some
 // grad_input for each example. In that case, we return an undefined grad.
 //
-// It is theoretically posssible for *some* of the examples to produce an
+// It is theoretically possible for *some* of the examples to produce an
 // undefined grad (a kernel could peek at the gradient values and return an
 // undefined tensor if it determines the gradient is full of zeros). We
 // could handle this by treating the undefined grad as a zero-filled tensor
@@ -113,7 +113,7 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const {
 return sym_sizes_default();
 }
 
-// The following are publically exposed as methods of Tensor
+// The following are publicly exposed as methods of Tensor
 
 IntArrayRef BatchedTensorImpl::strides_custom() const {
 return strides_default();
@@ -37,7 +37,7 @@ namespace at::functorch {
 // how to perform the transform.
 //
 // TODO: we can excise DynamicLayer in favor of Interpreter,
-// But I am going to leave it for now as a compatiblity shim to avoid
+// But I am going to leave it for now as a compatibility shim to avoid
 // needing to refactor a lot of callsites...
 struct TORCH_API DynamicLayer {
 explicit DynamicLayer(
@@ -88,7 +88,7 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t);
 // >>> VmapInterpreterPtr(&interpreter).batchSize()
 //
 // Finally, Interpreter::process switches on the type of the interpreter
-// and calls one of {Transform}Intepreter::processImpl under the hood.
+// and calls one of {Transform}Interpreter::processImpl under the hood.
 // Same for Interpreter::sendToNextInterpreter :)
 
 struct VmapInterpreterMeta {
@@ -733,7 +733,7 @@ TORCH_LIBRARY_IMPL(_, FuncTorchBatched, m) {
 }
 
 TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
-// still legacy b/c teturns multiple tensors
+// still legacy b/c returns multiple tensors
 m.impl("split.Tensor", split_batching_rule);
 m.impl("split_with_sizes", split_with_sizes_batching_rule);
 m.impl("split_with_sizes_copy", split_with_sizes_copy_batching_rule);
@@ -158,7 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
 endKernelCoalescing();
 id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
 
-// For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
+// For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26
 // See https://github.com/pytorch/pytorch/issues/163962
 // Workaround by batching copy commands into 4Gb chunks
 constexpr size_t max_copy_size = 0x100000000; // 4GB
@@ -148,7 +148,7 @@ inline void checkInputsSolver(const Tensor& A,
 
 inline bool is_row_or_column_contiguous(const Tensor& t) {
 // This could be made more general, similar to how it's checked in matmul, which would allow to
-// ellide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky.
+// elide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky.
 // We choose to be conservative for simplicity
 return t.is_contiguous() || t.transpose(-2, -1).is_contiguous();
 }
@@ -21,7 +21,7 @@ enum class fft_norm_mode {
 // NOTE [ Fourier Transform Conjugate Symmetry ]
 //
 // Real-to-complex Fourier transform satisfies the conjugate symmetry. That is,
-// assuming X is the transformed K-dimensionsal signal, we have
+// assuming X is the transformed K-dimensional signal, we have
 //
 // X[i_1, ..., i_K] = X[j_i, ..., j_K]*,
 //
@@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
 auto* input_tr_ptr =
 reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
 // TODO: Activation transpose before and after the kernel can be removed if we
-// keep activation tensor always tranposed.
+// keep activation tensor always transposed.
 fbgemm::transpose_simd<uint8_t>(
 batch_size, K, input_ptr, K, input_tr_ptr, batch_size);
 
@@ -520,7 +520,7 @@ cpu_adaptive_avg_pool3d_channels_last(
 scalar_t* out = output_data + i * channels;
 int64_t size = channels;
 
-// Note: For oridinary usage scenario, each out lane should
+// Note: For ordinary usage scenario, each out lane should
 // fit in L1 cache; otherwise consider block dim C.
 // Pass I: zero the out lane
 int64_t d1 = 0;
@@ -34,7 +34,7 @@ struct Dist {
 // finish : This tells what to do with the aggregated value to compute
 // the norm. Generally this is the result of val ^ (1 / p).
 // backward : This is the gradient for that norm. Arguments are pretty
-// self explanitory.
+// self explanatory.
 //
 // There are a few cases where these aren't used. The 0 norm has no backward,
 // because it's always 0, so that's shortcircuited earlier. There's a special
@@ -30,7 +30,7 @@ vec::Vectorized<scalar_t> is_nan_vec(vec::Vectorized<scalar_t> vec) {
 return vec.isnan();
 }
 
-// TODO: use is_integeral/is_same to check the scalar_t and simplify the implementation
+// TODO: use is_integral/is_same to check the scalar_t and simplify the implementation
 // currently it does not work
 template <>
 vec::Vectorized<unsigned char> is_nan_vec<unsigned char>(vec::Vectorized<unsigned char> vec) {
@@ -74,7 +74,7 @@ it to sum up the entire array into a single value.
 
 `ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
 compiler flags it is currently compiled. This allows the programmer to write
-generic code, which will be compiled under multipled compilation settings.
+generic code, which will be compiled under multiplied compilation settings.
 
 `../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
 a generic definition of `sumImplAll`. This function allows the user to reduce
@@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
 _mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
 _mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);
 
-// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
+// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
 // RGBA: pix1 = [
 // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
 // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0
@@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
 _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
 
 
-/* evaluation of 8 sines at onces using AVX intrinsics
+/* evaluation of 8 sines at once using AVX intrinsics
 
 The code is the exact rewriting of the cephes sinf function.
 Precision is excellent as long as x < 8192 (I did not bother to
@@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
 const bool gamma_null = (gamma_data == nullptr);
 const bool beta_null = beta_data == nullptr;
 
-// NB: About algorithm choosen:
+// NB: About algorithm chosen:
 //
 // On channels last, GroupNorm has a input shape of {N, H, W, GD},
 // Mean and rstd are collected per each n and g, which involves reduction
@@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
 }
 };
 
-// Dynamically Quantize the float32 input to 8 bit assymetric
+// Dynamically Quantize the float32 input to 8 bit asymmetric
 input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);
 
 const size_t lhs_stride =
@@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
 const int64_t weight_packed_size =
 kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
 if (weight_packed_size == packed_weights.numel()) {
-// KleidiAI interface intenally handles the Channelwise and groupwise
+// KleidiAI interface internally handles the Channelwise and groupwise
 // distinction
 kleidiai::kai_quant_pack_lhs_int4_mm(
 output, inp, packed_weights, M, N, K, block_size);
@@ -505,7 +505,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
 // Handle whether to use the Lt interface {
 static bool persistent_disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device());
 // if lt path fails, we recurse back into this function here and force the lt path to off
-// we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent
+// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
 bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
 #ifdef USE_ROCM
 // Conditioned on the device index, which is not persistent
@@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
 auto value = static_cast<scalar_t>(rand * range + from);
 // reverse the bounds of curand4 from (0, 1] to [0, 1)
 // Note that this method is from legacy THCTensorRandom and is likely to give
-// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
+// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
 // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
 // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
 auto reverse_bound_value = value == to ? from : value;
@@ -154,7 +154,7 @@ REGISTER_CUDA_DISPATCH(lstsq_stub, &lazy_lstsq_kernel)
 
 // Old style dispatches
 // torch_cuda_linalg dynamic library should have a global constructor
-// that calls regiserLinaglDispatch so in order ot lazy bind
+// that calls registerLinalgDispatch so in order ot lazy bind
 // old style dispatch all one have to do is to load library and call disp.func_name
 // Protect from infinite recursion by initializing dispatch to self and checking
 // that values are different after linalg library were loaded
@@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(
 
 std::string file_path;
 if (cache_dir.has_value()) {
-// Attemps to read from the cache.
+// Attempts to read from the cache.
 // Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
 // Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents,
 // because we hash on the CUDA code, but we save the compiled ptx or sass
@@ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
 });
 
 if (input.dim() > 2) {
-// if upper=true we need to tranpose and conjugate the result tensor
+// if upper=true we need to transpose and conjugate the result tensor
 // because the cholesky decomposition is stored in the lower triangular part
 if (upper) {
 input.copy_(result.mH());
@@ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {
 
 auto preferred_backend = at::globalContext().linalgPreferredBackend();
 switch (preferred_backend) {
-// TODO Investigate whether the following magma bug is still occuring.
+// TODO Investigate whether the following magma bug is still occurring.
 // It may be the case that geqrf followed by orgqr is wrong for the magma backend
 // geqrf_magma currently uses geqrf2_gpu
 //
@@ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T
 #if defined(BUILD_LAZY_CUDA_LINALG)
 namespace cuda { namespace detail {
 // This is only used for an old-style dispatches
-// Please do not add any new entires to it
+// Please do not add any new entries to it
 struct LinalgDispatch {
 Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
 };
@@ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input,
 // blocked format will propagate between layers. Input, output will be in blocked format.
 //
 // For inference case, weight can be prepacked into blocked format by
-// (so as to save weight reoder overhead):
+// (so as to save weight reorder overhead):
 // model = torch.utils.mkldnn.to_mkldnn(model)
 //
 // For training case, grad_output can be CPU tensor or MKLDNN tensor,
@@ -723,7 +723,7 @@ Tensor _mkldnn_convolution_transpose(
 ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true);
 if (!weight.is_mkldnn()) {
 // mkldnn transposed convolution has weight in logical order of OIHW or OIDHW,
-// while PyTorch has IOHW or IODHW, `._tranpose()` switches strides (no memory copy).
+// while PyTorch has IOHW or IODHW, `._transpose()` switches strides (no memory copy).
 w.transpose_(0, 1);
 }
 
@@ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive(
args.insert({DNNL_ARG_WEIGHTS, expected_weight});
args.insert({DNNL_ARG_DST, dst});
args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
- // Create primitve and execute
+ // Create primitive and execute
auto primitive = dnnl::matmul(prim_desc);
primitive.execute(ideep::stream::default_stream(), args);
}

@@ -439,7 +439,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> mkldnn_rnn_la
// I. Memory Formats
// a. mkldnn will use plain formats for input, hx/cx, output, hy/cy
// and possibly use blocked formats for weights depending shape info.
- // b. All mkldnn memorys are created (in plain format) as views on ATen tensor,
+ // b. All mkldnn memories are created (in plain format) as views on ATen tensor,
// the weight reorder(if any) is handed automatically inside ideep (mkldnn bridge)
//
// II. MKLDNN Primitive Mapping

@@ -39,7 +39,7 @@ void check_mkldnn_binary_fusion_inputs(
inline std::vector<int64_t> padding_r(
IntArrayRef padding, IntArrayRef output_padding)
{
- // ConvTranpose padding adjustment
+ // ConvTranspose padding adjustment
//
// PyTorch uses padding/output_padding:
// osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1

@@ -75,7 +75,7 @@ bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
}

bool can_use_flash_attention(sdp::sdp_params const& params, bool debug) {
- // Currently, XPU fallbacks flash attention to overrideable
+ // Currently, XPU fallbacks flash attention to overridable
return can_use_overrideable_attention(params, debug);
}

@@ -115,7 +115,7 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
// 1. Flash Attention
// 2. Math fallback
auto& ctx = at::globalContext();
- // use overrideable linked to onednn as overrideable implementation
+ // use overridable linked to onednn as overridable implementation
if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP() &&
!ctx.userEnabledFlashSDP()) {
return sdp::SDPBackend::error;

@@ -165,7 +165,7 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
}
}
// If we have gotten to this point then two things have happened:
- // 1. can_use_overrideable_attention did not satisfy the constraints to be ran
+ // 1. can_use_overridable_attention did not satisfy the constraints to be ran
// 2. The user has explicitly disabled the math kernel
// We then re-run the kernel checks with debug enabled to print out the
// reason why the kernel was not selected

@@ -215,7 +215,7 @@ partition create_sdpa_graph_partition(
// For optional additive mask
std::optional<op> mask_add;

- // For optional implicite causal mask
+ // For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;

@@ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition(
// For optional additive mask
std::optional<op> mask_add;

- // For optional implicite causal mask
+ // For optional implicit causal mask
std::optional<op> mask_gen_idx_row;
std::optional<logical_tensor> mask_row_idx;
std::optional<op> mask_gen_idx_col;

@@ -345,7 +345,7 @@ class Attr {
dnnl::memory binary_m;
auto binary = ops_params_[i].binary_;
auto md = ops_params_[i].meta_;
- // qeury expected_md to achieve peak performance
+ // query expected_md to achieve peak performance
auto expected_md = pd.query_md(
dnnl::query::exec_arg_md,
DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1);

@@ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) {
return false;
}

- // the overlaped cases are not supported
+ // the overlapped cases are not supported
dnnl::memory::dims strides = get_onednn_strides(tensor);
int64_t storage_size = 1;
for (size_t dim = 0; dim < tensor_dim; ++dim)

@@ -29,7 +29,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
- // agruments results in
+ // arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {

@@ -42,7 +42,7 @@
secondaryTensor:(MPSGraphTensor*)secondaryTensor
name:(NSString*)name {
// As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
- // agruments results in
+ // arguments results in
// /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
// failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {

@@ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,

static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
// Use gather kernel to solve strides for macOS < 15.0
- // Starting with macOS 15.0, MPS supports native strides direclty in the kernels
+ // Starting with macOS 15.0, MPS supports native strides directly in the kernels
if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
Tensor emptyShell = Tensor();

@@ -158,7 +158,7 @@ static void reduction_out_mps(const Tensor& input_t,
IntArrayRef dim = opt_dim.value();
for (const auto dim_val : dim) {
auto wrap_dim = maybe_wrap_dim(dim_val, input_shape.size());
- // canSqueeze logic is broken when dim is negative, it introduces off-by-one-erros or crashes
+ // canSqueeze logic is broken when dim is negative, it introduces off-by-one-errors or crashes
// See https://github.com/pytorch/pytorch/issues/136132#issuecomment-2354482608
if (wrap_dim >= 4 || dim_val < 0) {
canSqueezeLastDim = false;

@@ -1282,7 +1282,7 @@ static void all_any_common_impl_mps(const Tensor& input_t,
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);

auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
- // reductionOrWithTensor:axis: will throw an internal assert if number of dimentions is more than 4
+ // reductionOrWithTensor:axis: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
MPSGraphTensor* outputTensor = nil;
if (input_t.ndimension() > 4) {

@@ -1352,7 +1352,7 @@ TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
- // reductionOrWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
+ // reductionOrWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
if (input_t.dim() > 4) {
castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];

@@ -1400,7 +1400,7 @@ TORCH_IMPL_FUNC(all_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
- // reductionAndWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
+ // reductionAndWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
// See https://github.com/pytorch/pytorch/issues/95538
if (input_t.ndimension() > 4) {
castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];

@@ -19,7 +19,7 @@ namespace at::native::mps {

// For both scatter and gather kernels, there are 4 specized ones (for 1D to 4D tensor)
// and one generic, for 5+D ones. Assumption (to be tested) about specialized kernels
- // is that reduction of n-dimentional vector, where n is 2, should be slower
+ // is that reduction of n-dimensional vector, where n is 2, should be slower
// than reduction of 2D one, as n is not known at compiler time, therefore compiler
// could not do loop unrolls, that is
// float sum(float* v, int n) {

@@ -53,7 +53,7 @@ C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
normalized_shape);

// Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
- // Also, compute M and N considering the idiosyncracies of NestedTensors
+ // Also, compute M and N considering the idiosyncrasies of NestedTensors
int64_t N = 1;
for (const auto i: c10::irange(normalized_ndim)) {
TORCH_CHECK(

@@ -95,7 +95,7 @@ std::vector<Tensor> chunk_nested_tensor(const Tensor& self, int64_t chunks, int6
for (const auto split_idx : c10::irange(chunks)) {
auto new_sizes = sizes.clone();
auto new_strides = strides.clone();
- // This copys offsets so we are safe to move
+ // This copies offsets so we are safe to move
auto new_offsets = offsets.clone();
int64_t *size_ptr = new_sizes.data_ptr<int64_t>();
int64_t *new_offsets_ptr = new_offsets.data_ptr<int64_t>();

@@ -245,7 +245,7 @@ int64_t get_nnz(const Tensor& nestedtensor) {
// this is because needs_broadcast indicates that the batch_size is 1
// and hence there is only 1 value for seq_len
// (2) The cum_seq_lens are given by [0, {*}_t.size(1), 2 * {*}_t.size(1),
- // ..., outut_batch_size * {*}_t.size(1)] (3) Nnz_{*} is given by
+ // ..., output_batch_size * {*}_t.size(1)] (3) Nnz_{*} is given by
// output_batch_size * {*}_t.size(1);

int64_t max_seqlen_batch_q = 0, Nnz_q = 0;

@@ -193,12 +193,12 @@ vTensor pack_biases_quantized_weights(
src_kw_sz = b_sizes[Layout::BatchMatrices::width];
src_kh_sz = b_sizes[Layout::BatchMatrices::height];
} else if (bias.sizes().size() == 2) {
- // skip batch dim for boardcasting; index -1
+ // skip batch dim for broadcasting; index -1
src_kb_sz = 1;
src_kw_sz = b_sizes[Layout::BatchMatrices::height];
src_kh_sz = b_sizes[Layout::BatchMatrices::batch];
} else {
- // skip batch & height dim for boardcasting; index -2
+ // skip batch & height dim for broadcasting; index -2
src_kb_sz = 1;
src_kw_sz = b_sizes[Layout::BatchMatrices::batch];
src_kh_sz = 1;

@@ -327,13 +327,13 @@ bool available_check_with_batch(
weight.size(Layout::BatchMatrices::batch) ||
bias->size(Layout::BatchMatrices::batch) == 1);
} else if (bias->ndimension() == 2) {
- // skip batch dim for boardcasting; index -1
+ // skip batch dim for broadcasting; index -1
bias_available &=
(bias->size(Layout::BatchMatrices::height) ==
weight.size(Layout::BatchMatrices::width) ||
bias->size(Layout::BatchMatrices::height) == 1);
} else {
- // skip batch & height dim for boardcasting; index -2
+ // skip batch & height dim for broadcasting; index -2
bias_available &=
(bias->size(Layout::BatchMatrices::batch) ==
weight.size(Layout::BatchMatrices::width) ||

@@ -158,7 +158,7 @@ class TORCH_API Tensor: public TensorBase {
// will only lead to trouble and dangling references.
c10::MaybeOwned<Tensor> expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) && = delete;

- // The following overloads are very intruiging. Consider the following
+ // The following overloads are very intriguing. Consider the following
// program:
//
// x[1] = 3;

@@ -6894,7 +6894,7 @@ TEST_F(VulkanAPITest, slice_height_success) {
{2, {2, 3, 40, 50}}, // 4D tensors with dim=height
{1, {3, 40, 50}}, // 3D tensors with dim=height
{0, {40, 50}}, // 2D tensors with dim=height
- // 1D tesnors don't have height dim for test
+ // 1D tensors don't have height dim for test
};

// Act/Assert

@@ -6906,7 +6906,7 @@ TEST_F(VulkanAPITest, slice_feature_success) {
std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
{1, {2, 40, 13, 14}}, // 4D tensors with dim=feature(channel)
{0, {40, 13, 14}}, // 3D tensors with dim=feature(channel)
- // 1D and 2D tesnors don't have feature(channel) dim for test
+ // 1D and 2D tensors don't have feature(channel) dim for test
};

// Act/Assert

@@ -6917,7 +6917,7 @@ TEST_F(VulkanAPITest, slice_batch_success) {
// Arrange
std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
{0, {40, 3, 13, 14}}, // 4D tensors with dim=batch
- // 1D, 2D and 3D tesnors don't have batch dim for test
+ // 1D, 2D and 3D tensors don't have batch dim for test
};

// Act/Assert