[1/N][Fix] Fix typo in aten folder (#166126)
Fix typo in aten folder.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166126
Approved by: https://github.com/cyyever, https://github.com/slayton58
parent f2c81635c8, commit f89a7e9fe8

@@ -38,7 +38,7 @@ set_bool(AT_HIPSPARSELT_ENABLED CAFFE2_USE_HIPSPARSELT)

 configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
 # TODO: Do not generate CUDAConfig.h for ROCm BUILDS
-# At the moment, `jit_macors.h` include CUDAConfig.h for both CUDA and HIP builds
+# At the moment, `jit_macros.h` include CUDAConfig.h for both CUDA and HIP builds
 if(USE_CUDA OR USE_ROCM)
 configure_file(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h")
 endif()

@@ -122,7 +122,7 @@ void FunctionalTensorWrapper::freeze_storage() const {
 // | have their own storages, but backends like functorch |
 // \/ are allowed to re-alias underneath the pass \/
 // . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - .
-// | underyling_storage | | underyling_storage |
+// | underlying_storage | | underlying_storage |
 // . - - - - - - - - - - - - - . . - - - - - - - - - - - - - - - .
 //
 // This constructor is only used by view ops.

@@ -1534,7 +1534,7 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) {

 // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer.
 // Nothing beyond this point is important for meta functions, so it's fine to exit early here.
-// Extend the condition to MAIA tesnors as MAIA tensors also don't have storage.
+// Extend the condition to MAIA tensors as MAIA tensors also don't have storage.
 if (privateuse1_without_storage ||
 common_device_.type() == DeviceType::XLA ||
 common_device_.type() == DeviceType::IPU ||

@@ -94,11 +94,11 @@ struct PinnedReserveSegment {
 struct TORCH_API HostStats {
 // COUNT: total allocations (active)
 Stat active_requests;
-// SUM: bytes allocated/reserved by this memory alocator. (active)
+// SUM: bytes allocated/reserved by this memory allocator. (active)
 Stat active_bytes;
 // COUNT: total allocations (active + free)
 Stat allocations;
-// SUM: bytes allocated/reserved by this memory alocator. This accounts
+// SUM: bytes allocated/reserved by this memory allocator. This accounts
 // for both free and in-use blocks.
 Stat allocated_bytes;

@@ -127,7 +127,7 @@ struct alignas(hardware_destructive_interference_size) HostStatsStaged {
 // COUNT: total allocations (active + free)
 // LOCK: access to this stat is protected by the allocator's blocks_mutex_
 Stat allocations;
-// SUM: bytes allocated/reserved by this memory alocator. This accounts
+// SUM: bytes allocated/reserved by this memory allocator. This accounts
 // for both free and in-use blocks.
 Stat allocated_bytes;
 // COUNT: number of allocations per bucket (active)

@@ -455,7 +455,7 @@ struct CachingHostAllocatorImpl {
 }

 void resetAccumulatedStats() {
-// Reseting accumulated memory stats requires concurrently holding both the
+// Resetting accumulated memory stats requires concurrently holding both the
 // free list mutexes and the blocks mutex. Previously, this was only done in
 // empty_cache function.
 for (size_t i = 0; i < free_list_.size(); ++i) {

@@ -482,7 +482,7 @@ struct CachingHostAllocatorImpl {
 }

 void resetPeakStats() {
-// Reseting peak memory stats requires concurrently holding both the
+// Resetting peak memory stats requires concurrently holding both the
 // free list mutexes and the blocks mutex. Previously, this was only done in
 // empty_cache function.
 for (size_t i = 0; i < free_list_.size(); ++i) {

@@ -148,7 +148,7 @@ struct TORCH_API ClassType : public NamedType {

 void checkNotExist(const std::string& name, const std::string& what) const;

-// Attributes are stored in a specific slot at runtime for effiency.
+// Attributes are stored in a specific slot at runtime for efficiency.
 // When emitting instructions we specify the slot so that attribute access is
 // a constant lookup
 std::optional<size_t> findAttributeSlot(const std::string& name) const {

@@ -412,7 +412,7 @@ struct TORCH_API ClassType : public NamedType {
 // Holds method attributes
 std::weak_ptr<CompilationUnit> compilation_unit_;

-// Holds all atrributes, attribute details are found on ClassAttribute
+// Holds all attributes, attribute details are found on ClassAttribute
 std::vector<ClassAttribute> attributes_;
 // Construct mirroring attributes_, only around due to the fact that `containedTypes()` method returns an ArrayRef.
 // Never fill this without using the appropriate provideNewClassAttribute method

@@ -537,7 +537,7 @@ int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchK

 // Note: this records a sequence number for both Autograd keys, and for
 // non-Autograd keys where the dispatchKeySet still contains an autograd key.
-// This means that we might collect the same sequence nubmer two different
+// This means that we might collect the same sequence number two different
 // events if they all occurred above Autograd and still had the Autograd
 // dispatch key in the dispatch key set.
 // However, this usually doesn't happen: normally the first call will

@@ -585,7 +585,7 @@ class TORCH_API OperatorHandle {

 // We need to store this iterator in order to make
 // Dispatcher::cleanup() fast -- it runs a lot on program
-// termination (and presuambly library unloading).
+// termination (and presumably library unloading).
 std::list<Dispatcher::OperatorDef>::iterator operatorIterator_;
 };

@@ -365,7 +365,7 @@ std::pair<const AnnotatedKernel&, const char*> OperatorEntry::computeDispatchTab
 // For autograd keys, we only use kernel from CompositeImplicitAutograd when there's no direct registration
 // to its corresponding backend key or CompositeExplicitAutograd. See Note [CompositeExplicitAutograd and CompositeImplicitAutograd].
 // For AutogradOther, we eagerly return ambiguousAutogradOtherKernel() if there's registration to any of
-// its backends and ask backend extender to request a decicated Autograd key for the backend.
+// its backends and ask backend extender to request a dedicated Autograd key for the backend.
 // See Note [Ambiguity in AutogradOther kernel] for more details.
 // A CompositeExplicitAutograd kernel prevents CompositeImplicitAutograd kernel being used for Autograd keys, but it doesn't
 // cause confusion for AutogradOther. It's pretty straightforward to use Autograd (if available)

@@ -261,7 +261,7 @@ std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) {
 //
 // There are 2 cases
 // 1. something like 'aten::items.str(Dict(str, t) self) -> ((str, t)[])'.
-// without the extra parenthesis, the c++ schem parser can not parse it.
+// without the extra parenthesis, the c++ scheme parser can not parse it.
 // 2. something like '-> ((str, str))'. Need extra parenthesis so the return
 // type is a single tuple rather than two strings.
 // PR (https://github.com/pytorch/pytorch/pull/23204) has more context about

@@ -1176,7 +1176,7 @@ struct TORCH_API IValue final {
 using HashIdentityIValueMap =
 std::unordered_map<IValue, IValue, HashIdentityIValue, CompIdentityIValues>;

-// Chechs if this and rhs has a subvalues in common.
+// Checks if this and rhs has a subvalues in common.
 // [t1,t2] and [t2, t3] returns true.
 bool overlaps(const IValue& rhs) const;

@@ -1501,7 +1501,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
 // However, the CompilationUnit holds ownership of the type's graphs, so
 // inserting a constant object into a Graph would create a reference cycle if
 // that constant object held a shared_ptr to its CU. For these objects we
-// instatiate them with non-owning references to its CU
+// instantiate them with non-owning references to its CU
 Object(WeakOrStrongTypePtr type, size_t numSlots) : type_(std::move(type)) {
 slots_.resize(numSlots);
 }

@@ -373,7 +373,7 @@ struct TORCH_API SymbolicShape {
 // Unranked shape constructor.
 SymbolicShape() : dims_(std::nullopt) {}

-// Known rank but unknown dimentions.
+// Known rank but unknown dimensions.
 SymbolicShape(std::optional<size_t> rank) : dims_(std::nullopt) {
 if(!rank) {
 return;

@@ -884,9 +884,9 @@ struct TORCH_API ListType

 // global singleton
 // Given an inner type T and an identifier,
-// this function wil return the global singleton type pointer
+// this function will return the global singleton type pointer
 // the type List<T>.
-// The extra "identifier" argument is needed beccause we have multiple container types
+// The extra "identifier" argument is needed because we have multiple container types
 // that all re-use this function (List<T>, array<T, N>, etc.)
 static TypePtr get(const std::string& identifier, TypePtr inner);

@@ -21,7 +21,7 @@ namespace c10 {

 namespace detail {
 // The first argument of the schema might be of type DispatchKeySet, in which case we remove it.
-// We do this because every argument in a function schema is expected to be convertable
+// We do this because every argument in a function schema is expected to be convertible
 // to an ivalue, but DispatchKeySet is not a type we want the jit to be aware of.
 // See Note [Plumbing Keys Through The Dispatcher]
 template<class KernelFunctor>

@@ -251,7 +251,7 @@ TEST(OperatorRegistrationTest, whenRegisteringCPUTensorType_thenCanOnlyCallUnbox
 callOpUnboxedWithPrecomputedDispatchKeySet<void, Tensor>(*op, c10::DispatchKeySet(c10::DispatchKey::CPU), dummyTensor(c10::DispatchKey::CUDA));
 EXPECT_TRUE(called_kernel_cpu);

-// Ensure that disptach key from tensor is not used here.
+// Ensure that dispatch key from tensor is not used here.
 called_kernel_cpu = false;
 expectThrows<c10::Error>([&] {
 callOpUnboxedWithPrecomputedDispatchKeySet<void, Tensor>(*op, c10::DispatchKeySet(c10::DispatchKey::CUDA), dummyTensor(c10::DispatchKey::CPU));

@@ -172,7 +172,7 @@ VaryingShape<Stride> TensorType::computeStrideProps(
 // The logic below follows what TensorIterator uses in its logic:
 // 1. Fast_set_up is the short-cut to identify a. channels_last and
 // b. contiguous format, which is what we have in the below logic.
-// 2. In more generla cases, it does best effort to preserve permutatoin.
+// 2. In more general cases, it does best effort to preserve permutatoin.
 if (is_channels_last_strides_2d(sizes, strides) || is_channels_last_strides_3d(sizes, strides)) {
 // case 1.a. short cut channels last
 std::iota(stride_indices.rbegin() + 1, stride_indices.rend() - 1, 2);

@@ -234,7 +234,7 @@ class Vectorized<c10::Half> : public Vectorized16<
 vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift);
 return vaddvq_u16(bits_vec);
 #else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-// use known working implmentation.
+// use known working implementation.
 __at_align__ value_type tmp[size()];
 store(tmp);
 int mask = 0;

@@ -1740,7 +1740,7 @@ Vectorized<int16_t> inline shift_256_16(

 // Control masks for shuffle operation, treating 256 bits as an
 // array of 16-bit elements, and considering pairs of neighboring
-// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
+// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
 // M!=N) is set so that shuffle will move element with index M from
 // input pair into element with index N in output pair, and element
 // with index M in output pair will be set to all 0s.

@@ -1875,7 +1875,7 @@ Vectorized<T> inline shift_256_8(

 // Control masks for shuffle operation, treating 256 bits as an
 // array of 8-bit elements, and considering quadruples of
-// neighboring elements. Specifially, a mask named "ctl_M_N" (M,N
+// neighboring elements. Specifically, a mask named "ctl_M_N" (M,N
 // in [0,1,2,3], and M!=N) is set so that shuffle will move element
 // with index M from input quadruple into element with index N in
 // output quadruple, and other elements in output quadruple will be

@@ -143,7 +143,7 @@ class Vectorized<double> {
 const Vectorized<double>& a,
 const Vectorized<double>& b,
 const Vectorized<double>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256

 return {
 vec_sel(a._vec0, b._vec0, mask._vecb0),

@@ -142,7 +142,7 @@ class Vectorized<float> {
 const Vectorized<float>& a,
 const Vectorized<float>& b,
 const Vectorized<float>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256
 // assuming this we can use the same mask directly with vec_sel
 return {
 vec_sel(a._vec0, b._vec0, mask._vecb0),

@@ -202,7 +202,7 @@ class Vectorized<int16_t> {
 const Vectorized<int16_t>& a,
 const Vectorized<int16_t>& b,
 const Vectorized<int16_t>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256
 // assuming this we can use the same mask directly with vec_sel
 // warning intel style mask will not work properly
 return {

@@ -155,7 +155,7 @@ class Vectorized<int32_t> {
 const Vectorized<int32_t>& a,
 const Vectorized<int32_t>& b,
 const Vectorized<int32_t>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256
 // assuming this we can use the same mask directly with vec_sel
 // warning intel style mask will not work properly
 return {

@@ -119,7 +119,7 @@ class Vectorized<int64_t> {
 const Vectorized<int64_t>& a,
 const Vectorized<int64_t>& b,
 const Vectorized<int64_t>& mask) {
-// the mask used here returned by comparision of vec256
+// the mask used here returned by comparison of vec256

 return {
 vec_sel(a._vec0, b._vec0, mask._vecb0),

@@ -397,7 +397,7 @@ inline Vectorized<bool> operator&&(
 const __m512i* other_ = reinterpret_cast<const __m512i*>(other.as_bytes());
 __m512i out = _mm512_and_si512(*self_, *other_);
 Vectorized<bool> ret;
-// We do not have a constructer that takes __m512i, so we need to memcpy
+// We do not have a constructor that takes __m512i, so we need to memcpy
 std::memcpy(ret, &out, ret.size() * sizeof(bool));
 return ret;
 }

@@ -1852,7 +1852,7 @@ Vectorized<T> inline shift_512_8(

 // Control masks for shuffle operation, treating 512 bits as an
 // array of 8-bit elements, and considering pairs of neighboring
-// elements. Specifially, a mask named "ctl_M_N" (M,N in [0,1], and
+// elements. Specifically, a mask named "ctl_M_N" (M,N in [0,1], and
 // M!=N) is set so that shuffle will move element with index M from
 // input pair into element with index N in output pair, and element
 // with index M in output pair will be set to all 0s.

@@ -634,7 +634,7 @@ struct Vectorized {
 }
 Vectorized<T> neg() const {
 // NB: the trailing return type is needed because we need to coerce the
-// return value back to T in the case of unary operator- incuring a
+// return value back to T in the case of unary operator- incurring a
 // promotion
 return map([](T x) -> T { return -x; });
 }

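The promotion mentioned in this hunk is ordinary C++ integral promotion: unary minus on a narrow integer type yields int, so without the trailing return type the lambda would deduce the promoted type. A minimal standalone sketch of that behaviour (illustrative only, not ATen code):

```cpp
#include <cstdint>
#include <type_traits>

int main() {
  int8_t x = 5;
  // Unary operator- applies integral promotion: -x has type int, not int8_t.
  static_assert(std::is_same_v<decltype(-x), int>);
  // A trailing return type coerces the result back to the narrow type,
  // which is what the `[](T x) -> T { return -x; }` lambda relies on.
  auto neg = [](int8_t v) -> int8_t { return -v; };
  static_assert(std::is_same_v<decltype(neg(x)), int8_t>);
  return 0;
}
```
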
@@ -1958,7 +1958,7 @@ void scaled_gemm(
 ScalarType result_dtype,
 bool use_fast_accum,
 const std::optional<Tensor>& alpha) {
-// Note: see `cublasCommonArgs` for various non-intuitive manupulations
+// Note: see `cublasCommonArgs` for various non-intuitive manipulations
 // of input arguments to this function.
 const auto computeType = CUBLAS_COMPUTE_32F;
 const auto scaleType = CUDA_R_32F;

@@ -307,7 +307,7 @@ CUDAGraph::~CUDAGraph() {
 // There are recent HIP changes where hipGraphExecDestroy doesn't immediately free memory.
 // They wait for next sync point in order to free the memory, this is to ensure that all
 // hipGraphLaunch are finished before we release any memory. This feature was enabled in rocm6.2.
-// We need to ensure all async opreations finish before deleting the object.
+// We need to ensure all async operations finish before deleting the object.
 #if (defined(USE_ROCM) && ROCM_VERSION >= 60200)
 if (capture_dev_ != UNDEFINED_DEVICE) // check if capture_dev_ contains the real device id
 {

@@ -137,7 +137,7 @@ struct CUDACachingHostAllocatorImpl
 void free_block_slowpath(Block* block) {
 auto start = std::chrono::steady_clock::now();
 // Users may change the allocator config at will. torch unit tests do this.
-// However, allocations using cudaHostRegister should use corresonding
+// However, allocations using cudaHostRegister should use corresponding
 // cudaHostUnregister and similarly for cudaHostAlloc / cudaFreeHost.
 void* ptr = block->ptr_;
 bool use_register = false;

@@ -4,7 +4,7 @@
 #include <ATen/cuda/CUDAConfig.h>

 // NOTE: These templates are intentionally not defined in this header,
-// which aviods re-compiling them for each translation unit. If you get
+// which avoids re-compiling them for each translation unit. If you get
 // a link error, you need to add an explicit instantiation for your
 // types in cub.cu

@@ -38,7 +38,7 @@ GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262
 GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033
 ```

-Note the "Validator" lines. If you change a library verison, or ROCm version, or PyTorch version, TunableOp will detect
+Note the "Validator" lines. If you change a library version, or ROCm version, or PyTorch version, TunableOp will detect
 this and reject the tunings file because the prior tunings are likely affected by other software changes.

 The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of

@@ -235,7 +235,7 @@ class TunableOp {
 // numeric check option is controlled by non-static env var, so check it once per tuned operator
 bool do_numerics_check = ctx->IsNumericsCheckEnabled();

-// calcaulte a reference answer for numerical check
+// calculate a reference answer for numerical check
 if (do_numerics_check) {
 reference_params = params->DeepCopy(false);
 TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);

@@ -12,7 +12,7 @@ namespace at {

 // AcceleratorHooksInterface is a shared interface provided by all
 // accelerators to allow generic code.
-// This inferface is hook-based as it corresponds to all the functions
+// This interface is hook-based as it corresponds to all the functions
 // that are going to be called in a generic way from the CPU code.

 struct TORCH_API AcceleratorHooksInterface {

@@ -38,7 +38,7 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {

 Generator getNewGenerator(
 [[maybe_unused]] DeviceIndex device_index = -1) const override {
-// TODO(FFFrog): Perserved for BC and will be removed in the future.
+// TODO(FFFrog): Preserved for BC and will be removed in the future.
 if (at::GetGeneratorPrivate().has_value())
 return at::GetGeneratorForPrivateuse1(device_index);

@@ -283,7 +283,7 @@ inline void boxed_existing_bdim_all_batch_rule(
 // Use when all tensors arguments accept one (normal) batch dim.
 // This batching rule expands the batch dim on all Tensors, reshapes it into
 // dim 0, calls the op, and then reshapes the batch dim out of dim 0.
-// This is not the most efficient thing; if there are alternatives, plese try
+// This is not the most efficient thing; if there are alternatives, please try
 // to use them. Use this only as a last resort.
 #define EXISTING_BDIM_ALL_BOXED(op) \
 m.impl(#op, torch::CppFunction::makeFromBoxedFunction<boxed_existing_bdim_all_batch_rule>());

@@ -384,7 +384,7 @@ fourOutputs solve_ex_batch_rule(

 // NOTE [ solve_ex Batch Rule Contiguity ]
 // A determines whether or not linalg_solve takes an optimized path. We need the check on A_ to match the one run on
-// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behvaior
+// A as BatchedTensor since it might have been saved by autograd (specifically by the jvp) and the autograd behavior
 // differs based on whether or not the optimized path was taken
 const auto batched_A_was_contiguous = A_bdim.has_value() ? at::select(A, *A_bdim, 0).is_contiguous() : A.is_contiguous();
 if (batched_A_was_contiguous && !A.is_complex()) {

@@ -282,7 +282,7 @@ static std::tuple<Tensor, std::optional<int64_t>> _softmax_backward_batch_rule(

 dim = getPhysicalDim(output_, /*has_batch_dim*/true, dim);

-// Not sure why output_ needs to be marked as .contiguous(). Someting must
+// Not sure why output_ needs to be marked as .contiguous(). Something must
 // have changed in PyTorch (and output of softmax is probably always contiguous)
 return std::make_tuple(at::_softmax_backward_data(grad_output_, output_.contiguous(), dim, input_dtype), 0);
 }

@@ -224,7 +224,7 @@ static Tensor safeStack(TensorList tensors) {
 // is possible for the backward function to return an undefined grad for some
 // grad_input for each example. In that case, we return an undefined grad.
 //
-// It is theoretically posssible for *some* of the examples to produce an
+// It is theoretically possible for *some* of the examples to produce an
 // undefined grad (a kernel could peek at the gradient values and return an
 // undefined tensor if it determines the gradient is full of zeros). We
 // could handle this by treating the undefined grad as a zero-filled tensor

@@ -113,7 +113,7 @@ SymIntArrayRef BatchedTensorImpl::sym_sizes_custom() const {
 return sym_sizes_default();
 }

-// The following are publically exposed as methods of Tensor
+// The following are publicly exposed as methods of Tensor

 IntArrayRef BatchedTensorImpl::strides_custom() const {
 return strides_default();

@@ -37,7 +37,7 @@ namespace at::functorch {
 // how to perform the transform.
 //
 // TODO: we can excise DynamicLayer in favor of Interpreter,
-// But I am going to leave it for now as a compatiblity shim to avoid
+// But I am going to leave it for now as a compatibility shim to avoid
 // needing to refactor a lot of callsites...
 struct TORCH_API DynamicLayer {
 explicit DynamicLayer(

@@ -88,7 +88,7 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t);
 // >>> VmapInterpreterPtr(&interpreter).batchSize()
 //
 // Finally, Interpreter::process switches on the type of the interpreter
-// and calls one of {Transform}Intepreter::processImpl under the hood.
+// and calls one of {Transform}Interpreter::processImpl under the hood.
 // Same for Interpreter::sendToNextInterpreter :)

 struct VmapInterpreterMeta {

@@ -733,7 +733,7 @@ TORCH_LIBRARY_IMPL(_, FuncTorchBatched, m) {
 }

 TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
-// still legacy b/c teturns multiple tensors
+// still legacy b/c returns multiple tensors
 m.impl("split.Tensor", split_batching_rule);
 m.impl("split_with_sizes", split_with_sizes_batching_rule);
 m.impl("split_with_sizes_copy", split_with_sizes_copy_batching_rule);

@@ -158,7 +158,7 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
 endKernelCoalescing();
 id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];

-// For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
+// For some reason fillBufferfor stopped working for length > 4Gb on MacOS 26
 // See https://github.com/pytorch/pytorch/issues/163962
 // Workaround by batching copy commands into 4Gb chunks
 constexpr size_t max_copy_size = 0x100000000; // 4GB

@@ -148,7 +148,7 @@ inline void checkInputsSolver(const Tensor& A,

 inline bool is_row_or_column_contiguous(const Tensor& t) {
 // This could be made more general, similar to how it's checked in matmul, which would allow to
-// ellide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky.
+// elide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky.
 // We choose to be conservative for simplicity
 return t.is_contiguous() || t.transpose(-2, -1).is_contiguous();
 }

@@ -21,7 +21,7 @@ enum class fft_norm_mode {
 // NOTE [ Fourier Transform Conjugate Symmetry ]
 //
 // Real-to-complex Fourier transform satisfies the conjugate symmetry. That is,
-// assuming X is the transformed K-dimensionsal signal, we have
+// assuming X is the transformed K-dimensional signal, we have
 //
 // X[i_1, ..., i_K] = X[j_i, ..., j_K]*,
 //

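Concretely, in one dimension this symmetry reads X[k] = X[(N - k) mod N]* for a real input of length N, which is why a real-to-complex transform only needs to store the first floor(N/2) + 1 outputs; the K-dimensional statement in the comment above is the same identity applied per index.
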
@@ -128,7 +128,7 @@ at::Tensor PackedLinearWeight::apply_impl(
 auto* input_tr_ptr =
 reinterpret_cast<uint8_t*>(input_tr.data_ptr<c10::quint8>());
 // TODO: Activation transpose before and after the kernel can be removed if we
-// keep activation tensor always tranposed.
+// keep activation tensor always transposed.
 fbgemm::transpose_simd<uint8_t>(
 batch_size, K, input_ptr, K, input_tr_ptr, batch_size);

@@ -520,7 +520,7 @@ cpu_adaptive_avg_pool3d_channels_last(
 scalar_t* out = output_data + i * channels;
 int64_t size = channels;

-// Note: For oridinary usage scenario, each out lane should
+// Note: For ordinary usage scenario, each out lane should
 // fit in L1 cache; otherwise consider block dim C.
 // Pass I: zero the out lane
 int64_t d1 = 0;

@@ -34,7 +34,7 @@ struct Dist {
 // finish : This tells what to do with the aggregated value to compute
 // the norm. Generally this is the result of val ^ (1 / p).
 // backward : This is the gradient for that norm. Arguments are pretty
-// self explanitory.
+// self explanatory.
 //
 // There are a few cases where these aren't used. The 0 norm has no backward,
 // because it's always 0, so that's shortcircuited earlier. There's a special

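As a quick illustration of the map / reduce / finish split this comment describes, here is the 2-norm of {3, 4} written out with those three stages; this is a minimal standalone sketch, not the actual Dist functors:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const double x[2] = {3.0, 4.0};
  const double p = 2.0;
  double agg = 0.0;
  for (double v : x) {
    agg += std::pow(std::abs(v), p);  // map: |v|^p, then reduce: sum
  }
  const double norm = std::pow(agg, 1.0 / p);  // finish: agg^(1/p)
  std::printf("%f\n", norm);                   // prints 5.000000
  return 0;
}
```
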
@@ -30,7 +30,7 @@ vec::Vectorized<scalar_t> is_nan_vec(vec::Vectorized<scalar_t> vec) {
 return vec.isnan();
 }

-// TODO: use is_integeral/is_same to check the scalar_t and simplify the implementation
+// TODO: use is_integral/is_same to check the scalar_t and simplify the implementation
 // currently it does not work
 template <>
 vec::Vectorized<unsigned char> is_nan_vec<unsigned char>(vec::Vectorized<unsigned char> vec) {

@@ -74,7 +74,7 @@ it to sum up the entire array into a single value.

 `ReduceOpsKernel.cpp` uses the `CPU_CAPABILITY_*` macros to "know" under which
 compiler flags it is currently compiled. This allows the programmer to write
-generic code, which will be compiled under multipled compilation settings.
+generic code, which will be compiled under multiplied compilation settings.

 `../ReduceOps.cpp` now includes the header `ReduceOpsKernel.h`, which contains
 a generic definition of `sumImplAll`. This function allows the user to reduce

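The mechanism this README describes is compile-once-per-capability plus a runtime dispatch table. A minimal standalone sketch of the same idea, assuming a hand-rolled pick_sum_kernel helper rather than ATen's real DECLARE_DISPATCH/REGISTER_DISPATCH machinery:

```cpp
#include <cstdio>

using SumFn = float (*)(const float*, int);

// In ATen the same kernel source is compiled several times, each build guarded
// by a different CPU_CAPABILITY_* flag; here two variants are written by hand.
static float sum_scalar(const float* p, int n) {
  float acc = 0.f;
  for (int i = 0; i < n; ++i) acc += p[i];
  return acc;
}

static float sum_avx2(const float* p, int n) {
  // Placeholder body: a real AVX2 build would use 256-bit loads and adds.
  return sum_scalar(p, n);
}

// Runtime selection: query the CPU once, then always call through the pointer.
static SumFn pick_sum_kernel(bool cpu_has_avx2) {
  return cpu_has_avx2 ? sum_avx2 : sum_scalar;
}

int main() {
  const float data[4] = {1.f, 2.f, 3.f, 4.f};
  SumFn sum = pick_sum_kernel(/*cpu_has_avx2=*/false);
  std::printf("%f\n", sum(data, 4));  // prints 10.000000
  return 0;
}
```
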
@@ -889,7 +889,7 @@ void ImagingResampleHorizontalConvolution8u(
 _mm_loadu_si128((__m128i *) (lineIn_min + stride * i))),
 _mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1);

-// Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA
+// Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA
 // RGBA: pix1 = [
 // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0
 // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0

@@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002);
 _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI


-/* evaluation of 8 sines at onces using AVX intrinsics
+/* evaluation of 8 sines at once using AVX intrinsics

 The code is the exact rewriting of the cephes sinf function.
 Precision is excellent as long as x < 8192 (I did not bother to

@@ -311,7 +311,7 @@ void GroupNormKernelImplChannelsLastInternal(
 const bool gamma_null = (gamma_data == nullptr);
 const bool beta_null = beta_data == nullptr;

-// NB: About algorithm choosen:
+// NB: About algorithm chosen:
 //
 // On channels last, GroupNorm has a input shape of {N, H, W, GD},
 // Mean and rstd are collected per each n and g, which involves reduction

@@ -930,7 +930,7 @@ void ref_dyn_quant_matmul_4bit_channelwise_kernel(
 }
 };

-// Dynamically Quantize the float32 input to 8 bit assymetric
+// Dynamically Quantize the float32 input to 8 bit asymmetric
 input_quant_pack_8bit_channelwise(m, k, lhs_f32, (int8_t*)lhs_qa8dx);

 const size_t lhs_stride =

@@ -1163,7 +1163,7 @@ void dyn_quant_matmul_4bit_kernel(
 const int64_t weight_packed_size =
 kleidiai::kai_pack_rhs_int4_size(N, K, block_size);
 if (weight_packed_size == packed_weights.numel()) {
-// KleidiAI interface intenally handles the Channelwise and groupwise
+// KleidiAI interface internally handles the Channelwise and groupwise
 // distinction
 kleidiai::kai_quant_pack_lhs_int4_mm(
 output, inp, packed_weights, M, N, K, block_size);

@@ -505,7 +505,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
 // Handle whether to use the Lt interface {
 static bool persistent_disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device());
 // if lt path fails, we recurse back into this function here and force the lt path to off
-// we cannot update varible disable_addmm_cuda_lt from above since it is static and would be permanent
+// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
 bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
 #ifdef USE_ROCM
 // Conditioned on the device index, which is not persistent

@@ -494,7 +494,7 @@ void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen)
 auto value = static_cast<scalar_t>(rand * range + from);
 // reverse the bounds of curand4 from (0, 1] to [0, 1)
 // Note that this method is from legacy THCTensorRandom and is likely to give
-// you more 0-s, since, the probability of gettings 1-s is higher than 0-s and
+// you more 0-s, since, the probability of getting 1-s is higher than 0-s and
 // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s.
 // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706
 auto reverse_bound_value = value == to ? from : value;

@@ -154,7 +154,7 @@ REGISTER_CUDA_DISPATCH(lstsq_stub, &lazy_lstsq_kernel)

 // Old style dispatches
 // torch_cuda_linalg dynamic library should have a global constructor
-// that calls regiserLinaglDispatch so in order ot lazy bind
+// that calls registerLinalgDispatch so in order ot lazy bind
 // old style dispatch all one have to do is to load library and call disp.func_name
 // Protect from infinite recursion by initializing dispatch to self and checking
 // that values are different after linalg library were loaded

@@ -1532,7 +1532,7 @@ NvrtcFunction jit_pwise_function(

 std::string file_path;
 if (cache_dir.has_value()) {
-// Attemps to read from the cache.
+// Attempts to read from the cache.
 // Cubin name is <kernel name>_arch<major>.<minor>_nvrtc<major>.<minor>_<ptx or sass>_<program length>_<string hash>
 // Note that the SHA1 hash used in the file name is NOT the SHA1 hash of the file's contents,
 // because we hash on the CUDA code, but we save the compiled ptx or sass

@@ -1346,7 +1346,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
 });

 if (input.dim() > 2) {
-// if upper=true we need to tranpose and conjugate the result tensor
+// if upper=true we need to transpose and conjugate the result tensor
 // because the cholesky decomposition is stored in the lower triangular part
 if (upper) {
 input.copy_(result.mH());

@@ -1857,7 +1857,7 @@ void geqrf_kernel(const Tensor& input, const Tensor& tau) {

 auto preferred_backend = at::globalContext().linalgPreferredBackend();
 switch (preferred_backend) {
-// TODO Investigate whether the following magma bug is still occuring.
+// TODO Investigate whether the following magma bug is still occurring.
 // It may be the case that geqrf followed by orgqr is wrong for the magma backend
 // geqrf_magma currently uses geqrf2_gpu
 //

@@ -82,7 +82,7 @@ void lu_factor_looped_cusolver(const Tensor& self, const Tensor& pivots, const T
 #if defined(BUILD_LAZY_CUDA_LINALG)
 namespace cuda { namespace detail {
 // This is only used for an old-style dispatches
-// Please do not add any new entires to it
+// Please do not add any new entries to it
 struct LinalgDispatch {
 Tensor (*cholesky_solve_helper)(const Tensor& self, const Tensor& A, bool upper);
 };

@@ -147,7 +147,7 @@ static void check_shape_forward(const Tensor& input,
 // blocked format will propagate between layers. Input, output will be in blocked format.
 //
 // For inference case, weight can be prepacked into blocked format by
-// (so as to save weight reoder overhead):
+// (so as to save weight reorder overhead):
 // model = torch.utils.mkldnn.to_mkldnn(model)
 //
 // For training case, grad_output can be CPU tensor or MKLDNN tensor,

@@ -723,7 +723,7 @@ Tensor _mkldnn_convolution_transpose(
 ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true);
 if (!weight.is_mkldnn()) {
 // mkldnn transposed convolution has weight in logical order of OIHW or OIDHW,
-// while PyTorch has IOHW or IODHW, `._tranpose()` switches strides (no memory copy).
+// while PyTorch has IOHW or IODHW, `._transpose()` switches strides (no memory copy).
 w.transpose_(0, 1);
 }

@@ -540,7 +540,7 @@ static void _mkldnn_matmul_i8i8i32_with_primitive(
 args.insert({DNNL_ARG_WEIGHTS, expected_weight});
 args.insert({DNNL_ARG_DST, dst});
 args.insert({DNNL_ARG_SCRATCHPAD, scratchpad});
-// Create primitve and execute
+// Create primitive and execute
 auto primitive = dnnl::matmul(prim_desc);
 primitive.execute(ideep::stream::default_stream(), args);
 }

@@ -439,7 +439,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> mkldnn_rnn_la
 // I. Memory Formats
 // a. mkldnn will use plain formats for input, hx/cx, output, hy/cy
 // and possibly use blocked formats for weights depending shape info.
-// b. All mkldnn memorys are created (in plain format) as views on ATen tensor,
+// b. All mkldnn memories are created (in plain format) as views on ATen tensor,
 // the weight reorder(if any) is handed automatically inside ideep (mkldnn bridge)
 //
 // II. MKLDNN Primitive Mapping

@@ -39,7 +39,7 @@ void check_mkldnn_binary_fusion_inputs(
 inline std::vector<int64_t> padding_r(
 IntArrayRef padding, IntArrayRef output_padding)
 {
-// ConvTranpose padding adjustment
+// ConvTranspose padding adjustment
 //
 // PyTorch uses padding/output_padding:
 // osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1

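As a worked instance of the output-size formula quoted in this hunk: with isize = 5, stride = 2, padding = 1, dilation = 1, kernel_size = 3 and output_padding = 1, osize = (5 - 1) * 2 - 2 * 1 + 1 * (3 - 1) + 1 + 1 = 10.
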
@@ -75,7 +75,7 @@ bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
 }

 bool can_use_flash_attention(sdp::sdp_params const& params, bool debug) {
-// Currently, XPU fallbacks flash attention to overrideable
+// Currently, XPU fallbacks flash attention to overridable
 return can_use_overrideable_attention(params, debug);
 }

@@ -115,7 +115,7 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
 // 1. Flash Attention
 // 2. Math fallback
 auto& ctx = at::globalContext();
-// use overrideable linked to onednn as overrideable implementation
+// use overridable linked to onednn as overridable implementation
 if (!ctx.userEnabledMathSDP() && !ctx.userEnabledOverrideableSDP() &&
 !ctx.userEnabledFlashSDP()) {
 return sdp::SDPBackend::error;

@@ -165,7 +165,7 @@ sdp::SDPBackend select_sdp_backend_xpu(sdp::sdp_params const& kernel_params) {
 }
 }
 // If we have gotten to this point then two things have happened:
-// 1. can_use_overrideable_attention did not satisfy the constraints to be ran
+// 1. can_use_overridable_attention did not satisfy the constraints to be ran
 // 2. The user has explicitly disabled the math kernel
 // We then re-run the kernel checks with debug enabled to print out the
 // reason why the kernel was not selected

@@ -215,7 +215,7 @@ partition create_sdpa_graph_partition(
 // For optional additive mask
 std::optional<op> mask_add;

-// For optional implicite causal mask
+// For optional implicit causal mask
 std::optional<op> mask_gen_idx_row;
 std::optional<logical_tensor> mask_row_idx;
 std::optional<op> mask_gen_idx_col;

@@ -556,7 +556,7 @@ partition create_sdpa_backward_graph_partition(
 // For optional additive mask
 std::optional<op> mask_add;

-// For optional implicite causal mask
+// For optional implicit causal mask
 std::optional<op> mask_gen_idx_row;
 std::optional<logical_tensor> mask_row_idx;
 std::optional<op> mask_gen_idx_col;

@@ -345,7 +345,7 @@ class Attr {
 dnnl::memory binary_m;
 auto binary = ops_params_[i].binary_;
 auto md = ops_params_[i].meta_;
-// qeury expected_md to achieve peak performance
+// query expected_md to achieve peak performance
 auto expected_md = pd.query_md(
 dnnl::query::exec_arg_md,
 DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1);

@@ -301,7 +301,7 @@ bool is_onednn_matmul_strides(const at::Tensor& tensor) {
 return false;
 }

-// the overlaped cases are not supported
+// the overlapped cases are not supported
 dnnl::memory::dims strides = get_onednn_strides(tensor);
 int64_t storage_size = 1;
 for (size_t dim = 0; dim < tensor_dim; ++dim)

@@ -29,7 +29,7 @@
 secondaryTensor:(MPSGraphTensor*)secondaryTensor
 name:(NSString*)name {
 // As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
-// agruments results in
+// arguments results in
 // /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
 // failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
 if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {

@@ -42,7 +42,7 @@
 secondaryTensor:(MPSGraphTensor*)secondaryTensor
 name:(NSString*)name {
 // As of MacOS-15.1 m..imumWithNanPropagation is only defined for floating types and calling it with integral
-// agruments results in
+// arguments results in
 // /AppleInternal/Library/BuildRoots/c7c74b64-74b4-11ef-aeda-9635a580fe0d/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSCore/Utility/MPSKernelDAG.mm:805:
 // failed assertion `Error getting visible function: (null) Function isNaN_u8_i8 was not found in the library'
 if (([primaryTensor dataType] & MPSDataTypeFloatBit) == 0) {

@@ -539,7 +539,7 @@ Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,

 static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
 // Use gather kernel to solve strides for macOS < 15.0
-// Starting with macOS 15.0, MPS supports native strides direclty in the kernels
+// Starting with macOS 15.0, MPS supports native strides directly in the kernels
 if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
 if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
 Tensor emptyShell = Tensor();

@@ -158,7 +158,7 @@ static void reduction_out_mps(const Tensor& input_t,
 IntArrayRef dim = opt_dim.value();
 for (const auto dim_val : dim) {
 auto wrap_dim = maybe_wrap_dim(dim_val, input_shape.size());
-// canSqueeze logic is broken when dim is negative, it introduces off-by-one-erros or crashes
+// canSqueeze logic is broken when dim is negative, it introduces off-by-one-errors or crashes
 // See https://github.com/pytorch/pytorch/issues/136132#issuecomment-2354482608
 if (wrap_dim >= 4 || dim_val < 0) {
 canSqueezeLastDim = false;

@@ -1282,7 +1282,7 @@ static void all_any_common_impl_mps(const Tensor& input_t,
 auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);

 auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
-// reductionOrWithTensor:axis: will throw an internal assert if number of dimentions is more than 4
+// reductionOrWithTensor:axis: will throw an internal assert if number of dimensions is more than 4
 // See https://github.com/pytorch/pytorch/issues/95538
 MPSGraphTensor* outputTensor = nil;
 if (input_t.ndimension() > 4) {

@@ -1352,7 +1352,7 @@ TORCH_IMPL_FUNC(any_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
 auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
 auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
 auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
-// reductionOrWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
+// reductionOrWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
 // See https://github.com/pytorch/pytorch/issues/95538
 if (input_t.dim() > 4) {
 castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];

@@ -1400,7 +1400,7 @@ TORCH_IMPL_FUNC(all_all_out_mps)(const Tensor& input_t, const Tensor& output_t)
 auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
 auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_t);
 auto castInputTensor = castToIHFTypes(mpsGraph, inputTensor, input_t);
-// reductionAndWithTensor:axes: will throw an internal assert if number of dimentions is more than 4
+// reductionAndWithTensor:axes: will throw an internal assert if number of dimensions is more than 4
 // See https://github.com/pytorch/pytorch/issues/95538
 if (input_t.ndimension() > 4) {
 castInputTensor = [mpsGraph reshapeTensor:castInputTensor withShape:@[ @-1 ] name:nil];

@@ -19,7 +19,7 @@ namespace at::native::mps {

 // For both scatter and gather kernels, there are 4 specized ones (for 1D to 4D tensor)
 // and one generic, for 5+D ones. Assumption (to be tested) about specialized kernels
-// is that reduction of n-dimentional vector, where n is 2, should be slower
+// is that reduction of n-dimensional vector, where n is 2, should be slower
 // than reduction of 2D one, as n is not known at compiler time, therefore compiler
 // could not do loop unrolls, that is
 // float sum(float* v, int n) {

@@ -53,7 +53,7 @@ C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
 normalized_shape);

 // Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
-// Also, compute M and N considering the idiosyncracies of NestedTensors
+// Also, compute M and N considering the idiosyncrasies of NestedTensors
 int64_t N = 1;
 for (const auto i: c10::irange(normalized_ndim)) {
 TORCH_CHECK(

@@ -95,7 +95,7 @@ std::vector<Tensor> chunk_nested_tensor(const Tensor& self, int64_t chunks, int6
 for (const auto split_idx : c10::irange(chunks)) {
 auto new_sizes = sizes.clone();
 auto new_strides = strides.clone();
-// This copys offsets so we are safe to move
+// This copies offsets so we are safe to move
 auto new_offsets = offsets.clone();
 int64_t *size_ptr = new_sizes.data_ptr<int64_t>();
 int64_t *new_offsets_ptr = new_offsets.data_ptr<int64_t>();

@@ -245,7 +245,7 @@ int64_t get_nnz(const Tensor& nestedtensor) {
 // this is because needs_broadcast indicates that the batch_size is 1
 // and hence there is only 1 value for seq_len
 // (2) The cum_seq_lens are given by [0, {*}_t.size(1), 2 * {*}_t.size(1),
-// ..., outut_batch_size * {*}_t.size(1)] (3) Nnz_{*} is given by
+// ..., output_batch_size * {*}_t.size(1)] (3) Nnz_{*} is given by
 // output_batch_size * {*}_t.size(1);

 int64_t max_seqlen_batch_q = 0, Nnz_q = 0;

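To make the cum_seq_lens description in this hunk concrete: if {*}_t.size(1) = 5 and output_batch_size = 3, the sequence above is [0, 5, 10, 15] and Nnz_{*} = 3 * 5 = 15.
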
@@ -193,12 +193,12 @@ vTensor pack_biases_quantized_weights(
 src_kw_sz = b_sizes[Layout::BatchMatrices::width];
 src_kh_sz = b_sizes[Layout::BatchMatrices::height];
 } else if (bias.sizes().size() == 2) {
-// skip batch dim for boardcasting; index -1
+// skip batch dim for broadcasting; index -1
 src_kb_sz = 1;
 src_kw_sz = b_sizes[Layout::BatchMatrices::height];
 src_kh_sz = b_sizes[Layout::BatchMatrices::batch];
 } else {
-// skip batch & height dim for boardcasting; index -2
+// skip batch & height dim for broadcasting; index -2
 src_kb_sz = 1;
 src_kw_sz = b_sizes[Layout::BatchMatrices::batch];
 src_kh_sz = 1;

@@ -327,13 +327,13 @@ bool available_check_with_batch(
 weight.size(Layout::BatchMatrices::batch) ||
 bias->size(Layout::BatchMatrices::batch) == 1);
 } else if (bias->ndimension() == 2) {
-// skip batch dim for boardcasting; index -1
+// skip batch dim for broadcasting; index -1
 bias_available &=
 (bias->size(Layout::BatchMatrices::height) ==
 weight.size(Layout::BatchMatrices::width) ||
 bias->size(Layout::BatchMatrices::height) == 1);
 } else {
-// skip batch & height dim for boardcasting; index -2
+// skip batch & height dim for broadcasting; index -2
 bias_available &=
 (bias->size(Layout::BatchMatrices::batch) ==
 weight.size(Layout::BatchMatrices::width) ||

@@ -158,7 +158,7 @@ class TORCH_API Tensor: public TensorBase {
 // will only lead to trouble and dangling references.
 c10::MaybeOwned<Tensor> expect_contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) && = delete;

-// The following overloads are very intruiging. Consider the following
+// The following overloads are very intriguing. Consider the following
 // program:
 //
 // x[1] = 3;

@@ -6894,7 +6894,7 @@ TEST_F(VulkanAPITest, slice_height_success) {
 {2, {2, 3, 40, 50}}, // 4D tensors with dim=height
 {1, {3, 40, 50}}, // 3D tensors with dim=height
 {0, {40, 50}}, // 2D tensors with dim=height
-// 1D tesnors don't have height dim for test
+// 1D tensors don't have height dim for test
 };

 // Act/Assert

@@ -6906,7 +6906,7 @@ TEST_F(VulkanAPITest, slice_feature_success) {
 std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
 {1, {2, 40, 13, 14}}, // 4D tensors with dim=feature(channel)
 {0, {40, 13, 14}}, // 3D tensors with dim=feature(channel)
-// 1D and 2D tesnors don't have feature(channel) dim for test
+// 1D and 2D tensors don't have feature(channel) dim for test
 };

 // Act/Assert

@@ -6917,7 +6917,7 @@ TEST_F(VulkanAPITest, slice_batch_success) {
 // Arrange
 std::unordered_map<int64_t, std::vector<int64_t>> dim2sizes {
 {0, {40, 3, 13, 14}}, // 4D tensors with dim=batch
-// 1D, 2D and 3D tesnors don't have batch dim for test
+// 1D, 2D and 3D tensors don't have batch dim for test
 };

 // Act/Assert