Summary: Things changed in this PR that require review:
test/forward_backward_compatibility/check_forward_backward_compatibility.py:
our previous function overload extension names were wrong and have been
corrected in this PR, hence the compatibility list was updated. This PR also
includes nvfuser code updates with bug fixes for failures we encountered in
OpInfo tests, as well as failures reported by the AOTAutograd team.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/73627
Reviewed By: Chillee
Differential Revision: D34765458
Pulled By: davidberard98
fbshipit-source-id: c81f3d6a1b723fb3a8ba419b7f82227f70440ca7
(cherry picked from commit b6a2c362c37051e44fac31687b2fe272f776551e)
135 lines · 3.9 KiB · C++
#pragma once

#include <ATen/ATen.h>
#include <c10/util/Exception.h>
#include <torch/csrc/jit/ir/ir.h>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

void debugPrint(const c10::TensorTypePtr& type);

bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);
bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);

bool is_cpu_scalar(const at::Tensor& tensor);
bool is_cpu_scalar(const c10::TensorType& tensor_type);

//! Types of debug print-outs
//!
//! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
//!
enum class DebugDumpOption {
  FusionIr, //!< Dump the Fusion IR before lowering
  FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
  KernelIr, //!< Dump the compiler Kernel IR
  CudaKernel, //!< Dump the generated CUDA C++ kernel code
  CudaFull, //!< Dump the complete CUDA C++ code
  CudaToFile, //!< Dump CUDA strings to a file
  LaunchParam, //!< Dump the launch parameters of a kernel
  FusionSegments, //!< Dump the Segmented Fusion Graph
  PrintRuntimeArgs, //!< Print the runtime arguments when launching kernels
  EffectiveBandwidth, //!< Measure kernel performance and print effective
                      //!< bandwidth
  FusionSegmentsDrawing, //!< Dump the Segmented Fusion Graph as a drawing
  PrintPtxasLog, //!< Print the ptxas verbose log, including register usage
  BufferReuseInfo, //!< Dump the analysis details of local/shared buffer reuse
  SchedulerDebug, //!< Dump scheduler heuristic parameters
  ParallelDimensions, //!< Dump known parallel dimensions
  Halo //!< Halo information of tensors
};

TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
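
//! Usage sketch: options are selected via comma-separated tokens in the
//! PYTORCH_NVFUSER_DUMP environment variable. The token spellings below are
//! illustrative only; the authoritative list lives in the option parser in
//! the corresponding .cpp file:
//!
//!  PYTORCH_NVFUSER_DUMP="fusion_ir,cuda_kernel" python script.py
//!
//! Inside the codebase, a dump site is then guarded with:
//!
//!  if (isDebugDumpEnabled(DebugDumpOption::CudaKernel)) {
//!    // print the generated kernel source
//!  }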

// Check if the fallback path should be used, which will dispatch to eager
// mode if any errors are encountered. Helpful for debugging.
bool useFallback();
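
// Usage sketch (illustrative only; runFusion and runEagerFallback are
// hypothetical names, not declared in this header):
//
//  if (useFallback()) {
//    try {
//      runFusion(inputs);
//    } catch (const std::exception& e) {
//      runEagerFallback(inputs); // dispatch to eager mode on failure
//    }
//  }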

// Returns whether unrolling should be disabled for kernels that contain RNG.
bool disableRNGUnrolling();

//! Returns whether index hoisting should be disabled
TORCH_CUDA_CU_API bool disableIndexHoisting();

//! Ceiling integer division
constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}
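
//! For example, ceilDiv(7, 2) == 4 while 7 / 2 == 3; a typical use is sizing
//! a launch grid, e.g. blocks = ceilDiv(numel, threads_per_block). Note that
//! the formula assumes non-negative a and positive b.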

//! Simple mixin for suppressing copy & move operations, ex:
//!
//!  class Foo : public NonCopyable {
//!   ...
//!  };
//!
class NonCopyable {
 public:
  NonCopyable() = default;

  // No copy/move semantics
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
};

//! A generic root for a hierarchy of polymorphic classes:
//! - It ensures virtual destructors
//! - Provides the base->as<Derived>() and node->isA<T>() notation
class PolymorphicBase {
 public:
  virtual ~PolymorphicBase() = default;

  // Replacement for static_cast<T*>(ptr): ptr->as<T>()
  // (checked in DEBUG builds)
  template <class T>
  T* as() {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<T*>(this);
#else
    auto downcast_ptr = dynamic_cast<T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }

  template <class T>
  const T* as() const {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<const T*>(this);
#else
    auto downcast_ptr = dynamic_cast<const T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }

  //! Check if the runtime type is T (or derived from T)
  //!
  //! \note Don't use this for conditional casts. Instead, use:
  //!
  //!  if (auto t = dynamic_cast<T>(p)) { ... }
  //!
  //! instead of:
  //!
  //!  if (p->isA<T>()) { auto t = p->as<T>(); ... }
  //!
  template <class T>
  bool isA() const {
    return dynamic_cast<const T*>(this) != nullptr;
  }
};
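
// Usage sketch for the casting helpers (Stmt is an illustrative derived
// class, not declared in this header):
//
//  PolymorphicBase* node = ...;
//  if (auto* stmt = dynamic_cast<Stmt*>(node)) {
//    ...                          // conditional cast, as recommended above
//  }
//  auto* stmt = node->as<Stmt>(); // unconditional cast, asserted in DEBUG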

template <class T, std::enable_if_t<std::is_enum<T>::value, bool> = true>
constexpr unsigned int switch_pair(T t1, T t2) {
  constexpr unsigned int _WORD_SHIFT = 16;
  return ((unsigned int)t1 << _WORD_SHIFT) + (unsigned int)t2;
}
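
// switch_pair packs two enum values into one integer so a pair of enums can
// be dispatched in a single switch statement. A sketch (Color is an
// illustrative enum, not part of this header; each enum value is assumed to
// fit in 16 bits):
//
//  enum class Color { Red, Blue };
//  switch (switch_pair(lhs, rhs)) {
//    case switch_pair(Color::Red, Color::Blue):
//      ...
//      break;
//  }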

std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type);

} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch