mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Syncing nvfuser devel branch to upstream master. https://github.com/csarofeen/pytorch/ A few bigger updates: 1. Initial support of cp.async and cp.async.wait: https://github.com/csarofeen/pytorch/pull/1619 2. Emulate ampere's mma 16816 with Turing's mma 1688, for a unified interface: https://github.com/csarofeen/pytorch/pull/1643 3. Extending the infrastructure to support mma operators on turing and ampere arch: https://github.com/csarofeen/pytorch/pull/1440 Commits that's actually in this PR from the csarofeen branch ``` * dd2325294e236c5082c642819a1103bcfe4561a3 (csarofeen/devel) Fusion Segmenter: Unify single kernel and multi-kernel runtime path (#1710) * b3d1c3f446355a2d276bac8272e7aa8b5bb6b1f0 Fix missing cooperative launch (#1726) * dc670a226cbe52be46cecef47001f38bf9a09433 Async gmem copy support on sm80+ (#1619) * 5e6a8dab5a71aefe0548bbfa15d1a93c556d23fe Add turing mma support and test (#1643) * d6d6b7d3f10dd91dafa4cdbd5e460bbb38173af4 Fix rFactor when there are indirect root domain(s), and refactor (#1723) * 7093e39150c6d80e0f9f767d56654714a2e8a927 Mma op integration on ampere (#1440) * fade8da55e60a118c5595378896d34b862b2fcc3 patch python test for bfloat16 (#1724) * 8fbd0b18743a72ac10478857c3d2351204375685 Fine-grained kernel profiling (#1720) * 77c1b4fa633f9e631d267923f4537336fa328939 Adding dry run mode to skip arch dependent checks (#1702) * 151d95b97bebefc94199bb4a53423ede32b55451 More precise concretization analysis (#1719) * f4d3630ed54d7069dd377a64be1f91013b285b66 Enable complex python tests (#1667) * 4ceeee509774cc2ce6c834a4dc1e313f71d94503 Minor bugfix in transform_rfactor.cpp (#1715) * 3675c70faf218e86d2c78dbd3874b175a3b0a203 Separate root domain and rfactor domain in TransformPrinter (#1716) * f68b830d5def65dadfe29d4edf52fc703369c84a Fix scheduling with polymorphic broadcast (#1714) * 4ab5ef7ae2cfd8fffad1e1d882ae7c50631211dc updating_ci_machine (#1718) * 56585c58b1ff338704cafb0cd6be2b3d536bed5a Merge pull request #1711 from 
csarofeen/upstream_master_bump_0517 * 174d453d3be0c11a5acb0fff3b3f36e19cfdaf81 Allow using nvFuser on CUDA extension (#1701) * 18bee67495454b9a79625799776e746bd5e81c4c Validate LOOP concrete IDs have complete IterDomains (#1676) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/78244 Approved by: https://github.com/csarofeen, https://github.com/malfet
161 lines
4.9 KiB
C++
161 lines
4.9 KiB
C++
#pragma once
|
|
|
|
#include <ATen/ATen.h>
|
|
#include <c10/util/Exception.h>
|
|
#include <torch/csrc/jit/ir/ir.h>
|
|
|
|
namespace torch {
|
|
namespace jit {
|
|
namespace fuser {
|
|
namespace cuda {
|
|
|
|
//! Print details of a tensor type for debugging.
//! (Declaration only — see the corresponding .cpp for the exact output.)
void debugPrint(const c10::TensorTypePtr& type);

//! True if the tensor type describes a rank-0 (scalar) tensor —
//! NOTE(review): semantics inferred from the name; confirm in the definition.
bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);

//! True if the tensor type describes a tensor with zero elements (some
//! extent is 0) — NOTE(review): confirm against the definition.
bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);

//! True if `tensor` is a scalar tensor resident on the CPU —
//! NOTE(review): semantics inferred from the name; verify in the .cpp.
bool is_cpu_scalar(const at::Tensor& tensor);

//! Overload of is_cpu_scalar taking a TensorType instead of a live tensor.
bool is_cpu_scalar(const c10::TensorType& tensor_type);
|
|
|
|
//! Types of debug print-outs
//!
//! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
//!
enum class DebugDumpOption {
  FusionIr, //!< Dump the Fusion IR before lowering
  FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
  KernelIr, //!< Dump the compiler Kernel IR
  ComputeAtMap, //!< Dump the computeAt map
  CudaKernel, //!< Dump the generated CUDA C++ kernel code
  CudaFull, //!< Dump the complete CUDA C++ code
  CudaToFile, //!< Dump CUDA Strings to File
  LaunchParam, //!< Dump the Launch parameters of kernel
  FusionSegments, //!< Dump Segmented Fusion Graph
  FusionSegmenterLog, //!< Dump Detailed Segmenter Logging
  FusionArgs, //!< Print the runtime fusion arguments
  KernelArgs, //!< Print the runtime kernel arguments when launching kernels
  EffectiveBandwidth, //!< Measure kernel performance and print effective
                      //!< bandwidth
  FusionSegmentsDrawing, //!< Dump Segmented Fusion Graph (presumably a
                         //!< rendered/drawn form — confirm in the .cpp)
  PrintPtxasLog, //!< Print the ptxas verbose log including register usage
  BufferReuseInfo, //!< Dump the analysis details of local/shared buffer re-use
  SchedulerDebug, //!< Dump scheduler heuristic parameters
  ParallelDimensions, //!< Dump known parallel dimensions
  Halo, //!< Halo information of tensors
  PerfDebugVerbose //!< When running kernels, print verbose information
                   //!< associated with what's running
};

//! Query whether a given dump option was requested (via the
//! `PYTORCH_NVFUSER_DUMP` environment variable documented above).
TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
|
|
|
|
//! Types of features to disable
//!
//! These can be set through the `PYTORCH_NVFUSER_DISABLE` environment variable
//!
enum class DisableOption {
  ArchCheck, //!< Disable hardware-specific checks to enable cross arch debug
  Fallback, //!< Disable fallback
  Fma, //!< Disable FMA instructions
  IndexHoist, //!< Disable index hoisting
  Nvtx, //!< Disable NVTX instrumentation
  PredicateElimination, //!< Disable predicate elimination
  UnrollWithRng //!< Disable unrolling for kernels with RNG in them
};

//! Query whether a given feature was disabled (via the
//! `PYTORCH_NVFUSER_DISABLE` environment variable documented above).
TORCH_CUDA_CU_API bool isDisabled(DisableOption option);
|
|
|
|
//! Types of features to enable
//!
//! These can be set through the `PYTORCH_NVFUSER_ENABLE` environment variable
//!
enum class EnableOption {
  Complex, //!< Enable complex support on python
  KernelProfile //!< Enable intra-kernel performance profiling
};

//! Query whether a given feature was enabled (via the
//! `PYTORCH_NVFUSER_ENABLE` environment variable documented above).
TORCH_CUDA_CU_API bool isEnabled(EnableOption option);
|
|
|
|
// Check if fallback path should be used, which will dispatch to eager mode if
// any errors are encountered. Helpful for debugging.
// NOTE(review): presumably tied to DisableOption::Fallback above — confirm
// against the definition in the .cpp.
bool useFallback();
|
|
|
|
//! Ceil integer division: the smallest integer q such that q * b >= a
//! (for b > 0), i.e. a / b rounded toward +infinity.
//!
//! Unlike the common `(a + b - 1) / b` formulation, this cannot overflow
//! when `a` is near INT64_MAX, and it also rounds correctly for negative
//! operands (C++ integer division truncates toward zero).
//!
//! Precondition: b != 0.
constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  // Truncated quotient, bumped by one when there is a nonzero remainder and
  // the operands have the same sign (the exact quotient was positive and got
  // truncated down).
  return a / b + ((a % b != 0 && (a < 0) == (b < 0)) ? 1 : 0);
}
|
|
|
|
//! Simple mixin for suppressing copy & move operations, ex:
//!
//!  class Foo : public NonCopyable {
//!   ...
//!  };
//!
class NonCopyable {
 public:
  NonCopyable() = default;

  // No copy/move semantics. The move operations are deleted explicitly:
  // they would already be suppressed implicitly by the deleted copy
  // constructor, but spelling them out documents the intent and yields
  // clearer compiler diagnostics (Rule of Five).
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
  NonCopyable(NonCopyable&&) = delete;
  NonCopyable& operator=(NonCopyable&&) = delete;
};
|
|
|
|
//! A generic root for a hierarchy of polymorphic classes:
//! - It ensures virtual destructors
//! - Provides the base->as<Derived>() and node->isA<T>() notation
class PolymorphicBase {
 public:
  virtual ~PolymorphicBase() = default;

  //! Replacement for static_cast<T*>(ptr): ptr->as<T>()
  //! In release (NDEBUG) builds this is a plain static_cast; in debug builds
  //! the downcast is checked with dynamic_cast and asserted non-null.
  template <class T>
  T* as() {
#if defined(NDEBUG)
    auto result = static_cast<T*>(this);
#else
    auto result = dynamic_cast<T*>(this);
    TORCH_INTERNAL_ASSERT(result != nullptr);
#endif
    return result;
  }

  //! Const overload of as<T>(), with the same debug-build checking.
  template <class T>
  const T* as() const {
#if defined(NDEBUG)
    auto result = static_cast<const T*>(this);
#else
    auto result = dynamic_cast<const T*>(this);
    TORCH_INTERNAL_ASSERT(result != nullptr);
#endif
    return result;
  }

  //! Check if the runtime type is T (or derived from T)
  //!
  //! \note Don't use this for conditional casts. Instead, use:
  //!
  //!  if (auto t = dynamic_cast<T>(p)) { ... }
  //!
  //! instead of:
  //!
  //!  if (p->isA<T>()) { auto t = p->as<T>(); ... }
  //!
  template <class T>
  bool isA() const {
    return dynamic_cast<const T*>(this) != nullptr;
  }
};
|
|
|
|
//! Pack two enum values of the same type into one unsigned int (first value
//! in the high 16 bits, second in the low 16 bits), so a pair of enums can
//! drive a single `switch` statement. Both underlying values are expected to
//! fit in 16 bits.
template <class T, std::enable_if_t<std::is_enum<T>::value, bool> = true>
constexpr unsigned int switch_pair(T t1, T t2) {
  // Renamed from `_WORD_SHIFT`: identifiers starting with an underscore
  // followed by an uppercase letter are reserved to the implementation.
  constexpr unsigned int kWordShift = 16;
  // static_cast replaces the original C-style casts; same conversion.
  return (static_cast<unsigned int>(t1) << kWordShift) +
      static_cast<unsigned int>(t2);
}
|
|
|
|
//! Extract the sizes of a tensor type as a plain vector —
//! NOTE(review): presumably requires all dimensions to be concrete; confirm
//! against the definition in the .cpp.
std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type);
|
|
|
|
} // namespace cuda
|
|
} // namespace fuser
|
|
} // namespace jit
|
|
} // namespace torch
|