mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 00:21:07 +01:00
This PR does the following: - Replaces the `FusionOwner` with a `FusionCache` and `FusionInterface`. The `FusionCache` is a singleton that contains a cache of Fusions based on the `FusionDefinition`. It replaces the TorchScript graph caching that looked up a Fusion based on a stringified and canonicalized representation of the TorchScript graph with a prefix tree of statements in the `FusionDefinition`. The `FusionInterface` is an object that represents a Fusion in python. It can also query the cache based on id. - The ability to print out a mechanically derived definition, in python, for the user to use when debugging was added. - Replaces the python `examples` directory with true python tests under `test/test_nvfuser_frontend.py`. - Adds a set of C++ tests under the `test` directory to verify the `FusionCache`, `FusionDefinition`, and parts of the `RecordFunctor` child classes. - Adds a README file to explain how to use the Python Frontend While there are 3,000+ line edits, the bulk of the changes were repetitive line changes to the python bindings for each operation. Pull Request resolved: https://github.com/pytorch/pytorch/pull/83267 Approved by: https://github.com/jjsjann123, https://github.com/davidberard98
172 lines
5.5 KiB
C++
#pragma once
|
|
|
|
#include <ATen/ATen.h>
|
|
#include <c10/util/Exception.h>
|
|
#include <torch/csrc/jit/ir/ir.h>
|
|
|
|
namespace torch {
|
|
namespace jit {
|
|
namespace fuser {
|
|
namespace cuda {
|
|
|
|
//! Print debugging details of a tensor type (implementation in the .cpp).
void debugPrint(const c10::TensorTypePtr& type);

//! Whether tensor_type describes a zero-dimensional (scalar) tensor.
//! NOTE(review): semantics inferred from the name — confirm in the .cpp.
bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);

//! Whether tensor_type has a dimension of extent 0 (i.e. zero elements).
//! NOTE(review): semantics inferred from the name — confirm in the .cpp.
bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);

//! Whether `tensor` is a scalar tensor resident on the CPU.
//! NOTE(review): semantics inferred from the name — confirm in the .cpp.
bool is_cpu_scalar(const at::Tensor& tensor);

//! Overload of is_cpu_scalar operating on a TensorType description
//! instead of a concrete tensor.
bool is_cpu_scalar(const c10::TensorType& tensor_type);
|
//! Types of debug print-outs
//!
//! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
//!
enum class DebugDumpOption {
  FusionIr, //!< Dump the Fusion IR before lowering
  FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
  FusionIrPresched, //!< Dump the Fusion IR before it is scheduled.
  KernelIr, //!< Dump the compiler Kernel IR
  ComputeAtMap, //!< Dump the computeAt map
  CudaKernel, //!< Dump the generated CUDA C++ kernel code
  CudaFull, //!< Dump the complete CUDA C++ code
  CudaToFile, //!< Dump CUDA Strings to File
  DebugInfo, //!< Embed line info and debug info to compiled kernel, and dump
             //!< the full CUDA C++ code
  LaunchParam, //!< Dump the Launch parameters of kernel
  FusionSegments, //!< Dump Segmented Fusion Graph
  FusionSegmenterLog, //!< Dump Detailed Segmenter Logging
  FusionArgs, //!< Print the runtime fusion arguments
  KernelArgs, //!< Print the runtime kernel arguments when launching kernels
  EffectiveBandwidth, //!< Measure kernel performance and print effective
                      //!< bandwidth
  FusionSegmentsDrawing, //!< Dump Segmented Fusion Graph
  PrintPtxasLog, //!< Print the ptxas verbose log including register usage
  BufferReuseInfo, //!< Dump the analysis details of local/shared buffer re-use
  SchedulerDebug, //!< Dump scheduler heuristic parameters
  ParallelDimensions, //!< Dump known parallel dimensions
  Halo, //!< Halo information of tensors
  PerfDebugVerbose, //!< When running kernels, print verbose information
                    //!< associated with what's running
  PythonDefinition, //!< Python Frontend Fusion Definition.
  PythonFrontendDebug, //!< Python Frontend debug information.
  TransformPropagator, //!< When running TransformPropagator, print propagation
                       //!< path and replay result
  InlinePropagator //!< When running InlinePropagator, print propagation
                   //!< path and inlining result
};

//! Returns true if `option` was requested via `PYTORCH_NVFUSER_DUMP`.
TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
|
//! Types of features to disable
//!
//! These can be set through the `PYTORCH_NVFUSER_DISABLE` environment variable
//!
enum class DisableOption {
  ArchCheck, //!< Disable hardware-specific checks to enable cross arch debug
  Fallback, //!< Disable fallback
  Fma, //!< Disable FMA instructions
  IndexHoist, //!< Disable index hoisting
  Nvtx, //!< Disable NVTX instrumentation
  PredicateElimination, //!< Disable predicate elimination
  UnrollWithRng //!< Disable unrolling for kernels with RNG in them
};

//! Returns true if `option` was requested via `PYTORCH_NVFUSER_DISABLE`.
TORCH_CUDA_CU_API bool isOptionDisabled(DisableOption option);
|
//! Types of features to enable
//!
//! These can be set through the `PYTORCH_NVFUSER_ENABLE` environment variable
//!
enum class EnableOption {
  Complex, //!< Enable complex support on python
  KernelProfile, //!< Enable intra-kernel performance profiling
  LinearDecomposition, //!< Enable linear-bias decomposition
  ConvDecomposition //!< Enable conv-bias decomposition
};

//! Returns true if `option` was requested via `PYTORCH_NVFUSER_ENABLE`.
TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
|
// Check if fallback path should be used which will dispatch to eagermode if any
// errors are encountered. Helpful for debugging.
// NOTE(review): presumably consults DisableOption::Fallback — confirm against
// the definition in the .cpp.
bool useFallback();
|
//! Ceil integer division: divides `a` by `b`, rounding the quotient toward
//! positive infinity for positive operands (e.g. ceilDiv(10, 3) == 4).
constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  // Bias the numerator so that truncating division rounds up.
  const int64_t biased = a + b - 1;
  return biased / b;
}
|
//! Simple mixin for suppressing copy & move operations, ex:
//!
//!  class Foo : public NonCopyable {
//!   ...
//!  };
//!
class NonCopyable {
 public:
  NonCopyable() = default;

  // No copy/move semantics.
  //
  // The deleted copy operations alone already suppress implicit move
  // generation, but the moves are deleted explicitly so the class matches
  // its stated contract and compiler diagnostics name the right special
  // member (Rule of Five).
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
  NonCopyable(NonCopyable&&) = delete;
  NonCopyable& operator=(NonCopyable&&) = delete;
};
|
//! A generic root for a hierarchy of polymorphic classes:
//! - It ensures virtual destructors
//! - Provides the base->as<Derived>() and node->isA<T>() notation
class PolymorphicBase {
 public:
  virtual ~PolymorphicBase() = default;

  //! Replacement for static_cast<T*>(ptr): ptr->as<T>().
  //!
  //! Release builds (NDEBUG defined) compile to a plain static_cast; debug
  //! builds verify the downcast with dynamic_cast and assert on mismatch.
  template <class T>
  T* as() {
#ifdef NDEBUG
    T* derived = static_cast<T*>(this);
#else
    T* derived = dynamic_cast<T*>(this);
    TORCH_INTERNAL_ASSERT(derived != nullptr);
#endif
    return derived;
  }

  //! Const overload of as<T>() — same checked-in-debug-builds semantics.
  template <class T>
  const T* as() const {
#ifdef NDEBUG
    const T* derived = static_cast<const T*>(this);
#else
    const T* derived = dynamic_cast<const T*>(this);
    TORCH_INTERNAL_ASSERT(derived != nullptr);
#endif
    return derived;
  }

  //! Check if the runtime time is T (or derived from T)
  //!
  //! \note Don't use this for conditional casts. Instead, use:
  //!
  //!  if (auto t = dynamic_cast<T>(p)) { ... }
  //!
  //! instead of:
  //!
  //!  if (p->isA<T>()) { auto t = p->as<T>(); ... }
  //!
  template <class T>
  bool isA() const {
    return dynamic_cast<const T*>(this) != nullptr;
  }
};
|
//! Pack two values of the same enum type into one unsigned int so a pair of
//! enums can drive a single switch statement: the high 16 bits hold t1, the
//! low 16 bits hold t2.
//!
//! Assumes each enumerator value fits in 16 bits; wider values would overlap
//! (not checked here).
template <class T, std::enable_if_t<std::is_enum<T>::value, bool> = true>
constexpr unsigned int switch_pair(T t1, T t2) {
  // Renamed from `_WORD_SHIFT`: identifiers beginning with an underscore
  // followed by an uppercase letter are reserved for the implementation
  // ([lex.name]). Also replaces C-style casts with static_cast.
  constexpr unsigned int kWordShift = 16;
  return (static_cast<unsigned int>(t1) << kWordShift) +
      static_cast<unsigned int>(t2);
}
|
|
//! Extract the sizes of `tensor_type` as a vector of int64_t.
//! NOTE(review): implementation not visible here — presumably requires the
//! type to carry concrete sizes; confirm behavior for symbolic/unknown dims
//! against the definition in the .cpp.
std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type);
|
} // namespace cuda
|
|
} // namespace fuser
|
|
} // namespace jit
|
|
} // namespace torch
|