pytorch/torch/csrc/jit/codegen/cuda/utils.h
jjsjann123 b21a6ff639 [NVFuser] Upstream push 0811 (#83239)
Syncing nvfuser devel branch to upstream master. https://github.com/csarofeen/pytorch/

Code changes include:

- codegen improvements:
  1. double support in expression evaluator
- bug fixes:
  1. dropout fix - rework RNG to support broadcasted dropout (Fixes #82784)
  2. expand fix - Patch expand+reduction, expand+view, rework view analysis and guard
- scheduler:
  1. manual transpose schedule example
  2. WIP transpose scheduler

Commits that are in this PR from the devel branch:

```
b7435afcd22c917713c2f41a7237bc26e1183f14 Transpose scheduler, step 1 (#1854)
8a45dbf72034684eb8e18b1835b533e90b68f184 Add an example on how to manually schedule transpose (#1889)
83dbf56a9554b2efbd5416461d938fff477b0b27 Patch dropout fix (#1898)
69d3519a532250719b1aa8341b50e067b181b42d Expand+Reduction, Expand+View support, rework View analysis and guards (#1883)
15091c488e96343bdc49e3990acbf238a3b3da51 Rework RNG to correctly support broadcasted dropout (#1888)
aafe2d048aaac596e503596a41303423619f3954 Make ExpressionEvaluator support Double (#1885)
```

RUN_TORCHBENCH: nvfuser

Differential Revision: [D38657074](https://our.internmc.facebook.com/intern/diff/D38657074)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83239
Approved by: https://github.com/davidberard98
2022-08-25 02:23:22 +00:00

169 lines
5.4 KiB
C++

#pragma once
#include <ATen/ATen.h>
#include <c10/util/Exception.h>
#include <torch/csrc/jit/ir/ir.h>
namespace torch {
namespace jit {
namespace fuser {
namespace cuda {
//! Print a c10::TensorType for debugging (definition lives in the .cpp).
void debugPrint(const c10::TensorTypePtr& type);
//! Presumably true when the tensor type describes a 0-dim (scalar) tensor —
//! implementation not visible here.
bool is_zero_dim_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);
//! Presumably true when some dimension has extent zero (empty tensor) —
//! implementation not visible here.
bool is_zero_sized_tensor(const std::shared_ptr<c10::TensorType>& tensor_type);
//! Check whether a concrete tensor is a CPU-resident scalar.
bool is_cpu_scalar(const at::Tensor& tensor);
//! Overload taking a TensorType instead of a concrete tensor.
bool is_cpu_scalar(const c10::TensorType& tensor_type);
//! Types of debug print-outs
//!
//! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
//!
enum class DebugDumpOption {
  FusionIr, //!< Dump the Fusion IR before lowering
  FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
  KernelIr, //!< Dump the compiler Kernel IR
  ComputeAtMap, //!< Dump the computeAt map
  CudaKernel, //!< Dump the generated CUDA C++ kernel code
  CudaFull, //!< Dump the complete CUDA C++ code
  CudaToFile, //!< Dump CUDA Strings to File
  DebugInfo, //!< Embed line info and debug info to compiled kernel, and dump
             //!< the full CUDA C++ code
  LaunchParam, //!< Dump the Launch parameters of kernel
  FusionSegments, //!< Dump Segmented Fusion Graph
  FusionSegmenterLog, //!< Dump Detailed Segmenter Logging
  FusionArgs, //!< Print the runtime fusion arguments
  KernelArgs, //!< Print the runtime kernel arguments when launching kernels
  EffectiveBandwidth, //!< Measure kernel performance and print effective
                      //!< bandwidth
  FusionSegmentsDrawing, //!< Dump Segmented Fusion Graph (drawing form)
  PrintPtxasLog, //!< Print the ptxas verbose log including register usage
  BufferReuseInfo, //!< Dump the analysis details of local/shared buffer re-use
  SchedulerDebug, //!< Dump scheduler heuristic parameters
  ParallelDimensions, //!< Dump known parallel dimensions
  Halo, //!< Halo information of tensors
  PerfDebugVerbose, //!< When running kernels, print verbose information
                    //!< associated with what's running
  TransformPropagator, //!< When running TransformPropagator, print propagation
                       //!< path and replay result
  InlinePropagator //!< When running InlinePropagator, print propagation
                   //!< path and inlining result
};

//! Query whether the given dump option was requested (presumably parsed from
//! `PYTORCH_NVFUSER_DUMP` — implementation not visible here).
TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
//! Types of features to disable
//!
//! These can be set through the `PYTORCH_NVFUSER_DISABLE` environment variable
//!
enum class DisableOption {
  ArchCheck, //!< Disable hardware-specific checks to enable cross arch debug
  Fallback, //!< Disable fallback
  Fma, //!< Disable FMA instructions
  IndexHoist, //!< Disable index hoisting
  Nvtx, //!< Disable NVTX instrumentation
  PredicateElimination, //!< Disable predicate elimination
  UnrollWithRng //!< Disable unrolling for kernels with RNG in them
};

//! Query whether the given feature was disabled (presumably parsed from
//! `PYTORCH_NVFUSER_DISABLE` — implementation not visible here).
TORCH_CUDA_CU_API bool isOptionDisabled(DisableOption option);
//! Types of features to enable
//!
//! These can be set through the `PYTORCH_NVFUSER_ENABLE` environment variable
//!
enum class EnableOption {
  Complex, //!< Enable complex support on python
  KernelProfile, //!< Enable intra-kernel performance profiling
  LinearDecomposition, //!< Enable linear-bias decomposition
  ConvDecomposition //!< Enable conv-bias decomposition
};

//! Query whether the given feature was enabled (presumably parsed from
//! `PYTORCH_NVFUSER_ENABLE` — implementation not visible here).
TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
//! Check if fallback path should be used, which will dispatch to eager mode if
//! any errors are encountered. Helpful for debugging.
bool useFallback();
//! Integer division whose quotient is rounded up instead of truncated.
//!
//! NOTE(review): assumes b > 0 and that (a + b - 1) does not overflow
//! int64_t — neither condition is checked.
constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  // Bias the numerator so that truncating division rounds toward +infinity.
  const int64_t biased = a + b - 1;
  return biased / b;
}
//! Mixin that suppresses copy (and, by extension, move) semantics for any
//! class inheriting from it, ex:
//!
//!   class Foo : public NonCopyable {
//!     ...
//!   };
//!
class NonCopyable {
 public:
  NonCopyable() = default;

  // Deleting the copy operations also prevents the compiler from implicitly
  // declaring move operations, so derived objects cannot be moved either.
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
};
//! A generic root for a hierarchy of polymorphic classes:
//! - It ensures virtual destructors
//! - Provides the base->as<Derived>() and node->isA<T>() notation
class PolymorphicBase {
 public:
  virtual ~PolymorphicBase() = default;

  //! Replacement for static_cast<T*>(ptr): ptr->as<T>().
  //! Debug builds (NDEBUG undefined) verify the downcast via dynamic_cast;
  //! release builds use an unchecked static_cast.
  template <class T>
  T* as() {
#ifdef NDEBUG
    auto result = static_cast<T*>(this);
#else
    auto result = dynamic_cast<T*>(this);
    TORCH_INTERNAL_ASSERT(result != nullptr);
#endif
    return result;
  }

  //! Const overload of as<T>() with the same debug/release behavior.
  template <class T>
  const T* as() const {
#ifdef NDEBUG
    auto result = static_cast<const T*>(this);
#else
    auto result = dynamic_cast<const T*>(this);
    TORCH_INTERNAL_ASSERT(result != nullptr);
#endif
    return result;
  }

  //! Check if the runtime type is T (or derived from T)
  //!
  //! \note Don't use this for conditional casts. Instead, use:
  //!
  //!  if (auto t = dynamic_cast<T>(p)) { ... }
  //!
  //! instead of:
  //!
  //!  if (p->isA<T>()) { auto t = p->as<T>(); ... }
  //!
  template <class T>
  bool isA() const {
    return dynamic_cast<const T*>(this) != nullptr;
  }
};
//! Pack two enumerators of the same enum into one unsigned int, so a pair of
//! enum values can drive a single `switch` statement. The first value lands
//! in the high 16 bits and the second in the low 16 bits; the underlying
//! values are assumed to fit in 16 bits each (not checked).
template <class T, std::enable_if_t<std::is_enum<T>::value, bool> = true>
constexpr unsigned int switch_pair(T t1, T t2) {
  // Renamed from `_WORD_SHIFT`: identifiers starting with an underscore
  // followed by an uppercase letter are reserved to the implementation.
  constexpr unsigned int kWordShift = 16;
  // static_cast replaces the original C-style casts; behavior is identical.
  return (static_cast<unsigned int>(t1) << kWordShift) +
      static_cast<unsigned int>(t2);
}
std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type);
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch