pytorch/torch/csrc/jit/codegen/cuda/utils.h
jiej 2d110d514f Nvfuser code bump 2_1_2022 (#72127)
Summary:
Things changed in this PR that require review:
1. aten/src/ATen/core/interned_strings.h
2. torch/csrc/jit/ir/alias_analysis.h : exposing createValue to allow efficient mutation
3. torch/csrc/jit/runtime/symbolic_shape_registry.cpp : added gelu/tanh/erf in registry
4. torch/jit/_script.py : throws an error when scripting a model that uses autocast as a decorator, since that's not supported

nvfuser code update:
1. codegen improvements and performance tuning
2. integration bug fixes for shape expression logic
3. kernel segmentation update to address perf regression from horizontal fusion
4. scalar cpu tensor promotion to support inter-device operation between cpu scalar tensor and cuda tensor

Things reverted from local changes:
aten::gelu with approximation (tracked in PR: https://github.com/pytorch/pytorch/pull/61439)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/72127

Reviewed By: HamidShojanazeri

Differential Revision: D34113233

Pulled By: jbschlosser

fbshipit-source-id: b82cde32b71e324eca0ea57cb8c9f9647278ca74
(cherry picked from commit e009bc5c4e)
2022-02-15 00:43:16 +00:00

#pragma once

#include <ATen/ATen.h>
#include <c10/util/Exception.h>
#include <torch/csrc/jit/ir/ir.h>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

void debugPrint(const c10::TensorTypePtr& type);

bool is_cpu_scalar(const at::Tensor& tensor);
bool is_cpu_scalar(const c10::TensorType& tensor_type);
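
// A sketch of the intended use (not part of this header's contract): these
// helpers identify 0-dim CPU tensors so they can be promoted inside
// otherwise-CUDA fusions (see the "scalar cpu tensor promotion" item in the
// commit message above). For example:
//
//   at::Tensor s = at::scalar_tensor(2.0); // 0-dim tensor on the CPU
//   bool scalar = is_cpu_scalar(s);        // expected to return true
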
//! Types of debug print-outs
//!
//! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
//!
enum class DebugDumpOption {
  FusionIr, //!< Dump the Fusion IR before lowering
  FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
  KernelIr, //!< Dump the compiler Kernel IR
  CudaKernel, //!< Dump the generated CUDA C++ kernel code
  CudaFull, //!< Dump the complete CUDA C++ code
  CudaToFile, //!< Dump CUDA strings to file
  LaunchParam, //!< Dump the launch parameters of the kernel
  FusionSegments, //!< Dump the segmented fusion graph
  PrintRuntimeArgs, //!< Print the runtime arguments when launching kernels
  EffectiveBandwidth, //!< Measure kernel performance and print effective
                      //!< bandwidth
  FusionSegmentsDrawing, //!< Draw the segmented fusion graph
  PrintPtxasLog, //!< Print the verbose ptxas log, including register usage
  BufferReuseInfo, //!< Dump the analysis details of local/shared buffer reuse
  SchedulerDebug, //!< Dump scheduler heuristic parameters
  ParallelDimensions, //!< Dump known parallel dimensions
  Halo //!< Halo information of tensors
};
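
// Example (a sketch): dump options are requested through the
// PYTORCH_NVFUSER_DUMP environment variable as a comma-separated list (the
// exact token strings are defined in the matching .cpp), and code guards its
// debug output like so:
//
//   if (isDebugDumpEnabled(DebugDumpOption::CudaKernel)) {
//     std::cout << kernel_string << std::endl; // kernel_string: hypothetical
//   }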
TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
// Check if the fallback path should be used; it dispatches to eager mode if
// any errors are encountered. Helpful for debugging.
bool useFallback();
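
// A sketch of the intended dispatch pattern (compileAndRunFused and
// runEagerFallback are hypothetical helpers):
//
//   try {
//     compileAndRunFused(fusion, inputs);
//   } catch (const std::exception&) {
//     if (!useFallback()) {
//       throw;
//     }
//     runEagerFallback(inputs); // recover by dispatching to eager mode
//   }
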
// Returns whether unrolling should be disabled for kernels that contain RNG
// ops.
bool disableRNGUnrolling();
//! Ceil integer division
constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}
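
// Worked examples: ceilDiv(10, 4) == 3 (13 / 4 truncates to 3) and
// ceilDiv(12, 4) == 3 (15 / 4 truncates to 3). Being constexpr, it can be
// verified at compile time:
//
//   static_assert(ceilDiv(10, 4) == 3, "");
//   static_assert(ceilDiv(12, 4) == 3, "");
//
// Note that the formula assumes a >= 0 and b > 0.
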
//! Simple mixin for suppressing copy & move operations, ex:
//!
//!   class Foo : public NonCopyable {
//!     ...
//!   };
//!
class NonCopyable {
 public:
  NonCopyable() = default;

  // No copy/move semantics
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
};
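
// Note: because the copy operations are user-declared (even though deleted),
// the compiler does not implicitly generate move operations either, so moves
// are disabled as well. For a hypothetical Foo derived from NonCopyable:
//
//   Foo a;
//   Foo b = std::move(a); // error: selects the deleted copy constructor
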
//! A generic root for a hierarchy of polymorphic classes:
//! - It ensures virtual destructors
//! - Provides the base->as<Derived>() and node->isA<T>() notation
class PolymorphicBase {
 public:
  virtual ~PolymorphicBase() = default;

  // Replacement for static_cast<T*>(ptr): ptr->as<T>()
  // (checked in DEBUG builds)
  template <class T>
  T* as() {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<T*>(this);
#else
    auto downcast_ptr = dynamic_cast<T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }

  template <class T>
  const T* as() const {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<const T*>(this);
#else
    auto downcast_ptr = dynamic_cast<const T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }

  //! Check if the runtime type is T (or derived from T)
  //!
  //! \note Don't use this for conditional casts. Instead, use:
  //!
  //!   if (auto t = dynamic_cast<T>(p)) { ... }
  //!
  //! instead of:
  //!
  //!   if (p->isA<T>()) { auto t = p->as<T>(); ... }
  //!
  template <class T>
  bool isA() const {
    return dynamic_cast<const T*>(this) != nullptr;
  }
};
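
// A minimal usage sketch (Node and AddNode are hypothetical types):
//
//   class Node : public PolymorphicBase {};
//   class AddNode : public Node {};
//
//   Node* n = new AddNode();
//   if (n->isA<AddNode>()) {           // runtime type query
//     AddNode* add = n->as<AddNode>(); // downcast, checked in debug builds
//   }
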
template <class T, std::enable_if_t<std::is_enum<T>::value, bool> = true>
constexpr unsigned int switch_pair(T t1, T t2) {
  constexpr unsigned int _WORD_SHIFT = 16;
  return ((unsigned int)t1 << _WORD_SHIFT) + (unsigned int)t2;
}
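
// Because switch_pair is constexpr, it can appear in case labels, giving a
// compact dispatch over a pair of enum values (Color is a hypothetical enum;
// each value must fit in 16 bits for the packing to be collision-free):
//
//   enum class Color { Red, Blue };
//   // lhs and rhs are runtime Color values
//   switch (switch_pair(lhs, rhs)) {
//     case switch_pair(Color::Red, Color::Blue):
//       // handle the (Red, Blue) combination
//       break;
//   }
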
std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type);
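
// A sketch of the expected behavior (assuming the TensorType carries complete
// sizes, e.g. from profiling): for a tensor typed as [2, 3, 4],
//
//   std::vector<int64_t> sizes = getTensorSizes(tensor_type); // {2, 3, 4}
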
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch