mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary:
Things changed in this PR that require review:
1. aten/src/ATen/core/interned_strings.h
2. torch/csrc/jit/ir/alias_analysis.h : exposing createValue to allow efficient mutation
3. torch/csrc/jit/runtime/symbolic_shape_registry.cpp : added gelu/tanh/erf in registry
4. torch/jit/_script.py : throws when scripting a model that uses autocast as a decorator, since that is not supported
nvfuser code update:
1. codegen improvements and performance tuning
2. integration bug fixes for shape expression logic
3. kernel segmentation update to address perf regression from horizontal fusion
4. scalar cpu tensor promotion to support inter-device operation between cpu scalar tensor and cuda tensor
Things reverted from local changes:
aten::gelu with approximation (tracked in PR: https://github.com/pytorch/pytorch/pull/61439)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72127
Reviewed By: HamidShojanazeri
Differential Revision: D34113233
Pulled By: jbschlosser
fbshipit-source-id: b82cde32b71e324eca0ea57cb8c9f9647278ca74
(cherry picked from commit e009bc5c4e)
129 lines
3.7 KiB
C++
129 lines
3.7 KiB
C++
#pragma once
|
|
|
|
#include <ATen/ATen.h>
|
|
#include <c10/util/Exception.h>
|
|
#include <torch/csrc/jit/ir/ir.h>
|
|
|
|
namespace torch {
|
|
namespace jit {
|
|
namespace fuser {
|
|
namespace cuda {
|
|
|
|
//! Debug helper operating on a tensor type.
//!
//! NOTE(review): declaration only; going by the name, this presumably prints
//! a description of `type` for debugging. Confirm against the definition what
//! is printed and to which stream.
void debugPrint(const c10::TensorTypePtr& type);
|
|
|
|
//! Predicate on a concrete tensor.
//!
//! NOTE(review): declaration only; going by the name, presumably true when
//! `tensor` is a scalar tensor that lives on the CPU. Confirm the exact
//! criteria against the definition.
bool is_cpu_scalar(const at::Tensor& tensor);
|
|
//! Overload of is_cpu_scalar taking a tensor type instead of a tensor.
//!
//! NOTE(review): declaration only; presumably answers the same question from
//! type information alone. Confirm how an incomplete type (unknown device or
//! shape) is treated.
bool is_cpu_scalar(const c10::TensorType& tensor_type);
|
|
|
|
//! Types of debug print-outs
//!
//! These can be set through the `PYTORCH_NVFUSER_DUMP` environment variable
//!
//! Every enumerator is documented with a trailing `//!<` comment so that the
//! description is attached to the enumerator on the same line.
enum class DebugDumpOption {
  FusionIr, //!< Dump the Fusion IR before lowering
  FusionIrMath, //!< Dump just the compute (math) part of the Fusion IR
  KernelIr, //!< Dump the compiler Kernel IR
  CudaKernel, //!< Dump the generated CUDA C++ kernel code
  CudaFull, //!< Dump the complete CUDA C++ code
  CudaToFile, //!< Dump CUDA Strings to File
  LaunchParam, //!< Dump the Launch parameters of kernel
  FusionSegments, //!< Dump Segmented Fusion Graph
  PrintRuntimeArgs, //!< Print the runtime arguments when launching kernels
  EffectiveBandwidth, //!< Measure kernel performance and print effective
                      //!< bandwidth
  // NOTE(review): the description below is the same as FusionSegments';
  // presumably this one produces a drawing of the graph -- confirm.
  FusionSegmentsDrawing, //!< Dump Segmented Fusion Graph
  PrintPtxasLog, //!< Print the ptxas verbose log including register usage
  BufferReuseInfo, //!< Dump the analysis details of local/shared buffer re-use
  SchedulerDebug, //!< Dump scheduler heuristic parameters
  ParallelDimensions, //!< Dump known parallel dimensions
  Halo //!< Halo information of tensors
};
|
|
|
|
//! Queries whether the given debug dump `option` is enabled.
//!
//! NOTE(review): declaration only; presumably reflects the options selected
//! through the `PYTORCH_NVFUSER_DUMP` environment variable mentioned on
//! DebugDumpOption. Confirm parsing and caching behavior in the definition.
TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
|
|
|
|
// Check if the fallback path should be used, which will dispatch to eager
// mode if any errors are encountered. Helpful for debugging.
//
// NOTE(review): declaration only; how the setting is configured is not
// visible here -- confirm against the definition.
bool useFallback();
|
|
|
|
// Returns true if unrolling should not be used for kernels with RNG in them.
//
// NOTE(review): declaration only; how the setting is configured is not
// visible here -- confirm against the definition.
bool disableRNGUnrolling();
|
|
|
|
//! Ceil integer division
//!
//! Returns the smallest integer that is not less than the exact quotient
//! `a / b`.
//!
//! The result is derived from the truncated quotient and the remainder
//! instead of `(a + b - 1) / b`, so the intermediate sum cannot overflow for
//! large `a`, and operands of either sign are rounded up correctly. For
//! non-negative `a` and positive `b` the result is the same as that formula.
//!
//! \param a dividend
//! \param b divisor; must be non-zero (and `a / b` itself must be
//!          representable, i.e. not INT64_MIN / -1)
constexpr int64_t ceilDiv(int64_t a, int64_t b) {
  // Integer division truncates toward zero, which already is the ceiling
  // whenever the exact quotient is negative. Only an inexact, positive
  // quotient (non-zero remainder carrying the same sign as the divisor) has to
  // be bumped up by one.
  return a / b + ((a % b != 0 && ((a % b > 0) == (b > 0))) ? 1 : 0);
}
|
|
|
|
//! Simple mixin for suppressing copy & move operations, ex:
//!
//!  class Foo : public NonCopyable {
//!   ...
//!  };
//!
//! Only the copy operations are deleted explicitly. Because they are
//! user-declared, no move constructor or move assignment operator is
//! implicitly generated, so moving is rejected as well.
class NonCopyable {
 public:
  NonCopyable() = default;

  // No copy/move semantics
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
};
|
|
|
|
//! A generic root for a hierarchy of polymorphic classes:
//! - It ensures virtual destructors
//! - Provides the base->as<Derived>() and node->isA<T>() notation
class PolymorphicBase {
 public:
  virtual ~PolymorphicBase() = default;

  //! Replacement for static_cast<T*>(ptr): ptr->as<T>()
  //!
  //! When NDEBUG is defined this is an unchecked static_cast. Otherwise the
  //! cast is performed with dynamic_cast and a null result (the object is not
  //! a T) is reported through TORCH_INTERNAL_ASSERT.
  template <class T>
  T* as() {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<T*>(this);
#else
    auto downcast_ptr = dynamic_cast<T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }

  //! Const overload of as<T>(); same checking rules, yields a `const T*`.
  template <class T>
  const T* as() const {
#ifdef NDEBUG
    auto downcast_ptr = static_cast<const T*>(this);
#else
    auto downcast_ptr = dynamic_cast<const T*>(this);
    TORCH_INTERNAL_ASSERT(downcast_ptr != nullptr);
#endif
    return downcast_ptr;
  }

  //! Check if the runtime type is T (or derived from T)
  //!
  //! \note Don't use this for conditional casts. Instead, use:
  //!
  //!  if (auto t = dynamic_cast<T*>(p)) { ... }
  //!
  //! instead of:
  //!
  //!  if (p->isA<T>()) { auto t = p->as<T>(); ... }
  //!
  template <class T>
  bool isA() const {
    return dynamic_cast<const T*>(this) != nullptr;
  }
};
|
|
|
|
//! Packs two values of the same enum type into a single unsigned int.
//!
//! `t1` is shifted left by 16 bits and `t2` is added to it. The function is
//! constexpr, so the result is usable in constant expressions.
//! NOTE(review): going by the name, presumably intended for
//! `switch (switch_pair(a, b)) { case switch_pair(X, Y): ... }` -- confirm
//! against the callers.
//!
//! Two different pairs only map to different results while the numeric value
//! of `t2` stays below 2^16 (and `t1` fits in the bits left above the shift).
//!
//! \tparam T an enumeration type (enforced through enable_if)
template <class T, std::enable_if_t<std::is_enum<T>::value, bool> = true>
constexpr unsigned int switch_pair(T t1, T t2) {
  // Named without a leading underscore: identifiers that start with an
  // underscore followed by an uppercase letter are reserved.
  constexpr unsigned int kWordShift = 16;
  return (static_cast<unsigned int>(t1) << kWordShift) +
      static_cast<unsigned int>(t2);
}
|
|
|
|
//! Returns a vector of int64_t sizes derived from `tensor_type`.
//!
//! NOTE(review): declaration only; presumably one entry per tensor dimension.
//! How dimensions whose size is not known in the type are handled is not
//! visible here -- confirm against the definition.
std::vector<int64_t> getTensorSizes(TensorTypePtr const& tensor_type);
|
|
|
|
} // namespace cuda
|
|
} // namespace fuser
|
|
} // namespace jit
|
|
} // namespace torch
|