Debugging illegal memory access is hard; even CUDA_LAUNCH_BLOCKING=1 and C10_CUDA_KERNEL_LAUNCH_CHECK don't necessarily get you a stack trace pointing to the right kernel. This diff adds a config option to force a CUDA synchronize after every kernel call in inductor, for debugging those tricky cases.

Differential Revision: [D41744967](https://our.internmc.facebook.com/intern/diff/D41744967/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90472
Approved by: https://github.com/jansel
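For reference, the new switch lives under `triton.debug_sync_kernel` in this file. A minimal sketch of using it while chasing an illegal memory access (`repro.py` is a placeholder script name, and depending on the tree the module is importable as `torch._inductor.config` or `torchinductor.config`):

    import torch._inductor.config as inductor_config

    # force a CUDA synchronize after every kernel call in the generated code
    inductor_config.triton.debug_sync_kernel = True

run together with `CUDA_LAUNCH_BLOCKING=1 python repro.py` so the failing launch surfaces at the right spot.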
294 lines
8.6 KiB
Python
import os
import sys

# add some debug printouts
debug = False

# use cpp wrapper instead of python wrapper
cpp_wrapper = False

# dead code elimination
dce = False

# assume input tensors are dynamic
dynamic_shapes = (
    os.environ.get("TORCHDYNAMO_DYNAMIC_SHAPES") == "1"
)  # Use dynamic shapes if torchdynamo dynamic shapes is set

# assume weight tensors are fixed size
static_weight_shapes = True

# put correctness assertions in generated code
size_asserts = True

# enable loop reordering based on input orders
pick_loop_orders = True

# generate inplace computations
inplace_buffers = True

# codegen benchmark harness
benchmark_harness = True

# control the store vs. recompute heuristic
# For fanouts, rematerialization can lead to exponential blowup, so use a
# smaller threshold
realize_reads_threshold = 4
realize_bytes_threshold = 2000

# Threshold to prevent excessive accumulation of ops in one buffer during lowering
realize_acc_reads_threshold = 8

# fall back to eager for random/dropout; this is slow but useful for debugging
fallback_random = False

# automatically create fallbacks when encountering an unhandled op
implicit_fallbacks = True

# Enables a fusion pass that groups nodes together before the scheduler
prefuse_nodes = True

# benchmark to decide the best layout, currently only for aten.conv
tune_layout = False

# fuse even in cases without common reads
aggressive_fusion = False

# how many nodes to allow into a single fusion
max_fusion_size = 64

# replace small reductions with pointwise ops; disable with `= 1`
unroll_reductions_threshold = 8

comment_origin = False
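
# The flags above are plain module attributes, so callers can override them
# before compiling. A minimal sketch (assumption: the module is importable as
# torch._inductor.config in this tree; older trees used `torchinductor.config`):
#
#     import torch._inductor.config as inductor_config
#
#     inductor_config.max_fusion_size = 128   # allow larger fusions
#     inductor_config.fallback_random = True  # match eager RNG when debugging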


def is_fbcode():
    import torch

    return not hasattr(torch.version, "git_version")


compile_threads = (
    1
    if sys.platform == "win32" or is_fbcode()
    else min(
        32,
        len(os.sched_getaffinity(0))
        if hasattr(os, "sched_getaffinity")
        else os.cpu_count(),
    )
)
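# For example, on a 64-core Linux host the affinity branch above yields
# min(32, 64) == 32 worker threads, while on Windows or in fbcode the count
# is pinned to 1 (the host size here is illustrative, not measured).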

# If a kernel is fused, its name is generated from the origin node op names;
# for larger kernels limit this
kernel_name_max_ops = 10

# How to import torchinductor, either torchinductor or torch.inductor
inductor_import = __name__.replace(".config", "")

# How to import torchdynamo, either torchdynamo or torch.dynamo
dynamo_import = inductor_import.replace("inductor", "dynamo")

# Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs
shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "0") == "1"

# Pad input tensors in dimensions N and M of bmm to leverage Tensor Cores in NVIDIA GPUs
shape_padding_bmm = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING_BMM", "1") == "1"

# Fx-based linear/matmul/bmm + permute/transpose vertical fusion
permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1"

# Mark the wrapper call in PyTorch profiler
profiler_mark_wrapper_call = False


# config specific to codegen/cpp.py
class cpp:
    # set to torch.get_num_threads()
    threads = -1

    # Assume the number of threads is dynamic, don't specialize on the thread count.
    # Kernels don't recompile on thread-count changes with this flag on.
    # For single-threaded workloads, turning it on would incur a slight
    # performance degradation.
    dynamic_threads = False

    simdlen = None
    min_chunk_size = 4096
    cxx = (
        None,  # download gcc12 from conda-forge if conda is installed
        # "g++-12",
        # "g++-11",
        # "g++-10",
        # "clang++",
        "g++",
        # "g++.par",
    )
    # Allow kernel performance profiling via PyTorch profiler
    enable_kernel_profile = False


# config specific to codegen/triton.py
class triton:

    # Use cudagraphs on output code
    cudagraphs = True

    # Synchronize before and after every compiled graph.
    debug_sync_graph = False

    # Synchronize after every kernel launch, to help pinpoint bugs
    debug_sync_kernel = False

    # choose conv backend, "aten" or "triton" or "autotune"
    convolution = "aten"

    # choose mm backend, "aten" or "triton" or "autotune"
    mm = "aten"

    # Always load full blocks (rather than broadcasting inside the block).
    # The default was set to True because otherwise Triton encounters a
    # `map::at` error when loading from a 1-dim tensor using a 2-dim pointer offset
    # https://triton-lang.slack.com/archives/C01L1FLTX70/p1656023403343639
    # It could be set to False once Triton fixes the bug.
    dense_indexing = False

    # limit tiling dimensions
    max_tiles = 2

    # use triton.autotune?
    autotune = True

    use_bmm = False

    # should we stop a fusion to allow better tiling?
    tiling_prevents_pointwise_fusion = True
    tiling_prevents_reduction_fusion = True
    # should we give different names to kernels
    ordered_kernel_names = False
    # should we put op names in kernel names
    descriptive_kernel_names = True


# create a directory containing lots of debug information
class trace:
    # master switch for all debugging flags below
    enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1"

    # Save python logger calls >= logging.DEBUG
    debug_log = True

    # Save python logger calls >= logging.INFO
    info_log = False

    # Save input FX graph (post decomps)
    fx_graph = True

    # Save TorchInductor IR before fusion pass
    ir_pre_fusion = True

    # Save TorchInductor IR after fusion pass
    ir_post_fusion = True

    # Copy generated code to trace dir
    output_code = True

    # SVG figure showing post-fusion graph
    graph_diagram = False

    # Store cProfile (see snakeviz to view)
    compile_profile = False

    # Upload the .tar.gz file
    # Needs to be overridden based on specific environment needs
    upload_tar = None
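
    # A minimal sketch of how these switches are typically used (`train.py`
    # is a placeholder script name, not part of this module):
    #
    #     TORCH_COMPILE_DEBUG=1 python train.py
    #
    # which flips `trace.enabled` and dumps the artifacts selected above
    # (FX graph, pre/post-fusion IR, generated code, ...) into the trace
    # debug directory.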


class InductorConfigContext:
    static_memory: bool
    matmul_tune: str
    matmul_padding: bool
    triton_autotune: bool
    triton_bmm: bool
    triton_mm: str
    triton_convolution: str
    rematerialize_threshold: int
    rematerialize_acc_threshold: int

    def _save(self):
        self.static_memory = triton.cudagraphs
        self.matmul_tune = triton.mm
        self.matmul_padding = shape_padding
        self.triton_autotune = triton.autotune
        self.triton_bmm = triton.use_bmm
        self.triton_mm = triton.mm
        self.triton_convolution = triton.convolution
        self.rematerialize_threshold = realize_reads_threshold
        self.rematerialize_acc_threshold = realize_acc_reads_threshold

    def _apply(self):
        # These three are module-level globals; without the `global`
        # declaration the assignments below would only bind locals and
        # silently have no effect.
        global shape_padding, realize_reads_threshold, realize_acc_reads_threshold
        triton.cudagraphs = self.static_memory
        triton.mm = self.matmul_tune
        shape_padding = self.matmul_padding
        triton.autotune = self.triton_autotune
        triton.use_bmm = self.triton_bmm
        triton.mm = self.triton_mm
        triton.convolution = self.triton_convolution
        realize_reads_threshold = self.rematerialize_threshold
        realize_acc_reads_threshold = self.rematerialize_acc_threshold

    def __init__(self, arg=None):
        self._save()
        if arg is None:
            return
        # Handle mode
        if type(arg) is str:

            def default():
                self.static_memory = False

            def reduce_overhead():
                self.static_memory = True

            def max_autotune():
                self.static_memory = False
                self.matmul_padding = True
                self.triton_convolution = "autotune"
                self.triton_mm = "autotune"

            modes = {
                x.__name__.replace("_", "-"): x
                for x in [default, reduce_overhead, max_autotune]
            }
            if arg not in modes:
                raise RuntimeError(
                    f"Unrecognized mode {arg}, should be one of {', '.join(modes.keys())}"
                )
            modes[arg]()
            return
        # Handle passes
        for (name, val) in arg.items():
            attr_name = name.replace("-", "_")
            if not hasattr(self, attr_name):
                known_passes = ", ".join(
                    [x.replace("_", "-") for x in dir(self) if not x.startswith("_")]
                )
                raise RuntimeError(
                    f"Unexpected optimization pass {name}, known passes are {known_passes}"
                )
            if type(val) != type(getattr(self, attr_name)):
                val_type_str = type(val).__name__
                expected_type_str = type(getattr(self, attr_name)).__name__
                raise RuntimeError(
                    f"Unexpected type of attr {name}, got {val_type_str} should be {expected_type_str}"
                )
            setattr(self, attr_name, val)

    def __enter__(self):
        self._prev = InductorConfigContext()
        self._apply()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._prev._apply()
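

# A minimal usage sketch of InductorConfigContext (the mode names mirror the
# `modes` dict in __init__; `my_model` and `example_input` are placeholder
# names, not part of this module):
#
#     import torch
#     from torch._inductor.config import InductorConfigContext
#
#     with InductorConfigContext("max-autotune"):
#         compiled = torch.compile(my_model)
#         compiled(example_input)
#
#     # or override individual "passes" by name, with hyphens for underscores:
#     with InductorConfigContext({"matmul-padding": True, "triton-mm": "autotune"}):
#         ...
#
# On entry the previous config is saved and the requested values applied; on
# exit the saved config is restored.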