import os import sys # add some debug printouts debug = False # use cpp wrapper instead of python wrapper cpp_wrapper = False # dead code elimination dce = False # assume input tensors are dynamic dynamic_shapes = ( os.environ.get("TORCHDYNAMO_DYNAMIC_SHAPES") == "1" ) # Use dynamic shapes if torchdynamo dynamic shapes is set # assume weight tensors are fixed size static_weight_shapes = True # put correctness assertions in generated code size_asserts = True # enable loop reordering based on input orders pick_loop_orders = True # generate inplace computations inplace_buffers = True # codegen benchmark harness benchmark_harness = True # control store vs recompute heuristic # For fanouts, rematearialization can lead to exponential blowup. So, have # smaller threshold realize_reads_threshold = 4 realize_bytes_threshold = 2000 # Threshold to prevent excessive accumulation of ops in one buffer during lowering realize_acc_reads_threshold = 8 # fallback to eager for random/dropout, this is slow but useful for debugging fallback_random = False # automatically create fallbacks when encountering an unhandled op implicit_fallbacks = True # Enables a fusion pass that groups nodes together before the scheduler prefuse_nodes = True # do bench to decide best layout, currently only for aten.conv tune_layout = False # fuse even in cases without common reads aggressive_fusion = False # how many nodes to allow into a single fusion max_fusion_size = 64 # replace small reductions with pointwise, disable with `= 1` unroll_reductions_threshold = 8 comment_origin = False def is_fbcode(): import torch return not hasattr(torch.version, "git_version") compile_threads = ( 1 if sys.platform == "win32" or is_fbcode() else min( 32, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else os.cpu_count(), ) ) # If kernel is fused, the name is generated from the origin node op names # for larger kernels limit this kernel_name_max_ops = 10 # How to import torchinductor, either torchinductor or torch.inductor inductor_import = __name__.replace(".config", "") # How to import torchdynamo, either torchdynamo or torch.dynamo dynamo_import = inductor_import.replace("inductor", "dynamo") # Pad input tensors of matmul/bmm/addmm to leverage Tensor Cores in NVIDIA GPUs shape_padding = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING", "0") == "1" # Pad input tensors in dimension N and M of bmm to leverage Tensor Cores in NVIDIA GPUs shape_padding_bmm = os.environ.get("TORCHINDUCTOR_SHAPE_PADDING_BMM", "1") == "1" # Fx-based linear/matmul/bmm + permute/transpose vertical fusion permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1" # Mark the wrapper call in PyTorch profiler profiler_mark_wrapper_call = False # config specific to codegen/cpp.pp class cpp: # set to torch.get_num_threads() threads = -1 # Assume number of threads is dynamic, don't specialize thread number. # Kernels don't recompile on thread number changes with this flag on. # For single-threaded workload, turning it on would incur a slight # performance degradation. dynamic_threads = False simdlen = None min_chunk_size = 4096 cxx = ( None, # download gcc12 from conda-forge if conda is installed # "g++-12", # "g++-11", # "g++-10", # "clang++", "g++", # "g++.par", ) # Allow kernel performance profiling via PyTorch profiler enable_kernel_profile = False # config specific to codegen/triton.py class triton: # Use cudagraphs on output code cudagraphs = True # Synchronize before and after every compiled graph. debug_sync_graph = False # Synchronize after every kernel launch, to help pinpoint bugs debug_sync_kernel = False # choose conv backend, "aten" or "triton" or "autotune" convolution = "aten" # choose mm backend, "aten" or "triton" or "autotune" mm = "aten" # Always load full blocks (rather than broadcasting inside the block) # Set default as True because otherwise will encouter `map::at` error # in triton if loading from 1-dim tensor using 2-dim pointer offset # https://triton-lang.slack.com/archives/C01L1FLTX70/p1656023403343639 # could be set as False if triton fixes the bug later dense_indexing = False # limit tiling dimensions max_tiles = 2 # use triton.autotune? autotune = True use_bmm = False # should we stop a fusion to allow better tiling? tiling_prevents_pointwise_fusion = True tiling_prevents_reduction_fusion = True # should we give different names to kernels ordered_kernel_names = False # should we put op names in kernel names descriptive_kernel_names = True # create a directory containing lots of debug information class trace: # master switch for all debugging flags below enabled = os.environ.get("TORCH_COMPILE_DEBUG", "0") == "1" # Save python logger call >=logging.DEBUG debug_log = True # Save python logger call >=logging.INFO info_log = False # Save input FX graph (post decomps) fx_graph = True # Save TorchInductor IR before fusion pass ir_pre_fusion = True # Save TorchInductor IR after fusion pass ir_post_fusion = True # Copy generated code to trace dir output_code = True # SVG figure showing post-fusion graph graph_diagram = False # Store cProfile (see snakeviz to view) compile_profile = False # Upload the .tar.gz file # Needs to be overriden based on specific environment needs upload_tar = None class InductorConfigContext: static_memory: bool matmul_tune: str matmul_padding: bool triton_autotune: bool triton_bmm: bool triton_mm: str triton_convolution: str rematerialize_threshold: int rematerialize_acc_threshold: int def _save(self): self.static_memory = triton.cudagraphs self.matmul_tune = triton.mm self.matmul_padding = shape_padding self.triton_autotune = triton.autotune self.triton_bmm = triton.use_bmm self.triton_mm = triton.mm self.triton_convolution = triton.convolution self.rematerialize_threshold = realize_reads_threshold self.rematerialize_acc_threshold = realize_acc_reads_threshold def _apply(self): triton.cudagraphs = self.static_memory triton.mm = self.matmul_tune shape_padding = self.matmul_padding triton.autotune = self.triton_autotune triton.use_bmm = self.triton_bmm triton.mm = self.triton_mm triton.convolution = self.triton_convolution realize_reads_threshold = self.rematerialize_threshold realize_acc_reads_threshold = self.rematerialize_acc_threshold def __init__(self, arg=None): self._save() if arg is None: return # Handle mode if type(arg) is str: def default(): self.static_memory = False def reduce_overhead(): self.static_memory = True def max_autotune(): self.static_memory = False self.matmul_padding = True self.triton_convolution = "autotune" self.triton_mm = "autotune" self.matmul_padding = True modes = { x.__name__.replace("_", "-"): x for x in [default, reduce_overhead, max_autotune] } if arg not in modes: raise RuntimeError( f"Unrecognized mode {arg}, should be one of {', '.join(modes.keys())}" ) modes[arg]() return # Handle passes for (name, val) in arg.items(): attr_name = name.replace("-", "_") if not hasattr(self, attr_name): known_passes = ", ".join( [x.replace("_", "-") for x in dir(self) if not x.startswith("_")] ) raise RuntimeError( f"Unexpected optimization pass {name}, known passes are {known_passes}" ) if type(val) != type(getattr(self, attr_name)): val_type_str = type(val).__name__ expected_type_str = type(getattr(self, attr_name)).__name__ raise RuntimeError( f"Unexpected type of attr {name}, got {val_type_str} should be {expected_type_str}" ) setattr(self, attr_name, val) def __enter__(self): self._prev = InductorConfigContext() self._apply() def __exit__(self, exc_type, exc_val, exc_tb): self._prev._apply()