Benjamin Glass 2025-05-21 16:18:09 +00:00 committed by PyTorch MergeBot
parent 4c6f0fe22f
commit 3c89cfd460
3 changed files with 141 additions and 52 deletions

View File

@ -138,6 +138,7 @@ if TYPE_CHECKING:
from concurrent.futures import Future
from .compile_fx import _CompileFxKwargs
from .cpp_builder import BuildOptionsBase
from .graph import GraphLowering
from .ir import ChoiceCaller
from .output_code import CompiledFxGraphConstants, OutputCode
@ -2225,7 +2226,7 @@ def _precompile_header(
os.makedirs(_HEADER_LOCK_DIR, exist_ok=True)
_worker_compile_cpp(
os.path.join(_HEADER_LOCK_DIR, f"{header_hash}.lock"),
cpp_builder,
(cpp_builder,),
)
return header_full_path
@ -2251,6 +2252,9 @@ def _get_cpp_wrapper_header(device: str, aot_mode: bool = False) -> str:
@clear_on_fresh_inductor_cache
class CppCodeCache:
"""Compiles and caches C++ libraries. Users of this class supply the source code to
be compiled, while compilation flags are set by CppBuilder."""
cache: dict[str, Callable[[], Union[CDLL, ModuleType]]] = {}
cache_clear = staticmethod(cache.clear)
cpp_compile_command_flags: dict[str, Any] = {}
@ -2292,11 +2296,14 @@ class CppCodeCache:
@classmethod
def load_async(
cls,
source_code: str,
main_code: str,
device_type: str = "cpu",
submit_fn: Any = None,
extra_flags: Sequence[str] = (),
optimized_code: Optional[str] = None,
) -> Any:
"""Compile and load a C++ library. Returns a callable that returns the loaded
library."""
compile_command = {
**cls.cpp_compile_command_flags,
"device_type": device_type,
@ -2307,48 +2314,112 @@ class CppCodeCache:
_set_gpu_runtime_env() # cpp_extension consults the env
cpp_build_option = CppTorchDeviceOptions(**compile_command)
command_gen = CppBuilder(name="o", sources="i", BuildOption=cpp_build_option)
# write() hashes the source code, but the same source compiled at a different
# ISA level must produce a different hash. To guarantee that, generate a
# command line containing the ISA-related flags and pass it to write() below
# as extra data, so the hash key reflects ISA differences.
vec_isa_cmd = repr(command_gen.get_command_line())
key, input_path = write(source_code, "cpp", extra=vec_isa_cmd)
# Note the distinction between the two booleans. We do minimal optimization if
# the optimized_code argument is present at all, since that's how the user of
# this function opts in, but we do compilation and linking in one step if the
# optimized_code argument is empty (as a micro-optimization).
main_build_option = CppTorchDeviceOptions(
compile_only=bool(optimized_code),
min_optimize=optimized_code is not None,
**compile_command,
)
optimized_build_option = CppTorchDeviceOptions(
compile_only=True, **compile_command
)
def get_hashable_command_line(build_option: BuildOptionsBase) -> str:
"""Writing the code to file will calculate a hash, which we need to vary if
the command line flags change. This implements a mostly-generic way of
ensuring that."""
return CppBuilder(
name="o", sources="i", BuildOption=build_option
).get_command_line()
main_cmd_line = get_hashable_command_line(main_build_option)
optimized_cmd_line = get_hashable_command_line(optimized_build_option)
key, main_path = write(
main_code, "main.cpp", extra=f"{optimized_code} {main_cmd_line}"
)
# Don't bother writing if the argument is empty.
if optimized_code:
_, optimized_path = write(
optimized_code, "optimized.cpp", extra=optimized_cmd_line
)
else:
# Unused, but makes type checkers happy.
optimized_path = os.devnull
if key not in cls.cache:
from torch.utils._filelock import FileLock
lock_path = os.path.join(get_lock_dir(), key + ".lock")
output_name, output_dir = get_name_and_dir_from_output_file_path(input_path)
future: Optional[Future[Any]] = None
lib = None
# if requested, pre-compile any headers
if (
config.cpp_cache_precompile_headers
and not _IS_WINDOWS
and (header_file := cls._get_uncompiled_header(device_type))
):
cpp_build_option.precompiled_header = _precompile_header(
header_file,
vec_isa_cmd,
**compile_command,
if config.cpp_cache_precompile_headers and not _IS_WINDOWS:
if header := cls._get_uncompiled_header(device_type):
main_build_option.precompiled_header = _precompile_header(
header,
main_cmd_line,
min_optimize=optimized_code is not None,
**compile_command,
)
# Currently, the optimized_code field is only used for cpp kernel code,
# so go ahead and precompile the relevant header here. Revisit this
# decision if that ever changes.
if optimized_code and (header := _get_cpp_prefix_header(device_type)):
optimized_build_option.precompiled_header = _precompile_header(
header,
optimized_cmd_line,
**compile_command,
)
main_name, output_dir = get_name_and_dir_from_output_file_path(main_path)
main_builder = CppBuilder(
name=main_name,
sources=main_path,
BuildOption=main_build_option,
output_dir=output_dir,
)
if optimized_code:
optimized_name, _ = get_name_and_dir_from_output_file_path(
optimized_path
)
optimized_builder = CppBuilder(
name=optimized_name,
sources=optimized_path,
BuildOption=optimized_build_option,
output_dir=output_dir,
)
cpp_builder = CppBuilder(
name=output_name,
sources=input_path,
output_dir=output_dir,
BuildOption=cpp_build_option,
)
worker_fn = functools.partial(
_worker_compile_cpp,
lock_path,
cpp_builder,
)
binary_path = normalize_path_separator(cpp_builder.get_target_file_path())
linker = CppBuilder(
name=main_name,
sources=[
main_builder.get_target_file_path(),
optimized_builder.get_target_file_path(),
],
BuildOption=CppTorchDeviceOptions(**compile_command),
output_dir=output_dir,
)
worker_fn = functools.partial(
_worker_compile_cpp,
lock_path,
(main_builder, optimized_builder, linker),
)
binary_path = normalize_path_separator(linker.get_target_file_path())
else:
worker_fn = functools.partial(
_worker_compile_cpp, lock_path, (main_builder,)
)
binary_path = normalize_path_separator(
main_builder.get_target_file_path()
)
def load_fn() -> Any:
nonlocal lib
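
The write(...) calls in this hunk fold both the kernel source and the flag-bearing compiler command line into the hash key, so the same main_code recompiled under different ISA flags, or with a different kernel attached, gets a distinct cache entry. A minimal stand-in sketch of that idea, assuming a simplified key function (hashlib and the helper name here are illustrative; the real helper is write()):

import hashlib
from typing import Optional

def fake_cache_key(main_code: str, optimized_code: Optional[str], cmd_line: str) -> str:
    # Mirrors the hunk above: the kernel source and the ISA/flag-bearing
    # command line are folded in as "extra" data, so changing either one
    # produces a fresh key even for identical main_code.
    payload = f"{main_code}\0{optimized_code} {cmd_line}"
    return hashlib.sha256(payload.encode()).hexdigest()[:16]

assert fake_cache_key("int f();", None, "-O1 -mavx2") != fake_cache_key("int f();", None, "-O1")
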
@ -2371,19 +2442,20 @@ class CppCodeCache:
return cls.cache[key]
@classmethod
def load(cls, source_code: str, device_type: str = "cpu") -> Any:
return cls.load_async(source_code, device_type)()
def load(cls, *args: Any, **kwargs: Any) -> Any:
return cls.load_async(*args, **kwargs)()
def _worker_compile_cpp(
lock_path: str,
cpp_builder: CppBuilder,
cpp_builders: Sequence[CppBuilder],
) -> None:
from torch.utils._filelock import FileLock
with FileLock(lock_path, timeout=LOCK_TIMEOUT):
if not os.path.exists(cpp_builder.get_target_file_path()):
cpp_builder.build()
for builder in cpp_builders:
if not os.path.exists(builder.get_target_file_path()):
builder.build()
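
Since _worker_compile_cpp now receives a sequence, a single locked invocation can run the whole object-then-link pipeline in order. A runnable stand-in, assuming placeholder builder objects that expose only the two methods the worker actually uses (the real worker also holds a FileLock on lock_path):

import os
from collections.abc import Sequence

class FakeBuilder:
    """Placeholder for CppBuilder exposing only the two methods used above."""
    def __init__(self, target: str) -> None:
        self.target = target
    def get_target_file_path(self) -> str:
        return self.target
    def build(self) -> None:
        print(f"building {self.target}")

def fake_worker(lock_path: str, builders: Sequence[FakeBuilder]) -> None:
    # The real worker wraps this loop in FileLock(lock_path, timeout=LOCK_TIMEOUT).
    for builder in builders:
        if not os.path.exists(builder.get_target_file_path()):
            builder.build()

# Order matters: both object files are built before the final link step runs.
fake_worker("key.lock", (FakeBuilder("main.o"), FakeBuilder("optimized.o"), FakeBuilder("module.so")))
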
# Customized Python binding for cpp kernels
@ -2513,19 +2585,24 @@ class CppPythonBindingsCodeCache(CppCodeCache):
@classmethod
def load_pybinding_async(
cls,
argtypes: list[str],
source_code: str,
argtypes: Sequence[str],
main_code: str,
device_type: str = "cpu",
num_outputs: int = -1,
submit_fn: Any = None,
extra_flags: Sequence[str] = (),
kernel_code: Optional[str] = None,
) -> Any:
"""
Wrap a C++ function in fast Python bindings.
Args:
argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
source_code: C++ source code containing an ENTRY_FUNCTION() function
main_code: C++ source code containing ENTRY_FUNCTION(). Will be built at
-O3 if kernel_code is None (to maximize performance in any kernels that
are present), or -O1 otherwise (to minimize compile time).
kernel_code: If present, C++ source code that will be built at -O3 and
linked to main_code.
Returns:
A python version of ENTRY_FUNCTION()
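
A small stand-in for the optimization-level split described above, assuming the levels map directly onto -O1/-O3 as the docstring states (the helper name is hypothetical):

from typing import Optional

def fake_opt_levels(kernel_code: Optional[str]) -> tuple[str, Optional[str]]:
    # main_code drops to -O1 only when the caller opted in by passing kernel_code;
    # the kernel file, when present and non-empty, is always built at -O3.
    main_level = "-O3" if kernel_code is None else "-O1"
    kernel_level = "-O3" if kernel_code else None
    return main_level, kernel_level

assert fake_opt_levels(None) == ("-O3", None)                 # single-file build
assert fake_opt_levels("void kernel() {}") == ("-O1", "-O3")  # split build, linked afterwards
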
@ -2541,10 +2618,11 @@ class CppPythonBindingsCodeCache(CppCodeCache):
extra_parse_arg=cls.extra_parse_arg.format(array_len=num_outputs),
)
get_result = cls.load_async(
source_code + suffix,
main_code + suffix,
device_type,
submit_fn=submit_fn,
extra_flags=extra_flags,
optimized_code=kernel_code,
)
result = None

View File

@ -1074,6 +1074,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
result.writeline("} // inductor_entry_impl")
def generate_end(self, result):
"""Generates the end of the code block, and any code needed to call it."""
if V.graph.aot_mode:
if V.graph.is_const_graph:
result.writeline("} // AOTInductorModel::_const_run_impl")
@ -1081,19 +1082,29 @@ class CppWrapperCpu(PythonWrapperCodegen):
result.writeline("} // namespace torch::aot_inductor\n\n\n")
return
# Add any kernel definitions into the wrapped code. We currently only build
# them in separate files in AOT mode.
result.splice(self.kernel_declarations.getvalue())
self.kernel_declarations.clear()
# Close the wrapper code block, then write any kernel definitions.
result.splice("'''\n)")
if self.kernel_declarations:
result.splice("\nkernel_src = (\nr'''")
result.splice(self.kernel_declarations.getvalue())
result.splice("'''\n)")
else:
result.splice(
"""
kernel_src = ''
"""
)
# cpp entry function for JIT with cpp wrapper
result.splice(
f"""
'''
)
inductor_entry = CppWrapperCodeCache.load_pybinding(
["std::vector<AtenTensorHandle>"], cpp_wrapper_src, "{self.device}", {len(V.graph.graph_outputs)})
argtypes=["std::vector<AtenTensorHandle>"],
main_code=cpp_wrapper_src,
device_type="{self.device}",
num_outputs={len(V.graph.graph_outputs)},
kernel_code=kernel_src,
)
"""
)
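
Put together, the JIT cpp-wrapper path now emits a Python module shaped roughly like the sketch below. This is an illustration assembled from the splices above, not captured output: the C++ bodies are elided, and the device string and output count are placeholder values.

cpp_wrapper_src = (
    r'''
    // ... wrapper code, ending with inductor_entry_impl ...
    '''
)

# When there are no kernel declarations, this is simply: kernel_src = ''
kernel_src = (
    r'''
    // ... kernel definitions, built at -O3 ...
    '''
)

inductor_entry = CppWrapperCodeCache.load_pybinding(
    argtypes=["std::vector<AtenTensorHandle>"],
    main_code=cpp_wrapper_src,
    device_type="cpu",      # placeholder for {self.device}
    num_outputs=2,          # placeholder for len(V.graph.graph_outputs)
    kernel_code=kernel_src,
)
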

View File

@ -2284,8 +2284,8 @@ class GraphLowering(torch.fx.Interpreter):
return self._compile_to_module()
def _compile_to_module(self) -> CompiledModule:
# Currently, if we're here, we don't have to worry about the kernel code, which
# is only available in AOTInductor mode.
# If we're here, we don't have to worry about the kernel code, which is only
# returned separately in AOTInductor mode.
wrapper_code, _ = (
self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
)
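
A minimal stand-in for the return shape assumed here: the codegen methods hand back a (wrapper_code, kernel_code) pair, and the JIT path ignores the second element because kernel_src is already spliced into the wrapper text (the function below is hypothetical):

def fake_codegen(aot_mode: bool) -> tuple[str, str]:
    wrapper = "# ... wrapper with kernel_src spliced in ..."      # placeholder text
    kernels = "// ... separate kernel file ..." if aot_mode else ""
    return wrapper, kernels

wrapper_code, _ = fake_codegen(aot_mode=False)  # JIT: the kernel half is unused
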