diff --git a/.ci/aarch64_linux/aarch64_ci_build.sh b/.ci/aarch64_linux/aarch64_ci_build.sh
index 178db42a609..a0eb0b72df2 100644
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@@ -31,8 +31,7 @@ pip install -r /pytorch/requirements.txt
 pip install auditwheel==6.2.0 wheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
     echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
-    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
     echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
     export USE_SYSTEM_NCCL=1
@@ -46,6 +45,5 @@ else
         export USE_NVIDIA_PYPI_LIBS=1
     fi
 
-    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
-    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+    python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
diff --git a/.ci/aarch64_linux/aarch64_wheel_ci_build.py b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
index 1b6429fa8c0..d4afea81ac0 100755
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@@ -317,7 +317,7 @@ if __name__ == "__main__":
         ).decode()
 
     print("Building PyTorch wheel")
-    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    build_vars = ""
     # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
     if enable_cuda:
         build_vars += "MAX_JOBS=5 "
diff --git a/.gitignore b/.gitignore
index 2dd40f8cfa8..ca87f1306e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -259,6 +259,9 @@ gen
 .pytest_cache
 aten/build/*
 
+# Linker scripts for prioritized text optimization
+cmake/linker_script.ld
+
 # Bram
 plsdontbreak
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fba0eea881..8323f310fec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -380,6 +380,13 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
                        OFF "USE_CUDA" OFF)
 cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
                        "CPU_AARCH64" OFF)
+# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
+set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
+if(LINUX AND CPU_AARCH64)
+  set(USE_PRIORITIZED_TEXT_DEFAULT ON)
+endif()
+cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
+  "${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
 
 option(USE_MIMALLOC "Use mimalloc" OFF)
 # Enable third party mimalloc library to improve memory allocation performance
@@ -657,6 +664,11 @@ endif(MSVC)
 
 string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
 
+# Set linker max-page-size to 64KiB on AArch64 Linux
+if(LINUX AND CPU_AARCH64)
+  add_link_options_if_supported("-z,max-page-size=0x10000")
+endif()
+
 # Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
 # applicable to mobile are disabled by this variable. Setting
 # `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@@ -1421,3 +1433,77 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
   install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
           DESTINATION "${CMAKE_INSTALL_BINDIR}")
 endif()
+
+# Prioritized text: compile C/C++ sources with one section per function and per
+# data object, then link the main libraries with a linker script (-T) that is
+# generated from cmake/prioritized_text.txt.
+if(USE_PRIORITIZED_TEXT_FOR_LD)
+  add_compile_options(
+    $<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
+    $<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
+  )
+  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
+  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
+  set(LINKER_SCRIPT_GENERATOR "${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py")
+
+  # The -T probe in the loop below links a test program with the script, so a
+  # script has to exist while configuring. Generate it once here when missing;
+  # the build rule that follows regenerates it when its inputs change.
+  if(NOT EXISTS "${LINKER_SCRIPT_FILE_OUT}")
+    execute_process(
+      COMMAND ${Python_EXECUTABLE} "${LINKER_SCRIPT_GENERATOR}" --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
+      RESULT_VARIABLE LINKER_SCRIPT_RESULT
+    )
+    if(NOT LINKER_SCRIPT_RESULT EQUAL 0)
+      message(WARNING "Could not generate '${LINKER_SCRIPT_FILE_OUT}' at configure time.")
+    endif()
+  endif()
+
+  add_custom_command(
+    OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
+    COMMAND ${Python_EXECUTABLE} "${LINKER_SCRIPT_GENERATOR}" --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
+    DEPENDS "${LINKER_SCRIPT_GENERATOR}" "${LINKER_SCRIPT_FILE_IN}"
+    COMMENT "Generating prioritized text linker files"
+    VERBATIM
+  )
+
+  add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
+
+  # Start from an empty list so a same-named variable set elsewhere cannot leak in.
+  set(LINKER_OPT_TARGETS "")
+  if(BUILD_PYTHON)
+    list(APPEND LINKER_OPT_TARGETS torch_python)
+  endif()
+
+  if(NOT BUILD_LIBTORCHLESS)
+    list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
+    if(USE_CUDA)
+      list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
+    endif()
+    if(USE_XPU)
+      list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
+    endif()
+    if(USE_ROCM)
+      list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
+    endif()
+  endif()
+
+  foreach(tgt IN LISTS LINKER_OPT_TARGETS)
+    if(TARGET ${tgt})
+      # Build the script before the target and relink when the script changes.
+      add_dependencies("${tgt}" generate_linker_script)
+      target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
+      set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
+    else()
+      message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
+    endif()
+  endforeach()
+
+else()
+  if(LINUX AND CPU_AARCH64)
+    message(WARNING [[
+      It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
+      To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
+    ]])
+  endif()
+endif()
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index fb64e99bccf..a0bfb22bed8 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -158,6 +158,7 @@ function(caffe2_print_configuration_summary)
   if(${USE_KLEIDIAI})
     message(STATUS " USE_KLEIDIAI : ${USE_KLEIDIAI}")
   endif()
+  message(STATUS " USE_PRIORITIZED_TEXT_FOR_LD : ${USE_PRIORITIZED_TEXT_FOR_LD}")
   message(STATUS " USE_UCC : ${USE_UCC}")
   if(${USE_UCC})
     message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")
diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake
index 68e66bb3fc3..c96ffebf858 100644
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@@ -482,6 +482,7 @@ function(torch_update_find_cuda_flags)
 endfunction()
 
 include(CheckCXXCompilerFlag)
+include(CheckLinkerFlag)
 
 ##############################################################################
 # CHeck if given flag is supported and append it to provided outputvar
@@ -511,3 +512,31 @@ function(target_compile_options_if_supported target flag)
     target_compile_options(${target} PRIVATE ${flag})
   endif()
 endfunction()
+
+# Add `flag` as a link option for the current directory and below when the
+# linker accepts it; otherwise only warn. `flag` is written without a driver
+# prefix (e.g. "-z,max-page-size=0x10000") and forwarded via CMake's LINKER: prefix.
+# check_linker_flag() stores its result in a cache entry named by its last
+# argument and does not probe again once that entry exists, so the entry name
+# is derived from the flag to keep the results of different flags apart.
+function(add_link_options_if_supported flag)
+  string(MAKE_C_IDENTIFIER "LINKER_SUPPORTS_${flag}" _supported_var)
+  check_linker_flag(C "LINKER:${flag}" "${_supported_var}")
+  if(${_supported_var})
+    add_link_options("LINKER:${flag}")
+  else()
+    message(WARNING "Attempted to use unsupported link option : ${flag}.")
+  endif()
+endfunction()
+
+# Target-scoped variant of add_link_options_if_supported(): adds `flag` as a
+# PRIVATE link option of `tgt` when the linker accepts it; otherwise only warns.
+function(target_link_options_if_supported tgt flag)
+  string(MAKE_C_IDENTIFIER "LINKER_SUPPORTS_${flag}" _supported_var)
+  check_linker_flag(C "LINKER:${flag}" "${_supported_var}")
+  if(${_supported_var})
+    target_link_options("${tgt}" PRIVATE "LINKER:${flag}")
+  else()
+    message(WARNING "Attempted to use unsupported link option : ${flag}.")
+  endif()
+endfunction()
diff --git a/setup.py b/setup.py
index c0523a1b5c6..2bb63a93cec 100644
--- a/setup.py
+++ b/setup.py
@@ -227,9 +227,6 @@
 #     Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free.
 #     By default, It is only enabled on Windows.
 #
-#   USE_PRIORITIZED_TEXT_FOR_LD
-#     Uses prioritized text form cmake/prioritized_text.txt for LD
-#
 #   BUILD_LIBTORCH_WHL
 #     Builds libtorch.so and its dependencies as a wheel
 #
@@ -323,7 +320,6 @@ from tools.setup_helpers.env import (
     IS_LINUX,
     IS_WINDOWS,
 )
-from tools.setup_helpers.generate_linker_script import gen_linker_script
 
 
 def str2bool(value: str | None) -> bool:
@@ -1627,26 +1623,6 @@ def main() -> None:
     if BUILD_PYTHON_ONLY:
         install_requires += [f"{LIBTORCH_PKG_NAME}=={TORCH_VERSION}"]
 
-    if str2bool(os.getenv("USE_PRIORITIZED_TEXT_FOR_LD")):
-        gen_linker_script(
-            filein="cmake/prioritized_text.txt", fout="cmake/linker_script.ld"
-        )
-        linker_script_path = os.path.abspath("cmake/linker_script.ld")
-        os.environ["LDFLAGS"] = os.getenv("LDFLAGS", "") + f" -T{linker_script_path}"
-        os.environ["CFLAGS"] = (
-            os.getenv("CFLAGS", "") + " -ffunction-sections -fdata-sections"
-        )
-        os.environ["CXXFLAGS"] = (
-            os.getenv("CXXFLAGS", "") + " -ffunction-sections -fdata-sections"
-        )
-    elif platform.system() == "Linux" and platform.processor() == "aarch64":
-        print_box(
-            """
-            WARNING: we strongly recommend enabling linker script optimization for ARM + CUDA.
-            To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
-            """
-        )
-
     # Parse the command line and check the arguments before we proceed with
     # building deps and setup. We need to set values so `--help` works.
     dist = Distribution()
diff --git a/tools/setup_helpers/generate_linker_script.py b/tools/setup_helpers/generate_linker_script.py
index e66fc197062..b5a7a4ce7de 100644
--- a/tools/setup_helpers/generate_linker_script.py
+++ b/tools/setup_helpers/generate_linker_script.py
@@ -1,5 +1,7 @@
+import argparse
 import os
 import subprocess
+from pathlib import Path
 
 
 def gen_linker_script(
@@ -28,6 +30,10 @@ def gen_linker_script(
     assert len(text_line_start) == 1, "The linker script has multiple text sections!"
     text_line_start = text_line_start[0]
 
+    # ensure that parent directory exists before writing
+    fout = Path(fout)
+    fout.parent.mkdir(parents=True, exist_ok=True)
+
     with open(fout, "w") as f:
         for lineid, line in enumerate(linker_script_lines):
             if lineid == text_line_start + 2:
@@ -36,3 +42,20 @@ def gen_linker_script(
                     f.write(f" .text.{plines}\n")
                 f.write(" )\n")
             f.write(f"{line}\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Generate linker file based on prioritized symbols. Used for link-time optimization.",
+    )
+    parser.add_argument(
+        "--filein",
+        help="Path to prioritized_text.txt input file",
+        default=argparse.SUPPRESS,
+    )
+    parser.add_argument(
+        "--fout", help="Output path for linker ld file", default=argparse.SUPPRESS
+    )
+    # convert args to a dict to pass to gen_linker_script
+    kwargs = vars(parser.parse_args())
+    gen_linker_script(**kwargs)