[AOTI][BE] Change test_aoti_inference to one-pass build (#164277)

Summary: Fixes https://github.com/pytorch/pytorch/issues/159400. Currently, test_aoti_abi_check and test_aoti_inference have to be built in two passes: first build PyTorch with the regular `python setup.py develop`, then rebuild with `CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop`. This is cumbersome. Fix it by rewriting the CMakeLists.txt for test_aoti_inference so everything builds in one pass and AOTI compiles the test models at test time. Also update the CI test script to drop the two-pass build. Since test_aoti_abi_check is not AOTI-specific, it is no longer guarded by BUILD_AOT_INDUCTOR_TEST.
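For reference, a minimal before/after sketch of the build flow described above, assuming a from-source checkout; only the commands named in this summary are shown, anything else on your setup (CUDA arch lists, extra env vars) stays as before:

```bash
# Before: two build passes were needed to get the AOTI C++ tests
python setup.py develop
CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop

# After: a single pass builds the test binaries; AOTI model compilation and
# .pt data generation now happen when the tests themselves run
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
```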
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164277
Approved by: https://github.com/janeyx99
Bin Bao 2025-10-28 10:30:16 -04:00 committed by PyTorch MergeBot
parent 895795f07c
commit 687c15c0b3
7 changed files with 150 additions and 92 deletions

View File

@@ -100,6 +100,8 @@ COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

View File

@@ -460,28 +460,18 @@ test_inductor_shard() {
--verbose
}
test_inductor_aoti() {
# docker build uses bdist_wheel which does not work with test_aot_inductor
# TODO: need a faster way to build
test_inductor_aoti_cpp() {
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
# We need to hipify before building again
python3 tools/amd_build/build_amd.py
fi
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
else
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
fi
# aoti cmake custom command requires `torch` to be installed
# initialize the cmake build cache and install torch
/usr/bin/env "${BUILD_COMMAND[@]}"
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
}
@@ -1776,7 +1766,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
test_inductor_aoti_cpp
fi
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision

View File

@@ -1358,9 +1358,15 @@ if(BUILD_TEST)
)
else()
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy ${CMAKE_BINARY_DIR}/test_lazy)
# NativeRT is disabled
# add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_abi_check ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
if(BUILD_AOT_INDUCTOR_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_inference ${CMAKE_BINARY_DIR}/test_aoti_inference)
endif()
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
@@ -1378,16 +1384,6 @@ if(BUILD_TEST)
${CMAKE_BINARY_DIR}/test_mobile_nnc
)
endif()
add_subdirectory(${TORCH_ROOT}/test/cpp/lazy
${CMAKE_BINARY_DIR}/test_lazy)
endif()
if(BUILD_AOT_INDUCTOR_TEST)
add_subdirectory(
${TORCH_ROOT}/test/cpp/aoti_abi_check
${CMAKE_BINARY_DIR}/test_aoti_abi_check)
add_subdirectory(
${TORCH_ROOT}/test/cpp/aoti_inference
${CMAKE_BINARY_DIR}/test_aoti_inference)
endif()
endif()

View File

@@ -1,3 +1,8 @@
# Skip on windows
if(WIN32)
return()
endif()
set(AOTI_ABI_CHECK_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_abi_check)
# Build the cpp gtest binary containing the cpp-only tests.
@@ -30,8 +35,15 @@ target_compile_definitions(test_aoti_abi_check PRIVATE USE_GTEST)
# WARNING: DO NOT LINK torch!!!
# The purpose is to check if the used aten/c10 headers are written in a header-only way
target_link_libraries(test_aoti_abi_check PRIVATE gtest_main)
target_link_libraries(test_aoti_abi_check PRIVATE gtest_main sleef)
target_include_directories(test_aoti_abi_check PRIVATE ${ATen_CPU_INCLUDE})
if(NOT USE_SYSTEM_SLEEF)
target_include_directories(test_aoti_abi_check PRIVATE ${CMAKE_BINARY_DIR}/include)
endif()
# Disable unused-variable warnings for variables that are only used to test compilation
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-variable)
target_compile_options_if_supported(test_aoti_abi_check -Wno-unused-but-set-variable)
foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
foreach(i RANGE ${NUM_CPU_CAPABILITY_NAMES})
@@ -41,12 +53,17 @@ foreach(test_src ${AOTI_ABI_CHECK_VEC_TEST_SRCS})
separate_arguments(FLAGS UNIX_COMMAND "${FLAGS}")
add_executable(${test_name}_${CPU_CAPABILITY} "${test_src}")
target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main)
target_link_libraries(${test_name}_${CPU_CAPABILITY} PRIVATE gtest_main sleef)
target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${ATen_CPU_INCLUDE})
if(NOT USE_SYSTEM_SLEEF)
target_include_directories(${test_name}_${CPU_CAPABILITY} PRIVATE ${CMAKE_BINARY_DIR}/include)
endif()
# Define CPU_CAPABILITY and CPU_CAPABILITY_XXX macros for conditional compilation
target_compile_definitions(${test_name}_${CPU_CAPABILITY} PRIVATE CPU_CAPABILITY=${CPU_CAPABILITY} CPU_CAPABILITY_${CPU_CAPABILITY})
target_compile_options(${test_name}_${CPU_CAPABILITY} PRIVATE ${FLAGS})
target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-variable)
target_compile_options_if_supported(${test_name}_${CPU_CAPABILITY} -Wno-unused-but-set-variable)
endforeach()
endforeach()

View File

@@ -2,10 +2,27 @@
#include <ATen/cpu/vec/vec.h>
#include <iostream>
namespace torch {
namespace aot_inductor {
template <typename T>
void ExpectVecEqual(
const at::vec::Vectorized<T>& expected,
const at::vec::Vectorized<T>& actual) {
using Vec = at::vec::Vectorized<T>;
// Have to use std::vector for comparison because at::vec::Vectorized doesn't
// support operator[] on aarch64
std::vector<T> expected_data(Vec::size());
std::vector<T> actual_data(Vec::size());
expected.store(expected_data.data());
actual.store(actual_data.data());
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_data[i], actual_data[i]);
}
}
TEST(TestVec, TestAdd) {
using Vec = at::vec::Vectorized<int>;
std::vector<int> a(1024, 1);
@@ -16,9 +33,7 @@ TEST(TestVec, TestAdd) {
std::vector<int> expected(1024, 3);
Vec expected_vec = Vec::loadu(expected.data());
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
ExpectVecEqual(expected_vec, actual_vec);
}
TEST(TestVec, TestMax) {
@@ -30,9 +45,7 @@ TEST(TestVec, TestMax) {
Vec actual_vec = at::vec::maximum(a_vec, b_vec);
Vec expected_vec = b_vec;
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
ExpectVecEqual(expected_vec, actual_vec);
}
TEST(TestVec, TestMin) {
@@ -44,9 +57,7 @@ TEST(TestVec, TestMin) {
Vec actual_vec = at::vec::minimum(a_vec, b_vec);
Vec expected_vec = a_vec;
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
ExpectVecEqual(expected_vec, actual_vec);
}
TEST(TestVec, TestConvert) {
@@ -58,9 +69,7 @@ TEST(TestVec, TestConvert) {
auto actual_vec = at::vec::convert<float>(a_vec);
auto expected_vec = b_vec;
for (int i = 0; i < at::vec::Vectorized<int>::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
ExpectVecEqual(expected_vec, actual_vec);
}
TEST(TestVec, TestClampMin) {
@@ -72,9 +81,7 @@ TEST(TestVec, TestClampMin) {
Vec actual_vec = at::vec::clamp_min(a_vec, min_vec);
Vec expected_vec = min_vec;
for (int i = 0; i < Vec::size(); i++) {
EXPECT_EQ(expected_vec[i], actual_vec[i]);
}
ExpectVecEqual(expected_vec, actual_vec);
}
} // namespace aot_inductor

View File

@@ -1,4 +1,3 @@
set(AOT_INDUCTOR_TEST_ROOT ${TORCH_ROOT}/test/cpp/aoti_inference)
# Build custom TorchScript op for AOTInductor
@@ -8,27 +7,12 @@ set_target_properties(aoti_custom_class PROPERTIES
if(USE_CUDA)
target_compile_definitions(aoti_custom_class PRIVATE USE_CUDA)
elseif(USE_ROCM)
target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
target_compile_definitions(aoti_custom_class PRIVATE USE_ROCM)
endif()
# Link against LibTorch
target_link_libraries(aoti_custom_class torch)
# the custom command that generates the TorchScript module
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
# This script requires the torch package to be installed.
COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
DEPENDS torch torch_python aoti_custom_class ${AOT_INDUCTOR_TEST_ROOT}/compile_model.py
)
add_custom_target(aoti_script_model ALL
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_data.pt
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cpu.pt
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/script_model_cuda.pt
)
add_dependencies(aoti_script_model aoti_custom_class)
# Build the cpp gtest binary containing the cpp-only tests.
set(INDUCTOR_TEST_SRCS
${AOT_INDUCTOR_TEST_ROOT}/test.cpp
@@ -37,23 +21,12 @@ set(INDUCTOR_TEST_SRCS
add_executable(test_aoti_inference
${TORCH_ROOT}/test/cpp/common/main.cpp
${INDUCTOR_TEST_SRCS}
data.pt
script_data.pt
script_model_cpu.pt
script_model_cuda.pt
)
add_dependencies(test_aoti_inference aoti_custom_class aoti_script_model)
add_dependencies(test_aoti_inference aoti_custom_class)
# TODO temporary until we can delete the old gtest polyfills.
target_compile_definitions(test_aoti_inference PRIVATE USE_GTEST)
# Define a custom command to generate the library
add_custom_command(
OUTPUT data.pt
COMMAND python ${AOT_INDUCTOR_TEST_ROOT}/test.py
DEPENDS ${AOT_INDUCTOR_TEST_ROOT}/test.py
)
target_link_libraries(test_aoti_inference PRIVATE
torch
gtest_main
@@ -71,6 +44,10 @@ target_compile_definitions(test_aoti_inference PRIVATE
CMAKE_CURRENT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
)
target_compile_options_if_supported(test_aoti_inference -Wno-unused-variable)
target_compile_options_if_supported(test_aoti_inference -Wno-unused-but-set-variable)
target_compile_options_if_supported(test_aoti_inference -Wno-unused-function)
if(INSTALL_TEST)
install(TARGETS test_aoti_inference DESTINATION bin)
# Install PDB files for MSVC builds
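
With the CMake custom commands gone, the gtest binary no longer depends on pre-generated .pt files; it creates them on first run (see ensureTestDataGenerated in the test.cpp change below). A hedged sketch of a manual invocation, assuming the usual layout where the C++ test binaries land in build/bin:

```bash
# Assumed binary path; CI locates it via CPP_TESTS_DIR="${BUILD_BIN_DIR}".
# On first run, the test shells out to test.py and compile_model.py to produce
# data.pt, script_data.pt and script_model_*.pt in its CMake binary directory,
# then runs the gtest suite as usual.
./build/bin/test_aoti_inference
```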

View File

@@ -2,7 +2,9 @@
#include <gtest/gtest.h>
#include <atomic>
#include <condition_variable>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <functional>
#include <mutex>
#include <queue>
@@ -28,6 +30,64 @@
namespace {
// Function to check if test data files exist and are valid
bool testDataFilesExist() {
std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
std::array<std::string, 4> required_files = {
"data.pt",
"script_data.pt",
"script_model_cpu.pt",
"script_model_cuda.pt"};
for (const auto& filename : required_files) {
std::string filepath = bindir + "/" + filename;
std::ifstream file(filepath);
if (!file.good()) {
return false;
}
}
return true;
}
// Function to ensure test data files are generated at runtime
void ensureTestDataGenerated() {
static std::once_flag generated_flag;
std::call_once(generated_flag, []() {
// Only generate if files don't exist or are placeholders
if (testDataFilesExist()) {
return;
}
std::string bindir = STRINGIZE(CMAKE_CURRENT_BINARY_DIR);
// Calculate path to source directory: build/test_aoti_inference -> build ->
// pytorch
std::string pytorch_root = bindir.substr(0, bindir.find_last_of("/"));
pytorch_root = pytorch_root.substr(0, pytorch_root.find_last_of("/"));
std::string source_dir = pytorch_root + "/test/cpp/aoti_inference";
// Generate test data files (data.pt, etc.) by running test.py directly
std::string test_script = source_dir + "/test.py";
std::string test_data_cmd = "cd " + bindir + " && python " + test_script;
std::cout << "Generating test data: " << test_data_cmd << std::endl;
int result1 = std::system(test_data_cmd.c_str());
if (result1 != 0) {
std::cerr << "Warning: Test data generation failed with code " << result1
<< std::endl;
}
// Generate model files (script_*.pt) by running compile_model.py directly
std::string compile_script = source_dir + "/compile_model.py";
std::string models_cmd = "cd " + bindir + " && python " + compile_script;
std::cout << "Generating model files: " << models_cmd << std::endl;
int result2 = std::system(models_cmd.c_str());
if (result2 != 0) {
std::cerr << "Warning: Model generation failed with code " << result2
<< std::endl;
}
});
}
const std::unordered_map<std::string, at::Tensor> derefTensorConstantMap(
torch::inductor::TensorConstantMap tensor_constant_map) {
std::unordered_map<std::string, at::Tensor> ret;
@@ -855,7 +915,6 @@ void test_aoti_free_buffer(bool use_runtime_constant_folding) {
}
}
#if defined(USE_CUDA) || defined(USE_ROCM)
void test_cuda_alloc_test() {
torch::NoGradGuard no_grad;
@@ -895,8 +954,8 @@ void test_cuda_alloc_test() {
runner->run(data_loader.attr(inputs_attr.c_str()).toTensorList().vec());
ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
}
#endif
#ifdef USE_CUDA
class ThreadPool {
private:
struct Task {
@@ -1037,86 +1096,96 @@ void test_multi_cuda_streams(const std::string& device) {
ASSERT_TRUE(torch::allclose(ref_output_tensors[0], all_outputs[i][0]));
}
}
#endif
#endif // USE_CUDA
#endif // USE_CUDA || USE_ROCM
} // namespace
namespace torch::aot_inductor {
TEST(AotInductorTest, BasicTestCpu) {
// Test fixture that ensures test data is generated once for all tests
class AotInductorTest : public ::testing::Test {
public:
// This runs once before all tests in this test suite
static void SetUpTestSuite() {
ensureTestDataGenerated();
}
};
TEST_F(AotInductorTest, BasicTestCpu) {
test_aoti("cpu", false);
}
TEST(AotInductorTest, BasicScriptTestCpu) {
TEST_F(AotInductorTest, BasicScriptTestCpu) {
test_aoti_script("cpu");
}
TEST(AotInductorTest, BasicPackageLoaderTestCpu) {
TEST_F(AotInductorTest, BasicPackageLoaderTestCpu) {
test_aoti_package_loader("cpu", false);
}
TEST(AotInductorTest, ExtractConstantsMapCpu) {
TEST_F(AotInductorTest, ExtractConstantsMapCpu) {
test_aoti_extract_constants_map("cpu");
}
#ifdef USE_CUDA
TEST(AotInductorTest, BasicTestCuda) {
TEST_F(AotInductorTest, BasicTestCuda) {
test_aoti("cuda", true);
test_aoti("cuda", false);
}
TEST(AotInductorTest, BasicScriptTestCuda) {
TEST_F(AotInductorTest, BasicScriptTestCuda) {
test_aoti_script("cuda");
}
TEST(AotInductorTest, BasicPackageLoaderTestCuda) {
TEST_F(AotInductorTest, BasicPackageLoaderTestCuda) {
test_aoti_package_loader("cuda", false);
}
TEST(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
TEST_F(AotInductorTest, BasicPackageLoaderTestMultiGpuCuda) {
test_aoti_package_loader_multi_gpu("cuda", false);
}
TEST(AotInductorTest, UpdateUserManagedConstantsCuda) {
TEST_F(AotInductorTest, UpdateUserManagedConstantsCuda) {
test_aoti_user_managed_buffer();
}
TEST(AotInductorTest, RuntimeUpdateConstantsCuda) {
TEST_F(AotInductorTest, RuntimeUpdateConstantsCuda) {
test_aoti_constants_update("cuda", true);
}
TEST(AotInductorTest, UpdateConstantsCuda) {
TEST_F(AotInductorTest, UpdateConstantsCuda) {
test_aoti_constants_update("cuda", false);
}
TEST(AotInductorTest, ExtractConstantsMapCuda) {
TEST_F(AotInductorTest, ExtractConstantsMapCuda) {
test_aoti_extract_constants_map("cuda");
}
TEST(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
TEST_F(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
test_aoti_double_buffering("cuda", true);
}
TEST(AotInductorTest, UpdateInactiveConstantsCuda) {
TEST_F(AotInductorTest, UpdateInactiveConstantsCuda) {
test_aoti_double_buffering("cuda", false);
}
TEST(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
TEST_F(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
test_aoti_double_buffering_with_tensor_constants();
}
TEST(AotInductorTest, FreeInactiveConstantBufferCuda) {
TEST_F(AotInductorTest, FreeInactiveConstantBufferCuda) {
test_aoti_free_buffer(false);
}
TEST(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
TEST_F(AotInductorTest, FreeInactiveConstantBufferRuntimeConstantFoldingCuda) {
test_aoti_free_buffer(true);
}
TEST(AotInductorTest, MultiStreamTestCuda) {
TEST_F(AotInductorTest, MultiStreamTestCuda) {
test_multi_cuda_streams("cuda");
}
TEST(AotInductorTest, CudaAllocTestCuda) {
TEST_F(AotInductorTest, CudaAllocTestCuda) {
test_cuda_alloc_test();
}
#endif