Revert "[AOTI] Add a boxed_run API (#142213)"

This reverts commit 868984c3e3.

Reverted https://github.com/pytorch/pytorch/pull/142213 on behalf of https://github.com/kit1980 due to breaking lots of internal builds, see D68036023 ([comment](https://github.com/pytorch/pytorch/pull/142213#issuecomment-2588378262))
PyTorch MergeBot 2025-01-13 22:43:47 +00:00
parent a54a784b82
commit 4f74864c94
14 changed files with 46 additions and 150 deletions
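
For context, a rough sketch of the calling-convention difference being reverted, pieced together from the diffs below; the package path and tensor shapes are placeholders, not taken from the PR:

```python
import torch

# Placeholder: any model packaged with torch._inductor.aoti_compile_and_package.
compiled = torch._inductor.aoti_load_package("model.pt2")
loader = compiled.loader

inputs = [torch.rand(5, 5), torch.rand(5, 5)]

# run() borrows the inputs; the caller's list still holds its tensors afterwards.
outputs = loader.run(inputs)
assert len(inputs) == 2

# The reverted boxed_run() stole ownership: it consumed the tensors and cleared
# the passed-in list, so input buffers could be freed while the model was still
# running, as soon as they were no longer needed.
# outputs = loader.boxed_run(inputs)   # afterwards, len(inputs) == 0
```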

View File

@@ -12,7 +12,7 @@ functorch_maml_omniglot,inductor,float32,dynamic,cpp,1.126799
yolov3,export-aot-inductor,float32,static,default,1.40687424
mobilenet_v2,export-aot-inductor,float32,static,default,2.90375357
resnext50_32x4d,export-aot-inductor,float32,dynamic,default,1.49299689
hf_Albert,export-aot-inductor,float32,dynamic,default,1.261471
hf_Albert,export-aot-inductor,float32,dynamic,default,1.33293645
resnext50_32x4d,inductor,amp,static,default,1.47023111
vgg16,inductor,amp,static,default,1.2692454
hf_Longformer,inductor,amp,dynamic,default,1.22015225

View File

@@ -22,7 +22,6 @@ from torch.testing._internal.common_utils import (
)
from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
from torch.testing._internal.triton_utils import HAS_CUDA
from torch.utils._python_dispatch import TorchDispatchMode
if IS_WINDOWS and IS_CI:
@@ -269,46 +268,6 @@ class AOTInductorTestsTemplate:
with self.assertRaisesRegex(RuntimeError, "Expected extern kernel"):
self.check_model(m, args)
def test_boxed_run_inputs_clearing(self):
# Borrowed from test_torchinductor
class Model(torch.nn.Module):
def forward(self, x, y):
return torch.ops.aoti_custom_ops.custom_add(x, y)
inps = [
torch.rand(5, 5, device=self.device),
torch.rand(5, 5, device=self.device),
]
model = Model().to(device=self.device)
# NOTE: There are additional references to inps if we use
# strict=True here, which would keep inps from being deallocated
# in time later in this test.
ep = torch.export.export(model, tuple(inps), strict=False)
package = torch._inductor.aoti_compile_and_package(ep)
fn_compiled = torch._inductor.aoti_load_package(package)
test_self = self
sentinel_seen = False
class TestRefMode(TorchDispatchMode):
def __torch_dispatch__(self, func, types, args=(), kwargs=None):
kwargs = kwargs if kwargs else {}
nonlocal inps
nonlocal test_self
nonlocal sentinel_seen
if func is torch.ops.aoti_custom_ops.custom_add.default:
# inputs should be deallocated by this point
sentinel_seen = True
test_self.assertEqual(len(inps), 0)
return func(*args, **kwargs)
with TestRefMode():
fn_compiled.loader.boxed_run(inps)
self.assertEqual(len(inps), 0)
self.assertTrue(sentinel_seen)
class AOTInductorLoggingTest(LoggingTestCase):
@make_logging_test(dynamic=logging.DEBUG)

View File

@@ -241,7 +241,7 @@ class AOTICompiledModel:
out_spec = pytree.treespec_loads(call_spec[1])
flat_inputs = pytree.tree_flatten((args, reorder_kwargs(kwargs, in_spec)))[0]
flat_inputs = [x for x in flat_inputs if isinstance(x, torch.Tensor)]
flat_outputs = self.loader.boxed_run(flat_inputs) # type: ignore[attr-defined]
flat_outputs = self.loader.run(flat_inputs) # type: ignore[attr-defined]
return pytree.tree_unflatten(flat_outputs, out_spec)
def get_metadata(self) -> Dict[str, str]:
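
The hunk above switches AOTICompiledModel.__call__ back to loader.run. End to end, a packaged model is typically exercised roughly as follows, adapted from the deleted test; the module and shapes are illustrative only:

```python
import torch

class Add(torch.nn.Module):
    def forward(self, x, y):
        return x + y

inps = (torch.rand(5, 5), torch.rand(5, 5))
ep = torch.export.export(Add(), inps)
package = torch._inductor.aoti_compile_and_package(ep)

compiled = torch._inductor.aoti_load_package(package)
# __call__ flattens (args, kwargs) with pytree, keeps only the tensors,
# passes them to self.loader.run(...), and unflattens the result.
out = compiled(*inps)
```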

View File

@@ -464,12 +464,6 @@ std::vector<at::Tensor> AOTIModelPackageLoader::run(
return runner_->run(inputs, stream_handle);
}
std::vector<at::Tensor> AOTIModelPackageLoader::boxed_run(
std::vector<at::Tensor>&& inputs,
void* stream_handle) {
return runner_->boxed_run(std::move(inputs), stream_handle);
}
std::unordered_map<std::string, std::string> AOTIModelPackageLoader::
get_metadata() {
return metadata_;

View File

@@ -15,16 +15,9 @@ class TORCH_API AOTIModelPackageLoader {
AOTIModelContainerRunner* get_runner();
std::unordered_map<std::string, std::string> get_metadata();
std::vector<at::Tensor> run(
const std::vector<at::Tensor>& inputs,
void* stream_handle = nullptr);
// boxed_run will steal the ownership of the input tensors
std::vector<at::Tensor> boxed_run(
std::vector<at::Tensor>&& inputs,
void* stream_handle = nullptr);
std::vector<std::string> get_call_spec();
void load_constants(
std::unordered_map<std::string, at::Tensor>& constants_map,
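
A rough C++ sketch of the two entry points declared above, illustrating the ownership difference noted by the "boxed_run will steal the ownership" comment; the include path and package path are assumptions, not taken from the PR:

```cpp
#include <torch/csrc/inductor/aoti_package/model_package_loader.h>  // assumed header path
#include <torch/torch.h>

using torch::inductor::AOTIModelPackageLoader;

void example() {
  // Placeholder package path, produced by torch._inductor.aoti_compile_and_package.
  AOTIModelPackageLoader loader("/tmp/model.pt2");

  std::vector<at::Tensor> inputs = {torch::rand({5, 5}), torch::rand({5, 5})};

  // run() takes the inputs by const reference; the caller keeps its tensors.
  std::vector<at::Tensor> outputs = loader.run(inputs);

  // The removed boxed_run() took an rvalue vector and stole ownership, so the
  // runtime could release each input's storage as soon as it was done with it:
  //   auto boxed_outputs = loader.boxed_run(std::move(inputs));
}
```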

View File

@@ -5,64 +5,26 @@
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#endif
#include <torch/csrc/autograd/python_variable.h>
#include <torch/csrc/inductor/aoti_runner/pybind.h>
#include <torch/csrc/utils/pybind.h>
namespace torch::inductor {
class AOTIModelPackageLoaderPybind : public AOTIModelPackageLoader {
public:
AOTIModelPackageLoaderPybind(const std::string& model_package_path)
: AOTIModelPackageLoader(model_package_path) {}
AOTIModelPackageLoaderPybind(
const std::string& model_package_path,
const std::string& model_name)
: AOTIModelPackageLoader(model_package_path, model_name) {}
py::list boxed_run(py::list& inputs, void* stream_handle = nullptr) {
std::vector<at::Tensor> input_tensors;
input_tensors.reserve(inputs.size());
for (auto& item : inputs) {
input_tensors.emplace_back(py::cast<at::Tensor>(item));
}
// Explicitly clear the passed-in Python list
inputs.clear();
std::vector<at::Tensor> result_tensors = AOTIModelPackageLoader::boxed_run(
std::move(input_tensors), stream_handle);
py::list outputs;
for (const auto& tensor : result_tensors) {
outputs.append(THPVariable_Wrap(tensor));
}
return outputs;
}
};
void initAOTIPackageBindings(PyObject* module) {
auto rootModule = py::handle(module).cast<py::module>();
auto m = rootModule.def_submodule("_aoti");
py::class_<AOTIModelPackageLoaderPybind>(m, "AOTIModelPackageLoader")
py::class_<AOTIModelPackageLoader>(m, "AOTIModelPackageLoader")
.def(py::init<const std::string&, const std::string&>())
.def(py::init<const std::string&>())
.def("get_metadata", &AOTIModelPackageLoaderPybind::get_metadata)
.def("get_metadata", &AOTIModelPackageLoader::get_metadata)
.def(
"run",
&AOTIModelPackageLoaderPybind::run,
&AOTIModelPackageLoader::run,
py::arg("inputs"),
py::arg("stream_handle") = nullptr)
.def(
"boxed_run",
&AOTIModelPackageLoaderPybind::boxed_run,
py::arg("inputs"),
py::arg("stream_handle") = nullptr)
.def("get_call_spec", &AOTIModelPackageLoaderPybind::get_call_spec)
.def("load_constants", &AOTIModelPackageLoaderPybind::load_constants)
.def(
"get_constant_fqns",
&AOTIModelPackageLoaderPybind::get_constant_fqns);
.def("get_call_spec", &AOTIModelPackageLoader::get_call_spec)
.def("load_constants", &AOTIModelPackageLoader::load_constants)
.def("get_constant_fqns", &AOTIModelPackageLoader::get_constant_fqns);
}
} // namespace torch::inductor

View File

@@ -91,9 +91,12 @@ AOTIModelContainerRunner::~AOTIModelContainerRunner() {
result == AOTI_RUNTIME_SUCCESS, "AOTInductorModelContainerDelete failed");
}
std::vector<at::Tensor> AOTIModelContainerRunner::run_impl(
std::vector<AtenTensorHandle>& input_handles,
std::vector<at::Tensor> AOTIModelContainerRunner::run(
const std::vector<at::Tensor>& inputs,
void* stream_handle) {
auto input_handles =
torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(inputs);
// For outputs, we only allocate a vector to hold the returned tensor handles;
// the actual output tensor storage is not allocated here
size_t num_outputs = 0;
@@ -114,23 +117,6 @@ std::vector<at::Tensor> AOTIModelContainerRunner::run_impl(
output_handles.data(), output_handles.size());
}
std::vector<at::Tensor> AOTIModelContainerRunner::run(
const std::vector<at::Tensor>& inputs,
void* stream_handle) {
std::vector<AtenTensorHandle> input_handles =
torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(inputs);
return run_impl(input_handles, stream_handle);
}
std::vector<at::Tensor> AOTIModelContainerRunner::boxed_run(
std::vector<at::Tensor>&& inputs,
void* stream_handle) {
std::vector<AtenTensorHandle> input_handles =
torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(inputs);
std::move(inputs).clear();
return run_impl(input_handles, stream_handle);
}
std::unordered_map<std::string, std::string> AOTIModelContainerRunner::
getConstantNamesToOriginalFQNs() const {
std::unordered_map<std::string, std::string> result;
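
The removed boxed_run above wraps the tensors in handles and then clears the moved-in vector, so the only remaining owners are the handles handed to the container. A stripped-down illustration of that ownership handoff (not the actual AOTI internals):

```cpp
#include <torch/torch.h>
#include <vector>

// Takes ownership of the inputs, mirroring the removed boxed_run overload.
void steal_inputs(std::vector<at::Tensor>&& inputs) {
  // Keep our own references (stand-in for unsafe_alloc_new_handles_from_tensors).
  std::vector<at::Tensor> handles = inputs;
  // Drop the caller-visible references, analogous to `std::move(inputs).clear();`
  // above; the deleted Python test observes the same effect via len(inps) == 0.
  inputs.clear();
  // ... run the model with `handles`; each tensor's storage is freed as soon
  // as the runtime drops the corresponding handle ...
}

void caller() {
  std::vector<at::Tensor> inps = {torch::rand({5, 5}), torch::rand({5, 5})};
  steal_inputs(std::move(inps));
  // inps is empty here: the callee cleared it after taking its own references.
}
```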

View File

@@ -24,19 +24,13 @@ class TORCH_API AOTIModelContainerRunner {
delete;
virtual ~AOTIModelContainerRunner();
std::vector<at::Tensor> run(
virtual std::vector<at::Tensor> run(
const std::vector<at::Tensor>& inputs,
void* stream_handle = nullptr);
// boxed_run will steal the ownership of the input tensors
std::vector<at::Tensor> boxed_run(
std::vector<at::Tensor>&& inputs,
void* stream_handle = nullptr);
std::unordered_map<std::string, std::string> getConstantNamesToOriginalFQNs()
const;
std::unordered_map<std::string, int32_t> getConstantNamesToDtypes() const;
void update_inactive_constant_buffer(const TensorConstantMap& const_map);
void update_constant_buffer(
std::unordered_map<std::string, at::Tensor>& tensor_map,
@@ -60,10 +54,6 @@ class TORCH_API AOTIModelContainerRunner {
const std::string& device_str,
const std::string& cubin_dir);
virtual std::vector<at::Tensor> run_impl(
std::vector<AtenTensorHandle>& input_handles,
void* stream_handle);
std::unique_ptr<at::DynamicLibrary> model_so_;
decltype(&AOTInductorModelContainerCreateWithDevice) create_func_{nullptr};
decltype(&AOTInductorModelContainerDelete) delete_func_{nullptr};

View File

@@ -12,6 +12,12 @@ AOTIModelContainerRunnerCpu::AOTIModelContainerRunnerCpu(
AOTIModelContainerRunnerCpu::~AOTIModelContainerRunnerCpu() = default;
std::vector<at::Tensor> AOTIModelContainerRunnerCpu::run(
const std::vector<at::Tensor>& inputs,
void* stream_handle) {
return AOTIModelContainerRunner::run(inputs, stream_handle);
}
namespace {
std::unique_ptr<AOTIModelContainerRunner> create_aoti_runner_cpu(
const std::string& model_so_path,

View File

@@ -11,6 +11,10 @@ class TORCH_API AOTIModelContainerRunnerCpu : public AOTIModelContainerRunner {
size_t num_models = 1);
~AOTIModelContainerRunnerCpu() override;
std::vector<at::Tensor> run(
const std::vector<at::Tensor>& inputs,
void* stream_handle = nullptr) override;
};
} // namespace torch::inductor

View File

@@ -16,20 +16,21 @@ AOTIModelContainerRunnerCuda::AOTIModelContainerRunnerCuda(
AOTIModelContainerRunnerCuda::~AOTIModelContainerRunnerCuda() = default;
std::vector<at::Tensor> AOTIModelContainerRunnerCuda::run_impl(
std::vector<AtenTensorHandle>& input_handles,
std::vector<at::Tensor> AOTIModelContainerRunnerCuda::run(
const std::vector<at::Tensor>& inputs,
void* stream_handle) {
if (stream_handle == nullptr) {
at::cuda::CUDAStream cuda_stream = c10::cuda::getCurrentCUDAStream();
stream_handle = reinterpret_cast<void*>(cuda_stream.stream());
}
return AOTIModelContainerRunner::run_impl(input_handles, stream_handle);
return AOTIModelContainerRunner::run(inputs, stream_handle);
}
std::vector<at::Tensor> AOTIModelContainerRunnerCuda::run_with_cuda_stream(
const std::vector<at::Tensor>& inputs,
const at::cuda::CUDAStream& cuda_stream) {
return run(inputs, reinterpret_cast<void*>(cuda_stream.stream()));
at::cuda::CUDAStream cuda_stream) {
return AOTIModelContainerRunner::run(
inputs, reinterpret_cast<void*>(cuda_stream.stream()));
}
namespace {
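
For reference, a small sketch of how a caller might drive the CUDA runner shown above, either relying on the current-stream fallback or passing a stream explicitly; the shared-library path and constructor defaults are assumptions:

```cpp
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/torch.h>

using torch::inductor::AOTIModelContainerRunnerCuda;

void example() {
  // Placeholder path to an AOTInductor-compiled shared library.
  AOTIModelContainerRunnerCuda runner("/tmp/model.so");

  std::vector<at::Tensor> inputs = {
      torch::rand({5, 5}, torch::kCUDA), torch::rand({5, 5}, torch::kCUDA)};

  // With no stream_handle, run() falls back to the current CUDA stream.
  auto out1 = runner.run(inputs);

  // Or hand the runner a specific stream explicitly.
  at::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream();
  auto out2 = runner.run_with_cuda_stream(inputs, stream);
}
```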

View File

@@ -21,13 +21,13 @@ class TORCH_CUDA_CPP_API AOTIModelContainerRunnerCuda
~AOTIModelContainerRunnerCuda() override;
std::vector<at::Tensor> run_impl(
std::vector<AtenTensorHandle>& input_handles,
void* stream_handle) override;
std::vector<at::Tensor> run(
const std::vector<at::Tensor>& inputs,
void* stream_handle = nullptr) override;
std::vector<at::Tensor> run_with_cuda_stream(
const std::vector<at::Tensor>& inputs,
const at::cuda::CUDAStream& cuda_stream);
at::cuda::CUDAStream cuda_stream);
};
} // namespace torch::inductor

View File

@@ -16,20 +16,21 @@ AOTIModelContainerRunnerXpu::AOTIModelContainerRunnerXpu(
AOTIModelContainerRunnerXpu::~AOTIModelContainerRunnerXpu() = default;
std::vector<at::Tensor> AOTIModelContainerRunnerXpu::run_impl(
std::vector<AtenTensorHandle>& input_handles,
std::vector<at::Tensor> AOTIModelContainerRunnerXpu::run(
const std::vector<at::Tensor>& inputs,
void* stream_handle) {
if (stream_handle == nullptr) {
at::xpu::XPUStream xpu_stream = c10::xpu::getCurrentXPUStream();
stream_handle = reinterpret_cast<void*>(&(xpu_stream.queue()));
}
return AOTIModelContainerRunner::run_impl(input_handles, stream_handle);
return AOTIModelContainerRunner::run(inputs, stream_handle);
}
std::vector<at::Tensor> AOTIModelContainerRunnerXpu::run_with_xpu_stream(
const std::vector<at::Tensor>& inputs,
const at::xpu::XPUStream& xpu_stream) {
return run(inputs, reinterpret_cast<void*>(&(xpu_stream.queue())));
std::vector<at::Tensor>& inputs,
at::xpu::XPUStream xpu_stream) {
return AOTIModelContainerRunner::run(
inputs, reinterpret_cast<void*>(&(xpu_stream.queue())));
}
namespace {

View File

@@ -23,13 +23,13 @@ class C10_EXPORT AOTIModelContainerRunnerXpu : public AOTIModelContainerRunner {
~AOTIModelContainerRunnerXpu() override;
std::vector<at::Tensor> run_impl(
std::vector<AtenTensorHandle>& input_handles,
void* stream_handle) override;
std::vector<at::Tensor> run(
const std::vector<at::Tensor>& inputs,
void* stream_handle = nullptr) override;
std::vector<at::Tensor> run_with_xpu_stream(
const std::vector<at::Tensor>& inputs,
const at::xpu::XPUStream& xpu_stream);
std::vector<at::Tensor>& inputs,
at::xpu::XPUStream xpu_stream);
};
} // namespace torch::inductor