Revert "[AOTI] Add a boxed_run API (#142213)"
This reverts commit 868984c3e3.
Reverted https://github.com/pytorch/pytorch/pull/142213 on behalf of https://github.com/kit1980 due to breaking lots of internal builds, see D68036023 ([comment](https://github.com/pytorch/pytorch/pull/142213#issuecomment-2588378262))
Parent: a54a784b82
Commit: 4f74864c94
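For context, the reverted API let a caller hand input tensors over to the AOT Inductor runtime so they could be freed as soon as the compiled model consumed them, rather than surviving until the call returned. A minimal Python sketch of the difference, based only on the test and bindings in the diff below (the toy model and shapes are illustrative):

import torch

class M(torch.nn.Module):
    def forward(self, x, y):
        return x + y

inps = [torch.rand(5, 5), torch.rand(5, 5)]
ep = torch.export.export(M(), tuple(inps), strict=False)
pkg = torch._inductor.aoti_compile_and_package(ep)
compiled = torch._inductor.aoti_load_package(pkg)

# run() borrows the inputs; the caller's list is left intact.
outs = compiled.loader.run(inps)

# boxed_run(), the API removed by this revert, stole ownership instead:
# it emptied `inps` so the input tensors could be deallocated early.
# outs = compiled.loader.boxed_run(inps); assert len(inps) == 0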
@@ -12,7 +12,7 @@ functorch_maml_omniglot,inductor,float32,dynamic,cpp,1.126799
 yolov3,export-aot-inductor,float32,static,default,1.40687424
 mobilenet_v2,export-aot-inductor,float32,static,default,2.90375357
 resnext50_32x4d,export-aot-inductor,float32,dynamic,default,1.49299689
-hf_Albert,export-aot-inductor,float32,dynamic,default,1.261471
+hf_Albert,export-aot-inductor,float32,dynamic,default,1.33293645
 resnext50_32x4d,inductor,amp,static,default,1.47023111
 vgg16,inductor,amp,static,default,1.2692454
 hf_Longformer,inductor,amp,dynamic,default,1.22015225
@@ -22,7 +22,6 @@ from torch.testing._internal.common_utils import (
 )
 from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
 from torch.testing._internal.triton_utils import HAS_CUDA
-from torch.utils._python_dispatch import TorchDispatchMode
 
 
 if IS_WINDOWS and IS_CI:
@@ -269,46 +268,6 @@ class AOTInductorTestsTemplate:
         with self.assertRaisesRegex(RuntimeError, "Expected extern kernel"):
             self.check_model(m, args)
 
-    def test_boxed_run_inputs_clearing(self):
-        # Borrowed from test_torchinductor
-        class Model(torch.nn.Module):
-            def forward(self, x, y):
-                return torch.ops.aoti_custom_ops.custom_add(x, y)
-
-        inps = [
-            torch.rand(5, 5, device=self.device),
-            torch.rand(5, 5, device=self.device),
-        ]
-        model = Model().to(device=self.device)
-        # NOTE: There are additional references to inps if we use
-        # strict=True here, which will cause inps not deallocated
-        # in time later in this test.
-        ep = torch.export.export(model, tuple(inps), strict=False)
-        package = torch._inductor.aoti_compile_and_package(ep)
-        fn_compiled = torch._inductor.aoti_load_package(package)
-
-        test_self = self
-        sentinel_seen = False
-
-        class TestRefMode(TorchDispatchMode):
-            def __torch_dispatch__(self, func, types, args=(), kwargs=None):
-                kwargs = kwargs if kwargs else {}
-                nonlocal inps
-                nonlocal test_self
-                nonlocal sentinel_seen
-                if func is torch.ops.aoti_custom_ops.custom_add.default:
-                    # inputs should be deallocated by this point
-                    sentinel_seen = True
-                    test_self.assertEqual(len(inps), 0)
-
-                return func(*args, **kwargs)
-
-        with TestRefMode():
-            fn_compiled.loader.boxed_run(inps)
-
-        self.assertEqual(len(inps), 0)
-        self.assertTrue(sentinel_seen)
-
 
 class AOTInductorLoggingTest(LoggingTestCase):
     @make_logging_test(dynamic=logging.DEBUG)
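The removed test relied on TorchDispatchMode to intercept the custom op at dispatch time and assert that the input list had already been emptied. A standalone sketch of that interception mechanism, independent of this commit:

import torch
from torch.utils._python_dispatch import TorchDispatchMode

class LogMode(TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        print(func)  # every ATen op dispatched under this mode lands here
        return func(*args, **kwargs)

with LogMode():
    torch.rand(2) + torch.rand(2)  # prints the aten ops hit by these calls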
@@ -241,7 +241,7 @@ class AOTICompiledModel:
         out_spec = pytree.treespec_loads(call_spec[1])
         flat_inputs = pytree.tree_flatten((args, reorder_kwargs(kwargs, in_spec)))[0]
         flat_inputs = [x for x in flat_inputs if isinstance(x, torch.Tensor)]
-        flat_outputs = self.loader.boxed_run(flat_inputs)  # type: ignore[attr-defined]
+        flat_outputs = self.loader.run(flat_inputs)  # type: ignore[attr-defined]
         return pytree.tree_unflatten(flat_outputs, out_spec)
 
     def get_metadata(self) -> Dict[str, str]:
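The surrounding call path round-trips arbitrarily nested inputs and outputs through pytree: flatten the (args, kwargs) structure to a flat list of leaves plus a spec, feed the tensors to the loader, then unflatten the outputs. A small sketch of that round trip:

import torch
import torch.utils._pytree as pytree

args = (torch.ones(2), {"y": torch.zeros(3)})
leaves, spec = pytree.tree_flatten(args)       # flat leaf list plus a TreeSpec
rebuilt = pytree.tree_unflatten(leaves, spec)  # original nesting restored
assert torch.equal(rebuilt[1]["y"], args[1]["y"])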
@@ -464,12 +464,6 @@ std::vector<at::Tensor> AOTIModelPackageLoader::run(
   return runner_->run(inputs, stream_handle);
 }
 
-std::vector<at::Tensor> AOTIModelPackageLoader::boxed_run(
-    std::vector<at::Tensor>&& inputs,
-    void* stream_handle) {
-  return runner_->boxed_run(std::move(inputs), stream_handle);
-}
-
 std::unordered_map<std::string, std::string> AOTIModelPackageLoader::
     get_metadata() {
   return metadata_;
@@ -15,16 +15,9 @@ class TORCH_API AOTIModelPackageLoader {
 
   AOTIModelContainerRunner* get_runner();
   std::unordered_map<std::string, std::string> get_metadata();
-
   std::vector<at::Tensor> run(
       const std::vector<at::Tensor>& inputs,
       void* stream_handle = nullptr);
-
-  // boxed_run will steal the ownership of the input tensors
-  std::vector<at::Tensor> boxed_run(
-      std::vector<at::Tensor>&& inputs,
-      void* stream_handle = nullptr);
-
   std::vector<std::string> get_call_spec();
   void load_constants(
       std::unordered_map<std::string, at::Tensor>& constants_map,
@@ -5,64 +5,26 @@
 #include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
 #endif
 
-#include <torch/csrc/autograd/python_variable.h>
 #include <torch/csrc/inductor/aoti_runner/pybind.h>
 #include <torch/csrc/utils/pybind.h>
 
 namespace torch::inductor {
 
-class AOTIModelPackageLoaderPybind : public AOTIModelPackageLoader {
- public:
-  AOTIModelPackageLoaderPybind(const std::string& model_package_path)
-      : AOTIModelPackageLoader(model_package_path) {}
-
-  AOTIModelPackageLoaderPybind(
-      const std::string& model_package_path,
-      const std::string& model_name)
-      : AOTIModelPackageLoader(model_package_path, model_name) {}
-
-  py::list boxed_run(py::list& inputs, void* stream_handle = nullptr) {
-    std::vector<at::Tensor> input_tensors;
-    input_tensors.reserve(inputs.size());
-    for (auto& item : inputs) {
-      input_tensors.emplace_back(py::cast<at::Tensor>(item));
-    }
-    // Explicitly clear the passed-in Python list
-    inputs.clear();
-
-    std::vector<at::Tensor> result_tensors = AOTIModelPackageLoader::boxed_run(
-        std::move(input_tensors), stream_handle);
-
-    py::list outputs;
-    for (const auto& tensor : result_tensors) {
-      outputs.append(THPVariable_Wrap(tensor));
-    }
-    return outputs;
-  }
-};
-
 void initAOTIPackageBindings(PyObject* module) {
   auto rootModule = py::handle(module).cast<py::module>();
   auto m = rootModule.def_submodule("_aoti");
 
-  py::class_<AOTIModelPackageLoaderPybind>(m, "AOTIModelPackageLoader")
+  py::class_<AOTIModelPackageLoader>(m, "AOTIModelPackageLoader")
       .def(py::init<const std::string&, const std::string&>())
       .def(py::init<const std::string&>())
-      .def("get_metadata", &AOTIModelPackageLoaderPybind::get_metadata)
+      .def("get_metadata", &AOTIModelPackageLoader::get_metadata)
       .def(
           "run",
-          &AOTIModelPackageLoaderPybind::run,
+          &AOTIModelPackageLoader::run,
          py::arg("inputs"),
          py::arg("stream_handle") = nullptr)
-      .def(
-          "boxed_run",
-          &AOTIModelPackageLoaderPybind::boxed_run,
-          py::arg("inputs"),
-          py::arg("stream_handle") = nullptr)
-      .def("get_call_spec", &AOTIModelPackageLoaderPybind::get_call_spec)
-      .def("load_constants", &AOTIModelPackageLoaderPybind::load_constants)
-      .def(
-          "get_constant_fqns",
-          &AOTIModelPackageLoaderPybind::get_constant_fqns);
+      .def("get_call_spec", &AOTIModelPackageLoader::get_call_spec)
+      .def("load_constants", &AOTIModelPackageLoader::load_constants)
+      .def("get_constant_fqns", &AOTIModelPackageLoader::get_constant_fqns);
 }
 } // namespace torch::inductor
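The deleted pybind wrapper cast each list element to at::Tensor and then cleared the caller's py::list, dropping the Python-side references so each input could be freed as soon as the native runner was done with it. The caller-visible effect is plain reference counting, as this standalone sketch shows:

import torch
import weakref

t = torch.rand(5, 5)
ref = weakref.ref(t)
inps = [t]
del t          # the list now holds the only strong reference
inps.clear()   # dropping it frees the tensor immediately
assert ref() is None  # this early release is what boxed_run's clearing enabled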
@@ -91,9 +91,12 @@ AOTIModelContainerRunner::~AOTIModelContainerRunner() {
       result == AOTI_RUNTIME_SUCCESS, "AOTInductorModelContainerDelete failed");
 }
 
-std::vector<at::Tensor> AOTIModelContainerRunner::run_impl(
-    std::vector<AtenTensorHandle>& input_handles,
+std::vector<at::Tensor> AOTIModelContainerRunner::run(
+    const std::vector<at::Tensor>& inputs,
     void* stream_handle) {
+  auto input_handles =
+      torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(inputs);
+
   // For outputs, we only allocate a vector to hold returned tensor handles,
   // not allocating the actual output tensor storage here
   size_t num_outputs = 0;
@@ -114,23 +117,6 @@ std::vector<at::Tensor> AOTIModelContainerRunner::run_impl(
       output_handles.data(), output_handles.size());
 }
 
-std::vector<at::Tensor> AOTIModelContainerRunner::run(
-    const std::vector<at::Tensor>& inputs,
-    void* stream_handle) {
-  std::vector<AtenTensorHandle> input_handles =
-      torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(inputs);
-  return run_impl(input_handles, stream_handle);
-}
-
-std::vector<at::Tensor> AOTIModelContainerRunner::boxed_run(
-    std::vector<at::Tensor>&& inputs,
-    void* stream_handle) {
-  std::vector<AtenTensorHandle> input_handles =
-      torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(inputs);
-  std::move(inputs).clear();
-  return run_impl(input_handles, stream_handle);
-}
-
 std::unordered_map<std::string, std::string> AOTIModelContainerRunner::
     getConstantNamesToOriginalFQNs() const {
   std::unordered_map<std::string, std::string> result;
@@ -24,19 +24,13 @@ class TORCH_API AOTIModelContainerRunner {
       delete;
   virtual ~AOTIModelContainerRunner();
 
-  std::vector<at::Tensor> run(
+  virtual std::vector<at::Tensor> run(
       const std::vector<at::Tensor>& inputs,
       void* stream_handle = nullptr);
-
-  // boxed_run will steal the ownership of the input tensors
-  std::vector<at::Tensor> boxed_run(
-      std::vector<at::Tensor>&& inputs,
-      void* stream_handle = nullptr);
-
   std::unordered_map<std::string, std::string> getConstantNamesToOriginalFQNs()
       const;
   std::unordered_map<std::string, int32_t> getConstantNamesToDtypes() const;
 
   void update_inactive_constant_buffer(const TensorConstantMap& const_map);
   void update_constant_buffer(
       std::unordered_map<std::string, at::Tensor>& tensor_map,
@@ -60,10 +54,6 @@ class TORCH_API AOTIModelContainerRunner {
       const std::string& device_str,
       const std::string& cubin_dir);
 
-  virtual std::vector<at::Tensor> run_impl(
-      std::vector<AtenTensorHandle>& input_handles,
-      void* stream_handle);
-
   std::unique_ptr<at::DynamicLibrary> model_so_;
   decltype(&AOTInductorModelContainerCreateWithDevice) create_func_{nullptr};
   decltype(&AOTInductorModelContainerDelete) delete_func_{nullptr};
@@ -12,6 +12,12 @@ AOTIModelContainerRunnerCpu::AOTIModelContainerRunnerCpu(
 
 AOTIModelContainerRunnerCpu::~AOTIModelContainerRunnerCpu() = default;
 
+std::vector<at::Tensor> AOTIModelContainerRunnerCpu::run(
+    const std::vector<at::Tensor>& inputs,
+    void* stream_handle) {
+  return AOTIModelContainerRunner::run(inputs, stream_handle);
+}
+
 namespace {
 std::unique_ptr<AOTIModelContainerRunner> create_aoti_runner_cpu(
     const std::string& model_so_path,
@@ -11,6 +11,10 @@ class TORCH_API AOTIModelContainerRunnerCpu : public AOTIModelContainerRunner {
       size_t num_models = 1);
 
   ~AOTIModelContainerRunnerCpu() override;
+
+  std::vector<at::Tensor> run(
+      const std::vector<at::Tensor>& inputs,
+      void* stream_handle = nullptr) override;
 };
 
 } // namespace torch::inductor
@@ -16,20 +16,21 @@ AOTIModelContainerRunnerCuda::AOTIModelContainerRunnerCuda(
 
 AOTIModelContainerRunnerCuda::~AOTIModelContainerRunnerCuda() = default;
 
-std::vector<at::Tensor> AOTIModelContainerRunnerCuda::run_impl(
-    std::vector<AtenTensorHandle>& input_handles,
+std::vector<at::Tensor> AOTIModelContainerRunnerCuda::run(
+    const std::vector<at::Tensor>& inputs,
     void* stream_handle) {
   if (stream_handle == nullptr) {
     at::cuda::CUDAStream cuda_stream = c10::cuda::getCurrentCUDAStream();
     stream_handle = reinterpret_cast<void*>(cuda_stream.stream());
   }
-  return AOTIModelContainerRunner::run_impl(input_handles, stream_handle);
+  return AOTIModelContainerRunner::run(inputs, stream_handle);
 }
 
 std::vector<at::Tensor> AOTIModelContainerRunnerCuda::run_with_cuda_stream(
     const std::vector<at::Tensor>& inputs,
-    const at::cuda::CUDAStream& cuda_stream) {
-  return run(inputs, reinterpret_cast<void*>(cuda_stream.stream()));
+    at::cuda::CUDAStream cuda_stream) {
+  return AOTIModelContainerRunner::run(
+      inputs, reinterpret_cast<void*>(cuda_stream.stream()));
 }
 
 namespace {
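Note that both the removed run_impl and the restored run fall back to the caller's current CUDA stream when stream_handle is null. A hypothetical Python-side consequence, reusing compiled and inps from the first sketch (requires a CUDA build and device tensors):

import torch

s = torch.cuda.Stream()
with torch.cuda.stream(s):
    # With no explicit stream handle, the runner picks up `s`, the stream
    # current at call time (see the nullptr check above).
    outs = compiled.loader.run(inps)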
@@ -21,13 +21,13 @@ class TORCH_CUDA_CPP_API AOTIModelContainerRunnerCuda
 
   ~AOTIModelContainerRunnerCuda() override;
 
-  std::vector<at::Tensor> run_impl(
-      std::vector<AtenTensorHandle>& input_handles,
-      void* stream_handle) override;
+  std::vector<at::Tensor> run(
+      const std::vector<at::Tensor>& inputs,
+      void* stream_handle = nullptr) override;
 
   std::vector<at::Tensor> run_with_cuda_stream(
       const std::vector<at::Tensor>& inputs,
-      const at::cuda::CUDAStream& cuda_stream);
+      at::cuda::CUDAStream cuda_stream);
 };
 
 } // namespace torch::inductor
@@ -16,20 +16,21 @@ AOTIModelContainerRunnerXpu::AOTIModelContainerRunnerXpu(
 
 AOTIModelContainerRunnerXpu::~AOTIModelContainerRunnerXpu() = default;
 
-std::vector<at::Tensor> AOTIModelContainerRunnerXpu::run_impl(
-    std::vector<AtenTensorHandle>& input_handles,
+std::vector<at::Tensor> AOTIModelContainerRunnerXpu::run(
+    const std::vector<at::Tensor>& inputs,
     void* stream_handle) {
   if (stream_handle == nullptr) {
     at::xpu::XPUStream xpu_stream = c10::xpu::getCurrentXPUStream();
     stream_handle = reinterpret_cast<void*>(&(xpu_stream.queue()));
   }
-  return AOTIModelContainerRunner::run_impl(input_handles, stream_handle);
+  return AOTIModelContainerRunner::run(inputs, stream_handle);
 }
 
 std::vector<at::Tensor> AOTIModelContainerRunnerXpu::run_with_xpu_stream(
-    const std::vector<at::Tensor>& inputs,
-    const at::xpu::XPUStream& xpu_stream) {
-  return run(inputs, reinterpret_cast<void*>(&(xpu_stream.queue())));
+    std::vector<at::Tensor>& inputs,
+    at::xpu::XPUStream xpu_stream) {
+  return AOTIModelContainerRunner::run(
+      inputs, reinterpret_cast<void*>(&(xpu_stream.queue())));
 }
 
 namespace {
@@ -23,13 +23,13 @@ class C10_EXPORT AOTIModelContainerRunnerXpu : public AOTIModelContainerRunner {
 
   ~AOTIModelContainerRunnerXpu() override;
 
-  std::vector<at::Tensor> run_impl(
-      std::vector<AtenTensorHandle>& input_handles,
-      void* stream_handle) override;
+  std::vector<at::Tensor> run(
+      const std::vector<at::Tensor>& inputs,
+      void* stream_handle = nullptr) override;
 
   std::vector<at::Tensor> run_with_xpu_stream(
-      const std::vector<at::Tensor>& inputs,
-      const at::xpu::XPUStream& xpu_stream);
+      std::vector<at::Tensor>& inputs,
+      at::xpu::XPUStream xpu_stream);
 };
 
 } // namespace torch::inductor