Add fastpath for common use cases in our python arg parsing.
This builds on the observation that an exact type check (a pointer comparison) is a lot faster than a subtype check (an `isinstance` call), so we make sure to do the exact checks before any `isinstance` check.
The effect can be significant: `a.view((1, 1, 1, 1))` goes from ~1.13µs to ~800ns.
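For intuition, the pattern looks like this (a minimal sketch, not the PR's exact code; the helper name is made up). `PyLong_CheckExact` compiles down to a pointer comparison against `&PyLong_Type`, while `PyLong_Check` also has to accept subclasses:

```cpp
#include <Python.h>

// Sketch of the fastpath pattern: try the exact-type pointer comparison
// first, and only fall back to the slower subclass-aware check if it fails.
static inline bool checkLongFast(PyObject* obj) {
  if (PyLong_CheckExact(obj)) {
    return true;  // common case: a plain Python int
  }
  // Slow path: int subclasses, excluding bool (bool subclasses int but
  // arg parsing usually wants to treat it separately).
  return PyLong_Check(obj) && !PyBool_Check(obj);
}
```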
Full test:
Performance was measured locally with the CPU frequency locked and the script pinned to a single core to reduce jitter.
Benchmark results after applying each change in this PR, one at a time (two runs per configuration):
```
[albandes@albandes-fedora-K2202N0104138 test]$ # Original
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
827 ns ± 0.945 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
947 ns ± 1.23 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1.04 µs ± 0.882 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.14 µs ± 1.59 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
797 ns ± 0.955 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
937 ns ± 1.51 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.02 µs ± 3.52 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
823 ns ± 1.76 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
938 ns ± 1.38 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1.03 µs ± 0.801 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.13 µs ± 0.877 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
768 ns ± 2.27 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
927 ns ± 0.779 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.01 µs ± 1.34 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ # checkLong fastpath
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
801 ns ± 0.982 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
900 ns ± 0.593 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1 µs ± 1.44 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.1 µs ± 1.38 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
782 ns ± 0.968 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
1.11 µs ± 424 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.09 µs ± 54.7 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
817 ns ± 0.65 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
912 ns ± 0.853 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1.02 µs ± 8.45 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.11 µs ± 2.53 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
781 ns ± 0.942 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
939 ns ± 1.57 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.01 µs ± 0.875 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ # Tensor check fastpath
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
806 ns ± 2.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
903 ns ± 1.82 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1 µs ± 1.21 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.1 µs ± 1.17 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
770 ns ± 1.66 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
931 ns ± 3.36 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.02 µs ± 0.983 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
813 ns ± 2.42 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
915 ns ± 0.868 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1.02 µs ± 1.09 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.11 µs ± 1.15 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
785 ns ± 0.807 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
941 ns ± 1.02 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.02 µs ± 0.857 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ # Fast path number in intlist/symintlist
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
728 ns ± 0.503 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
749 ns ± 0.829 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
771 ns ± 0.727 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
800 ns ± 0.962 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
772 ns ± 0.622 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
883 ns ± 0.567 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
915 ns ± 0.638 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
735 ns ± 1.27 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
753 ns ± 2.57 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
774 ns ± 1.38 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
801 ns ± 0.835 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
773 ns ± 0.677 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
873 ns ± 1.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
907 ns ± 0.836 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
```
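The last change in the log above ("Fast path number in intlist/symintlist") is where most of the win for the multi-element `view` calls comes from. As a rough illustration of the idea (not the PR's actual code; the function name and error handling are simplified assumptions), filling an int list from a tuple can special-case elements whose type is exactly `int`:

```cpp
#include <Python.h>

#include <cstdint>
#include <vector>

// Hypothetical sketch: convert a Python tuple to a vector of int64_t,
// taking a fast path when an element is a plain Python int.
static bool fillIntList(PyObject* tuple, std::vector<int64_t>& out) {
  const Py_ssize_t n = PyTuple_GET_SIZE(tuple);
  out.reserve(static_cast<size_t>(n));
  for (Py_ssize_t i = 0; i < n; ++i) {
    PyObject* item = PyTuple_GET_ITEM(tuple, i);
    if (PyLong_CheckExact(item)) {
      // Fast path: exact `int`, so the conversion cannot dispatch to
      // arbitrary Python code via __index__.
      const long long v = PyLong_AsLongLong(item);
      if (v == -1 && PyErr_Occurred()) {
        return false;  // overflow
      }
      out.push_back(static_cast<int64_t>(v));
    } else {
      // Slow path: anything else implementing __index__
      // (e.g. numpy integer scalars).
      const Py_ssize_t v = PyNumber_AsSsize_t(item, PyExc_IndexError);
      if (v == -1 && PyErr_Occurred()) {
        return false;
      }
      out.push_back(static_cast<int64_t>(v));
    }
  }
  return true;
}
```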
<details>
<summary>Test script</summary>

```python
# Run as: taskset 0x1 ipython foo.py
import torch
from IPython import get_ipython
a = torch.empty(1)
print("Running ", "a.view(1)")
get_ipython().run_line_magic("timeit", "a.view(1)")
print("Running ", "a.view((1, 1))")
get_ipython().run_line_magic("timeit", "a.view((1, 1))")
print("Running ", "a.view((1, 1, 1))")
get_ipython().run_line_magic("timeit", "a.view((1, 1, 1))")
print("Running ", "a.view((1, 1, 1, 1))")
get_ipython().run_line_magic("timeit", "a.view((1, 1, 1, 1))")
a = torch.empty(1, 1, 1)
print("Running ", "a.squeeze(0)")
get_ipython().run_line_magic("timeit", "a.squeeze(0)")
print("Running ", "a.squeeze((0,))")
get_ipython().run_line_magic("timeit", "a.squeeze((0,))")
print("Running ", "a.squeeze((0, 1))")
get_ipython().run_line_magic("timeit", "a.squeeze((0, 1))")
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/98764
Approved by: https://github.com/ngimel
For reference, here is the C++ header that defines the `Tensor` exact-type-check helpers used by the fastpath:
```cpp
#pragma once

#include <ATen/core/Tensor.h>
#include <torch/csrc/python_headers.h>
#include <memory>

#include <ATen/core/function_schema.h>
#include <pybind11/pybind11.h>
#include <torch/csrc/Exceptions.h>
#include <torch/csrc/Export.h>
#include <torch/csrc/autograd/variable.h>
#include <torch/csrc/utils/pybind.h>

namespace py = pybind11;

// Python object that backs torch.autograd.Variable
struct THPVariable {
  PyObject_HEAD;
  // Payload
  c10::MaybeOwned<at::Tensor> cdata;
  // Hooks to be run on backwards pass (corresponds to Python attr
  // '_backwards_hooks', set by 'register_hook')
  PyObject* backward_hooks = nullptr;
};

TORCH_PYTHON_API void registerPythonTensorClass(
    const std::string& device,
    PyObject* python_tensor_class);

TORCH_PYTHON_API void activateCUDATrace();

TORCH_PYTHON_API extern PyObject* THPVariableClass;
TORCH_PYTHON_API extern PyObject* ParameterClass;

bool THPVariable_initModule(PyObject* module);
TORCH_PYTHON_API PyObject* THPVariable_Wrap(at::TensorBase var);

static inline bool THPVariable_CheckTypeExact(PyTypeObject* tp) {
  // Check that a python object is a `Tensor`, but not a `Tensor` subclass.
  // (A subclass could have different semantics.) The one exception is
  // Parameter, which is used for Python bookkeeping but is equivalent to
  // Tensor as far as C++ is concerned.
  return (
      tp == (PyTypeObject*)THPVariableClass ||
      tp == (PyTypeObject*)ParameterClass);
}

static inline bool THPVariable_CheckExact(PyObject* obj) {
  return THPVariable_CheckTypeExact(Py_TYPE(obj));
}

inline bool THPVariable_Check(PyObject* obj) {
  if (!THPVariableClass)
    return false;

  // Fast path
  if (THPVariable_CheckExact(obj)) {
    return true;
  }

  const auto result = PyObject_IsInstance(obj, THPVariableClass);
  if (result == -1)
    throw python_error();
  return result;
}

inline const at::Tensor& THPVariable_Unpack(THPVariable* var) {
  return *var->cdata;
}

inline const at::Tensor& THPVariable_Unpack(PyObject* obj) {
  return THPVariable_Unpack(reinterpret_cast<THPVariable*>(obj));
}

std::pair<py::object, py::dict> parseIValuesToPyArgsKwargs(
    const c10::OperatorHandle& op,
    const std::vector<c10::IValue>& arguments);

void pushPyOutToStack(
    const c10::OperatorHandle& op,
    torch::jit::Stack* stack,
    py::object out,
    const char* msg);
```
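As a usage note (illustrative only, not code from the PR; the helper is hypothetical): callers pair the check with the unpack helper, so plain `Tensor`s take the two-pointer-comparison fast path inside `THPVariable_Check` and only subclasses pay for `PyObject_IsInstance`:

```cpp
// Hypothetical caller pattern. THPVariable_Check guards the
// reinterpret_cast inside THPVariable_Unpack, which is only valid
// when obj really is a THPVariable.
static const at::Tensor* tryGetTensor(PyObject* obj) {
  if (!THPVariable_Check(obj)) {
    return nullptr;
  }
  return &THPVariable_Unpack(obj);
}
```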