mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Add fastpath for common use cases in our python arg parsing.
This relies on the observation that an exact type check (a pointer comparison) is a lot faster than a subtype check (an `isinstance` call), so we make sure to do the exact checks before any `isinstance` check.
The gain can be pretty significant: for example, `a.view((1, 1, 1, 1))` goes from ~1.13us to ~800ns.
Full test:
Tested perf locally with cpu freq locked and script pinned to a single core to reduce jitter.
Benchmark results after doing each change in this PR one by one:
```
[albandes@albandes-fedora-K2202N0104138 test]$ # Original
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
827 ns ± 0.945 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
947 ns ± 1.23 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1.04 µs ± 0.882 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.14 µs ± 1.59 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
797 ns ± 0.955 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
937 ns ± 1.51 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.02 µs ± 3.52 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
823 ns ± 1.76 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
938 ns ± 1.38 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1.03 µs ± 0.801 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.13 µs ± 0.877 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
768 ns ± 2.27 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
927 ns ± 0.779 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.01 µs ± 1.34 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ # checkLong fastpath
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
801 ns ± 0.982 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
900 ns ± 0.593 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1 µs ± 1.44 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.1 µs ± 1.38 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
782 ns ± 0.968 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
1.11 µs ± 424 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.09 µs ± 54.7 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
817 ns ± 0.65 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
912 ns ± 0.853 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1.02 µs ± 8.45 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.11 µs ± 2.53 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
781 ns ± 0.942 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
939 ns ± 1.57 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.01 µs ± 0.875 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ # Tensor check fastpath
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
806 ns ± 2.8 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
903 ns ± 1.82 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1 µs ± 1.21 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.1 µs ± 1.17 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
770 ns ± 1.66 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
931 ns ± 3.36 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.02 µs ± 0.983 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
813 ns ± 2.42 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
915 ns ± 0.868 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
1.02 µs ± 1.09 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
1.11 µs ± 1.15 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
785 ns ± 0.807 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
941 ns ± 1.02 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
1.02 µs ± 0.857 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ # Fast path number in intlist/symintlist
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
728 ns ± 0.503 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
749 ns ± 0.829 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
771 ns ± 0.727 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
800 ns ± 0.962 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
772 ns ± 0.622 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
883 ns ± 0.567 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
915 ns ± 0.638 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
[albandes@albandes-fedora-K2202N0104138 test]$ taskset 0x1 ipython foo.py
No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Running a.view(1)
735 ns ± 1.27 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1))
753 ns ± 2.57 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1))
774 ns ± 1.38 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.view((1, 1, 1, 1))
801 ns ± 0.835 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze(0)
773 ns ± 0.677 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0,))
873 ns ± 1.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
Running a.squeeze((0, 1))
907 ns ± 0.836 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
```
<details>
<summary>Test script</summary>
```python
import torch
from IPython import get_ipython
a = torch.empty(1)
print("Running ", "a.view(1)")
get_ipython().run_line_magic("timeit", "a.view(1)")
print("Running ", "a.view((1, 1))")
get_ipython().run_line_magic("timeit", "a.view((1, 1))")
print("Running ", "a.view((1, 1, 1))")
get_ipython().run_line_magic("timeit", "a.view((1, 1, 1))")
print("Running ", "a.view((1, 1, 1, 1))")
get_ipython().run_line_magic("timeit", "a.view((1, 1, 1, 1))")
a = torch.empty(1, 1, 1)
print("Running ", "a.squeeze(0)")
get_ipython().run_line_magic("timeit", "a.squeeze(0)")
print("Running ", "a.squeeze((0,))")
get_ipython().run_line_magic("timeit", "a.squeeze((0,))")
print("Running ", "a.squeeze((0, 1))")
get_ipython().run_line_magic("timeit", "a.squeeze((0, 1))")
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/98764
Approved by: https://github.com/ngimel
178 lines
4.6 KiB
C++
178 lines
4.6 KiB
C++
#pragma once
|
|
|
|
#include <torch/csrc/Exceptions.h>
|
|
#include <torch/csrc/jit/frontend/tracer.h>
|
|
#include <torch/csrc/python_headers.h>
|
|
#include <torch/csrc/utils/object_ptr.h>
|
|
#include <torch/csrc/utils/tensor_numpy.h>
|
|
#include <cstdint>
|
|
#include <limits>
|
|
#include <stdexcept>
|
|
|
|
// largest integer that can be represented consecutively in a double
|
|
constexpr int64_t DOUBLE_INT_MAX = int64_t{1} << 53; // 2^53 == 9007199254740992
|
|
|
|
// Wraps a 32-bit signed integer in a Python int object.
// Returns whatever PyLong_FromLong returns; the result is not checked here.
inline PyObject* THPUtils_packInt32(int32_t value) {
  // PyLong_FromLong takes a `long`; the widening from int32_t is lossless.
  const long widened = value;
  return PyLong_FromLong(widened);
}
|
|
|
|
// Wraps a 64-bit signed integer in a Python int object.
// Returns whatever PyLong_FromLongLong returns; the result is not checked
// here.
inline PyObject* THPUtils_packInt64(int64_t value) {
  const long long as_long_long = value;
  return PyLong_FromLongLong(as_long_long);
}
|
|
|
|
// Wraps a 32-bit unsigned integer in a Python int object.
// Returns whatever PyLong_FromUnsignedLong returns; the result is not checked
// here.
inline PyObject* THPUtils_packUInt32(uint32_t value) {
  // PyLong_FromUnsignedLong takes an `unsigned long`; widening is lossless.
  const unsigned long widened = value;
  return PyLong_FromUnsignedLong(widened);
}
|
|
|
|
// Wraps a 64-bit unsigned integer in a Python int object.
// Returns whatever PyLong_FromUnsignedLongLong returns; the result is not
// checked here.
inline PyObject* THPUtils_packUInt64(uint64_t value) {
  const unsigned long long as_ulong_long = value;
  return PyLong_FromUnsignedLongLong(as_ulong_long);
}
|
|
|
|
// Converts a double into a Python int object via PyLong_FromDouble.
// The result of PyLong_FromDouble is returned unchanged and is not checked
// here, so callers are responsible for handling a failed conversion.
inline PyObject* THPUtils_packDoubleAsInt(double value) {
  PyObject* const packed = PyLong_FromDouble(value);
  return packed;
}
|
|
|
|
// Returns true only when `obj` passes PyLong_CheckExact and is not a Python
// bool. This is the cheap check used as a fast path before slower subtype
// checks.
inline bool THPUtils_checkLongExact(PyObject* obj) {
  if (!PyLong_CheckExact(obj)) {
    return false;
  }
  return !PyBool_Check(obj);
}
|
|
|
|
// Returns true when `obj` should be treated as an integer: an exact Python
// int, a numpy integer (only when built with USE_NUMPY), or any int subclass
// other than bool.
inline bool THPUtils_checkLong(PyObject* obj) {
  // Fast path: the exact-type test is tried before anything more expensive.
  if (THPUtils_checkLongExact(obj)) {
    return true;
  }

#ifdef USE_NUMPY
  if (torch::utils::is_numpy_int(obj)) {
    return true;
  }
#endif

  // Slow path: accept int subclasses, but never bool.
  if (!PyLong_Check(obj)) {
    return false;
  }
  return !PyBool_Check(obj);
}
|
|
|
|
// Extracts a 32-bit signed integer from a Python object.
// Throws python_error when the Python conversion reported an error, and
// std::runtime_error when the value does not fit in a `long` or in int32_t.
inline int32_t THPUtils_unpackInt(PyObject* obj) {
  using Limits = std::numeric_limits<int32_t>;

  int overflowed = 0;
  const long raw = PyLong_AsLongAndOverflow(obj, &overflowed);
  // -1 is also a legitimate value, so the error indicator must be consulted.
  if (raw == -1 && PyErr_Occurred()) {
    throw python_error();
  }

  // Reject values that overflowed `long` as well as values that fit in
  // `long` but fall outside the int32_t range.
  const bool out_of_range =
      overflowed != 0 || raw > Limits::max() || raw < Limits::min();
  if (out_of_range) {
    throw std::runtime_error("Overflow when unpacking long");
  }
  return static_cast<int32_t>(raw);
}
|
|
|
|
// Extracts a 64-bit signed integer from a Python object.
// Throws python_error when the Python conversion reported an error, and
// std::runtime_error when the value does not fit in a `long long`.
inline int64_t THPUtils_unpackLong(PyObject* obj) {
  int overflowed = 0;
  const long long raw = PyLong_AsLongLongAndOverflow(obj, &overflowed);
  // -1 is also a legitimate value, so the error indicator must be consulted.
  if (raw == -1 && PyErr_Occurred()) {
    throw python_error();
  }
  if (overflowed != 0) {
    throw std::runtime_error("Overflow when unpacking long");
  }
  return static_cast<int64_t>(raw);
}
|
|
|
|
// Extracts a 32-bit unsigned integer from a Python object.
// Throws python_error if a Python error is set after the conversion, and
// std::runtime_error if the converted value exceeds the uint32_t range.
inline uint32_t THPUtils_unpackUInt32(PyObject* obj) {
  const unsigned long raw = PyLong_AsUnsignedLong(obj);
  if (PyErr_Occurred()) {
    throw python_error();
  }

  // `unsigned long` may be wider than 32 bits, so range-check explicitly.
  const bool too_large = raw > std::numeric_limits<uint32_t>::max();
  if (too_large) {
    throw std::runtime_error("Overflow when unpacking unsigned long");
  }
  return static_cast<uint32_t>(raw);
}
|
|
|
|
// Extracts a 64-bit unsigned integer from a Python object.
// Throws python_error if a Python error is set after the conversion.
inline uint64_t THPUtils_unpackUInt64(PyObject* obj) {
  const unsigned long long raw = PyLong_AsUnsignedLongLong(obj);
  if (PyErr_Occurred()) {
    throw python_error();
  }
  return static_cast<uint64_t>(raw);
}
|
|
|
|
// Declaration only; no definition is visible in this part of the file.
// NOTE(review): presumably reports whether `obj` can be used as an index --
// confirm against the definition.
bool THPUtils_checkIndex(PyObject* obj);
|
|
|
|
// Extracts an int64_t index from a Python object.
// Objects accepted by THPUtils_checkLong are unpacked directly; anything else
// is first converted with PyNumber_Index. Throws python_error when that
// conversion fails, plus whatever THPUtils_unpackLong throws.
inline int64_t THPUtils_unpackIndex(PyObject* obj) {
  // Common case: already an integer, no conversion needed.
  if (THPUtils_checkLong(obj)) {
    return THPUtils_unpackLong(obj);
  }

  // `converted` holds the result of PyNumber_Index for the rest of this
  // scope.
  THPObjectPtr converted(PyNumber_Index(obj));
  if (converted == nullptr) {
    throw python_error();
  }
  // NB: the value must be read while `converted` is still in scope, i.e.
  // before the underlying object's refcount is decremented.
  const int64_t unpacked = THPUtils_unpackLong(converted.get());
  return unpacked;
}
|
|
|
|
// Converts the Python singletons True/False to a C++ bool.
// Any other object (including truthy/falsy non-bool values) is rejected with
// std::runtime_error.
inline bool THPUtils_unpackBool(PyObject* obj) {
  if (obj == Py_True) {
    return true;
  }
  if (obj == Py_False) {
    return false;
  }
  throw std::runtime_error("couldn't convert python object to boolean");
}
|
|
|
|
// Returns true when `obj` can be treated as a double: a numpy scalar (only
// when built with USE_NUMPY), a Python float (or subclass), or a Python int
// (or subclass).
inline bool THPUtils_checkDouble(PyObject* obj) {
#ifdef USE_NUMPY
  if (torch::utils::is_numpy_scalar(obj)) {
    return true;
  }
#endif
  if (PyFloat_Check(obj)) {
    return true;
  }
  return PyLong_Check(obj) != 0;
}
|
|
|
|
// Extracts a double from a Python object.
// Python floats are read directly; everything else goes through
// PyFloat_AsDouble. Throws python_error when that conversion fails.
inline double THPUtils_unpackDouble(PyObject* obj) {
  if (!PyFloat_Check(obj)) {
    const double converted = PyFloat_AsDouble(obj);
    // -1 is also a legitimate value, so the error indicator must be
    // consulted.
    if (converted == -1 && PyErr_Occurred()) {
      throw python_error();
    }
    return converted;
  }
  // Known to be a float: use the unchecked accessor.
  return PyFloat_AS_DOUBLE(obj);
}
|
|
|
|
// Extracts a c10::complex<double> from a Python object via
// PyComplex_AsCComplex. Throws python_error when the conversion fails.
inline c10::complex<double> THPUtils_unpackComplexDouble(PyObject* obj) {
  const Py_complex parsed = PyComplex_AsCComplex(obj);
  // A real part of -1.0 may be a genuine value; only treat it as a failure
  // when a Python error is also set.
  const bool maybe_failed = (parsed.real == -1.0);
  if (maybe_failed && PyErr_Occurred()) {
    throw python_error();
  }

  return c10::complex<double>(parsed.real, parsed.imag);
}
|
|
|
|
// Interprets a Python number as a boolean (non-zero means true).
// Floats and complex numbers are handled explicitly; everything else is read
// as an integer. Throws python_error when the integer conversion fails.
inline bool THPUtils_unpackNumberAsBool(PyObject* obj) {
  if (PyFloat_Check(obj)) {
    return static_cast<bool>(PyFloat_AS_DOUBLE(obj));
  }

  if (PyComplex_Check(obj)) {
    // A complex number is true unless both components are zero.
    const double re = PyComplex_RealAsDouble(obj);
    const double im = PyComplex_ImagAsDouble(obj);
    return re != 0 || im != 0;
  }

  int overflowed = 0;
  const long long raw = PyLong_AsLongLongAndOverflow(obj, &overflowed);
  if (raw == -1 && PyErr_Occurred()) {
    throw python_error();
  }
  // `overflowed` is deliberately not inspected: when overflow occurred the
  // result should be true, which keeps the same behavior as numpy.
  return raw != 0;
}
|