From ba19416730eb9ec8a6b57ce8621c194ecdaed957 Mon Sep 17 00:00:00 2001 From: Dmitry Kurtaev Date: Wed, 20 Aug 2025 11:43:41 +0300 Subject: [PATCH] Merge pull request #27581 from dkurt:d.kuryaev/dlpack ### Pull Request Readiness Checklist resolves #16295 ``` docker run --gpus 0 -v ~/opencv:/opencv -v ~/opencv_contrib:/opencv_contrib -it nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 apt-get update && apt-get install -y cmake python3-dev python3-pip python3-venv && python3 -m venv .venv && source .venv/bin/activate && pip install -U pip && pip install -U numpy && pip install torch --index-url https://download.pytorch.org/whl/cu128 && cmake \ -DWITH_OPENCL=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DBUILD_DOCS=OFF \ -DWITH_CUDA=ON \ -DOPENCV_DNN_CUDA=ON \ -DOPENCV_EXTRA_MODULES_PATH=/opencv_contrib/modules \ -DBUILD_LIST=ts,cudev,python3 \ -S /opencv -B /opencv_build && cmake --build /opencv_build -j16 export PYTHONPATH=/opencv_build/lib/python3/:$PYTHONPATH ``` See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [x] The feature is well documented and sample code can be built with the project CMake --- 3rdparty/dlpack/LICENSE | 201 +++++++++++ 3rdparty/dlpack/include/dlpack/dlpack.h | 366 +++++++++++++++++++++ CMakeLists.txt | 1 + cmake/OpenCVDetectDLPack.cmake | 5 + modules/core/misc/python/pyopencv_core.hpp | 197 +++++++++++ modules/core/misc/python/pyopencv_cuda.hpp | 158 +++++++++ modules/python/src2/gen2.py | 3 + modules/python/test/test_cuda.py | 13 + 8 files changed, 944 insertions(+) create mode 100644 3rdparty/dlpack/LICENSE create mode 100644 3rdparty/dlpack/include/dlpack/dlpack.h create mode 100644 cmake/OpenCVDetectDLPack.cmake diff --git a/3rdparty/dlpack/LICENSE b/3rdparty/dlpack/LICENSE new file mode 100644 index 0000000000..20a9c8a7b4 --- /dev/null +++ b/3rdparty/dlpack/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2017 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/3rdparty/dlpack/include/dlpack/dlpack.h b/3rdparty/dlpack/include/dlpack/dlpack.h new file mode 100644 index 0000000000..5533f74f71 --- /dev/null +++ b/3rdparty/dlpack/include/dlpack/dlpack.h @@ -0,0 +1,366 @@ +/*! + * Copyright (c) 2017 by Contributors + * \file dlpack.h + * \brief The common header of DLPack. + */ +#ifndef DLPACK_DLPACK_H_ +#define DLPACK_DLPACK_H_ + +/** + * \brief Compatibility with C++ + */ +#ifdef __cplusplus +#define DLPACK_EXTERN_C extern "C" +#else +#define DLPACK_EXTERN_C +#endif + +/*! \brief The current major version of dlpack */ +#define DLPACK_MAJOR_VERSION 1 + +/*! \brief The current minor version of dlpack */ +#define DLPACK_MINOR_VERSION 1 + +/*! \brief DLPACK_DLL prefix for windows */ +#ifdef _WIN32 +#ifdef DLPACK_EXPORTS +#define DLPACK_DLL __declspec(dllexport) +#else +#define DLPACK_DLL __declspec(dllimport) +#endif +#else +#define DLPACK_DLL +#endif + +#include <stdint.h> +#include <stddef.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * \brief The DLPack version. 
+ * + * A change in major version indicates that we have changed the + * data layout of the ABI - DLManagedTensorVersioned. + * + * A change in minor version indicates that we have added new + * code, such as a new device type, but the ABI is kept the same. + * + * If an obtained DLPack tensor has a major version that disagrees + * with the version number specified in this header file + * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter + * (and it is safe to do so). It is not safe to access any other fields + * as the memory layout will have changed. + * + * In the case of a minor version mismatch, the tensor can be safely used as + * long as the consumer knows how to interpret all fields. Minor version + * updates indicate the addition of enumeration values. + */ +typedef struct { + /*! \brief DLPack major version. */ + uint32_t major; + /*! \brief DLPack minor version. */ + uint32_t minor; +} DLPackVersion; + +/*! + * \brief The device type in DLDevice. + */ +#ifdef __cplusplus +typedef enum : int32_t { +#else +typedef enum { +#endif + /*! \brief CPU device */ + kDLCPU = 1, + /*! \brief CUDA GPU device */ + kDLCUDA = 2, + /*! + * \brief Pinned CUDA CPU memory by cudaMallocHost + */ + kDLCUDAHost = 3, + /*! \brief OpenCL devices. */ + kDLOpenCL = 4, + /*! \brief Vulkan buffer for next generation graphics. */ + kDLVulkan = 7, + /*! \brief Metal for Apple GPU. */ + kDLMetal = 8, + /*! \brief Verilog simulator buffer */ + kDLVPI = 9, + /*! \brief ROCm GPUs for AMD GPUs */ + kDLROCM = 10, + /*! + * \brief Pinned ROCm CPU memory allocated by hipMallocHost + */ + kDLROCMHost = 11, + /*! + * \brief Reserved extension device type, + * used for quickly test extension device + * The semantics can differ depending on the implementation. + */ + kDLExtDev = 12, + /*! + * \brief CUDA managed/unified memory allocated by cudaMallocManaged + */ + kDLCUDAManaged = 13, + /*! + * \brief Unified shared memory allocated on a oneAPI non-partititioned + * device. Call to oneAPI runtime is required to determine the device + * type, the USM allocation type and the sycl context it is bound to. + * + */ + kDLOneAPI = 14, + /*! \brief GPU support for next generation WebGPU standard. */ + kDLWebGPU = 15, + /*! \brief Qualcomm Hexagon DSP */ + kDLHexagon = 16, + /*! \brief Microsoft MAIA devices */ + kDLMAIA = 17, +} DLDeviceType; + +/*! + * \brief A Device for Tensor and operator. + */ +typedef struct { + /*! \brief The device type used in the device. */ + DLDeviceType device_type; + /*! + * \brief The device index. + * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. + */ + int32_t device_id; +} DLDevice; + +/*! + * \brief The type code options DLDataType. + */ +typedef enum { + /*! \brief signed integer */ + kDLInt = 0U, + /*! \brief unsigned integer */ + kDLUInt = 1U, + /*! \brief IEEE floating point */ + kDLFloat = 2U, + /*! + * \brief Opaque handle type, reserved for testing purposes. + * Frameworks need to agree on the handle data type for the exchange to be well-defined. + */ + kDLOpaqueHandle = 3U, + /*! \brief bfloat16 */ + kDLBfloat = 4U, + /*! + * \brief complex number + * (C/C++/Python layout: compact struct per complex number) + */ + kDLComplex = 5U, + /*! \brief boolean */ + kDLBool = 6U, + /*! \brief FP8 data types */ + kDLFloat8_e3m4 = 7U, + kDLFloat8_e4m3 = 8U, + kDLFloat8_e4m3b11fnuz = 9U, + kDLFloat8_e4m3fn = 10U, + kDLFloat8_e4m3fnuz = 11U, + kDLFloat8_e5m2 = 12U, + kDLFloat8_e5m2fnuz = 13U, + kDLFloat8_e8m0fnu = 14U, + /*! 
\brief FP6 data types + * Setting bits != 6 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. + */ + kDLFloat6_e2m3fn = 15U, + kDLFloat6_e3m2fn = 16U, + /*! \brief FP4 data types + * Setting bits != 4 is currently unspecified, and the producer must ensure it is set + * while the consumer must stop importing if the value is unexpected. + */ + kDLFloat4_e2m1fn = 17U, +} DLDataTypeCode; + +/*! + * \brief The data type the tensor can hold. The data type is assumed to follow the + * native endian-ness. An explicit error message should be raised when attempting to + * export an array with non-native endianness + * + * Examples + * - float: type_code = 2, bits = 32, lanes = 1 + * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4 + * - int8: type_code = 0, bits = 8, lanes = 1 + * - std::complex<float>: type_code = 5, bits = 64, lanes = 1 + * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of bool is 8 bits) + * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory) + * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory) + * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory) + * + * When a sub-byte type is packed, DLPack requires the data to be in little bit-endian, i.e., + * for a packed data set D ((D >> (i * bits)) && bit_mask) stores the i-th element. + */ +typedef struct { + /*! + * \brief Type code of base types. + * We keep it uint8_t instead of DLDataTypeCode for minimal memory + * footprint, but the value should be one of DLDataTypeCode enum values. + * */ + uint8_t code; + /*! + * \brief Number of bits, common choices are 8, 16, 32. + */ + uint8_t bits; + /*! \brief Number of lanes in the type, used for vector types. */ + uint16_t lanes; +} DLDataType; + +/*! + * \brief Plain C Tensor object, does not manage memory. + */ +typedef struct { + /*! + * \brief The data pointer points to the allocated data. This will be CUDA + * device pointer or cl_mem handle in OpenCL. It may be opaque on some device + * types. This pointer is always aligned to 256 bytes as in CUDA. The + * `byte_offset` field should be used to point to the beginning of the data. + * + * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed + * (after which this note will be updated); at the moment it is recommended + * to not rely on the data pointer being correctly aligned. + * + * For given DLTensor, the size of memory required to store the contents of + * data is calculated as follows: + * + * \code{.c} + * static inline size_t GetDataSize(const DLTensor* t) { + * size_t size = 1; + * for (tvm_index_t i = 0; i < t->ndim; ++i) { + * size *= t->shape[i]; + * } + * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; + * return size; + * } + * \endcode + * + * Note that if the tensor is of size zero, then the data pointer should be + * set to `NULL`. + */ + void* data; + /*! \brief The device of the tensor */ + DLDevice device; + /*! \brief Number of dimensions */ + int32_t ndim; + /*! \brief The data type of the pointer*/ + DLDataType dtype; + /*! \brief The shape of the tensor */ + int64_t* shape; + /*! + * \brief strides of the tensor (in number of elements, not bytes) + * can be NULL, indicating tensor is compact and row-majored. 
+ */ + int64_t* strides; + /*! \brief The offset in bytes to the beginning pointer to data */ + uint64_t byte_offset; +} DLTensor; + +/*! + * \brief C Tensor object, manage memory of DLTensor. This data structure is + * intended to facilitate the borrowing of DLTensor by another framework. It is + * not meant to transfer the tensor. When the borrowing framework doesn't need + * the tensor, it should call the deleter to notify the host that the resource + * is no longer needed. + * + * \note This data structure is used as Legacy DLManagedTensor + * in DLPack exchange and is deprecated after DLPack v0.8 + * Use DLManagedTensorVersioned instead. + * This data structure may get renamed or deleted in future versions. + * + * \sa DLManagedTensorVersioned + */ +typedef struct DLManagedTensor { + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; + /*! \brief the context of the original host framework of DLManagedTensor in + * which DLManagedTensor is used in the framework. It can also be NULL. + */ + void * manager_ctx; + /*! + * \brief Destructor - this should be called + * to destruct the manager_ctx which backs the DLManagedTensor. It can be + * NULL if there is no way for the caller to provide a reasonable destructor. + * The destructor deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensor * self); +} DLManagedTensor; + +// bit masks used in in the DLManagedTensorVersioned + +/*! \brief bit mask to indicate that the tensor is read only. */ +#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) + +/*! + * \brief bit mask to indicate that the tensor is a copy made by the producer. + * + * If set, the tensor is considered solely owned throughout its lifetime by the + * consumer, until the producer-provided deleter is invoked. + */ +#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) + +/* + * \brief bit mask to indicate that whether a sub-byte type is packed or padded. + * + * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can + * be set by the producer to signal that a tensor of sub-byte type is padded. + */ +#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL) + +/*! + * \brief A versioned and managed C Tensor object, manage memory of DLTensor. + * + * This data structure is intended to facilitate the borrowing of DLTensor by + * another framework. It is not meant to transfer the tensor. When the borrowing + * framework doesn't need the tensor, it should call the deleter to notify the + * host that the resource is no longer needed. + * + * \note This is the current standard DLPack exchange data structure. + */ +struct DLManagedTensorVersioned { + /*! + * \brief The API and ABI version of the current managed Tensor + */ + DLPackVersion version; + /*! + * \brief the context of the original host framework. + * + * Stores DLManagedTensorVersioned is used in the + * framework. It can also be NULL. + */ + void *manager_ctx; + /*! + * \brief Destructor. + * + * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned. + * It can be NULL if there is no way for the caller to provide a reasonable + * destructor. The destructor deletes the argument self as well. + */ + void (*deleter)(struct DLManagedTensorVersioned *self); + /*! + * \brief Additional bitmask flags information about the tensor. + * + * By default the flags should be set to 0. + * + * \note Future ABI changes should keep everything until this field + * stable, to ensure that deleter can be correctly called. 
+ * + * \sa DLPACK_FLAG_BITMASK_READ_ONLY + * \sa DLPACK_FLAG_BITMASK_IS_COPIED + */ + uint64_t flags; + /*! \brief DLTensor which is being memory managed */ + DLTensor dl_tensor; +}; + +#ifdef __cplusplus +} // DLPACK_EXTERN_C +#endif +#endif // DLPACK_DLPACK_H_ diff --git a/CMakeLists.txt b/CMakeLists.txt index bb3e9bb0a7..7ccb0700a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -640,6 +640,7 @@ ocv_cmake_hook(POST_CMAKE_BUILD_OPTIONS) # --- Python Support --- if(NOT IOS AND NOT XROS) include(cmake/OpenCVDetectPython.cmake) + include(cmake/OpenCVDetectDLPack.cmake) endif() include(cmake/OpenCVCompilerOptions.cmake) diff --git a/cmake/OpenCVDetectDLPack.cmake b/cmake/OpenCVDetectDLPack.cmake new file mode 100644 index 0000000000..bed505ad4c --- /dev/null +++ b/cmake/OpenCVDetectDLPack.cmake @@ -0,0 +1,5 @@ +find_package(dlpack QUIET) +if (NOT dlpack_FOUND) + ocv_include_directories("${OpenCV_SOURCE_DIR}/3rdparty/dlpack/include") + ocv_install_3rdparty_licenses(dlpack "${OpenCV_SOURCE_DIR}/3rdparty/dlpack/LICENSE") +endif() diff --git a/modules/core/misc/python/pyopencv_core.hpp b/modules/core/misc/python/pyopencv_core.hpp index 1aecd5b864..8c190b696b 100644 --- a/modules/core/misc/python/pyopencv_core.hpp +++ b/modules/core/misc/python/pyopencv_core.hpp @@ -3,6 +3,8 @@ #ifdef HAVE_OPENCV_CORE +#include "dlpack/dlpack.h" + static PyObject* pycvMakeType(PyObject* , PyObject* args, PyObject* kw) { const char *keywords[] = { "depth", "channels", NULL }; @@ -20,6 +22,201 @@ static PyObject* pycvMakeTypeCh(PyObject*, PyObject *value) { return PyInt_FromLong(CV_MAKETYPE(depth, channels)); } +#define CV_DLPACK_CAPSULE_NAME "dltensor" +#define CV_DLPACK_USED_CAPSULE_NAME "used_dltensor" + +template <typename T> +bool fillDLPackTensor(const T& src, DLManagedTensor* tensor, const DLDevice& device); + +template <typename T> +bool parseDLPackTensor(DLManagedTensor* tensor, T& obj, bool copy); + +template <typename T> +int GetNumDims(const T& src); + +// source: https://github.com/dmlc/dlpack/blob/7f393bbb86a0ddd71fde3e700fc2affa5cdce72d/docs/source/python_spec.rst#L110 +static void dlpack_capsule_deleter(PyObject *self){ + if (PyCapsule_IsValid(self, CV_DLPACK_USED_CAPSULE_NAME)) { + return; + } + + DLManagedTensor *managed = (DLManagedTensor *)PyCapsule_GetPointer(self, CV_DLPACK_CAPSULE_NAME); + if (managed == NULL) { + PyErr_WriteUnraisable(self); + return; + } + + if (managed->deleter) { + managed->deleter(managed); + } +} + +static void array_dlpack_deleter(DLManagedTensor *self) +{ + if (!Py_IsInitialized()) { + return; + } + + PyGILState_STATE state = PyGILState_Ensure(); + + PyObject *array = (PyObject *)self->manager_ctx; + PyMem_Free(self); + Py_XDECREF(array); + + PyGILState_Release(state); +} + +template <typename T> +static PyObject* to_dlpack(const T& src, PyObject* self, PyObject* py_args, PyObject* kw) +{ + int stream = 0; + PyObject* maxVersion = nullptr; + PyObject* dlDevice = nullptr; + bool copy = false; + const char* keywords[] = { "stream", "max_version", "dl_device", "copy", NULL }; + if (!PyArg_ParseTupleAndKeywords(py_args, kw, "|iOOp:__dlpack__", (char**)keywords, &stream, &maxVersion, &dlDevice, &copy)) + return nullptr; + + DLDevice device = {(DLDeviceType)-1, 0}; + if (dlDevice && dlDevice != Py_None && PyTuple_Check(dlDevice)) + { + device.device_type = static_cast<DLDeviceType>(PyLong_AsLong(PyTuple_GetItem(dlDevice, 0))); + device.device_id = PyLong_AsLong(PyTuple_GetItem(dlDevice, 1)); + } + + int ndim = GetNumDims(src); + void* ptr = PyMem_Malloc(sizeof(DLManagedTensor) + sizeof(int64_t) * ndim * 2); + if (!ptr) { + 
PyErr_NoMemory(); + return nullptr; + } + DLManagedTensor* tensor = reinterpret_cast<DLManagedTensor*>(ptr); + tensor->manager_ctx = self; + tensor->deleter = array_dlpack_deleter; + tensor->dl_tensor.ndim = ndim; + tensor->dl_tensor.shape = reinterpret_cast<int64_t*>(reinterpret_cast<char*>(ptr) + sizeof(DLManagedTensor)); + tensor->dl_tensor.strides = tensor->dl_tensor.shape + ndim; + if (!fillDLPackTensor(src, tensor, device)) { + PyMem_Free(ptr); + return nullptr; + } + + PyObject* capsule = PyCapsule_New(ptr, CV_DLPACK_CAPSULE_NAME, dlpack_capsule_deleter); + if (!capsule) { + PyMem_Free(ptr); + return nullptr; + } + + // the capsule holds a reference + Py_INCREF(self); + + return capsule; +} + +template <typename T> +static PyObject* from_dlpack(PyObject* py_args, PyObject* kw) +{ + PyObject* arr = nullptr; + PyObject* device = nullptr; + bool copy = false; + const char* keywords[] = { "x", "device", "copy", NULL }; + if (!PyArg_ParseTupleAndKeywords(py_args, kw, "O|Op:from_dlpack", (char**)keywords, &arr, &device, &copy)) + return nullptr; + + PyObject* capsule = nullptr; + if (PyCapsule_CheckExact(arr)) + { + capsule = arr; + } + else + { + PyGILState_STATE gstate = PyGILState_Ensure(); + PyObject* method = PyString_FromString("__dlpack__"); + capsule = method ? PyObject_CallMethodObjArgs(arr, method, NULL) : nullptr; + Py_XDECREF(method); + PyGILState_Release(gstate); + if (!capsule) + return nullptr; + } + + DLManagedTensor* tensor = reinterpret_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule, CV_DLPACK_CAPSULE_NAME)); + if (tensor == nullptr) + { + if (capsule != arr) + Py_DECREF(capsule); + return nullptr; + } + + T retval; + bool success = parseDLPackTensor(tensor, retval, copy); + if (success) + { + PyCapsule_SetName(capsule, CV_DLPACK_USED_CAPSULE_NAME); + } + if (capsule != arr) + Py_DECREF(capsule); + + return success ? pyopencv_from(retval) : nullptr; +} + +static DLDataType GetDLPackType(size_t elemSize1, int depth) { + DLDataType dtype; + dtype.bits = static_cast<uint8_t>(8 * elemSize1); + dtype.lanes = 1; + switch (depth) + { + case CV_8S: case CV_16S: case CV_32S: dtype.code = kDLInt; break; + case CV_8U: case CV_16U: dtype.code = kDLUInt; break; + case CV_16F: case CV_32F: case CV_64F: dtype.code = kDLFloat; break; + default: + CV_Error(Error::StsNotImplemented, "__dlpack__ data type"); + } + return dtype; +} + +static int DLPackTypeToCVType(const DLDataType& dtype, int channels) { + if (dtype.code == kDLInt) + { + switch (dtype.bits) + { + case 8: return CV_8SC(channels); + case 16: return CV_16SC(channels); + case 32: return CV_32SC(channels); + default: + { + PyErr_SetString(PyExc_BufferError, + format("Unsupported int dlpack depth: %d", dtype.bits).c_str()); + return -1; + } + } + } + if (dtype.code == kDLUInt) + { + switch (dtype.bits) + { + case 8: return CV_8UC(channels); + case 16: return CV_16UC(channels); + default: + { + PyErr_SetString(PyExc_BufferError, + format("Unsupported uint dlpack depth: %d", dtype.bits).c_str()); + return -1; + } + } + } + if (dtype.code == kDLFloat) + { + switch (dtype.bits) + { + case 16: return CV_16FC(channels); + case 32: return CV_32FC(channels); + case 64: return CV_64FC(channels); + default: + { + PyErr_SetString(PyExc_BufferError, + format("Unsupported float dlpack depth: %d", dtype.bits).c_str()); + return -1; + } + } + } + PyErr_SetString(PyExc_BufferError, format("Unsupported dlpack data type: %d", dtype.code).c_str()); + return -1; +} + #define PYOPENCV_EXTRA_METHODS_CV \ {"CV_MAKETYPE", CV_PY_FN_WITH_KW(pycvMakeType), "CV_MAKETYPE(depth, channels) -> retval"}, \ {"CV_8UC", (PyCFunction)(pycvMakeTypeCh), METH_O, "CV_8UC(channels) -> retval"}, \
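The helpers above implement the standard DLPack handshake: the producer wraps a `DLManagedTensor` in a `PyCapsule` named `"dltensor"`, and a consumer that takes ownership renames it to `"used_dltensor"` so the capsule destructor skips the deleter. A minimal sketch of the resulting Python-level contract (not part of the patch; assumes a CUDA-enabled build, and the variable names are illustrative):

```python
import numpy as np
import cv2 as cv

gmat = cv.cuda_GpuMat()
gmat.upload(np.zeros((4, 4, 3), np.uint8))
dev = gmat.__dlpack_device__()          # (2, 0), i.e. (kDLCUDA, device 0)
cap = gmat.__dlpack__()                 # PyCapsule named "dltensor"
dst = cv.cuda_GpuMat.from_dlpack(cap)   # consumer renames it to "used_dltensor"
```

`from_dlpack` also accepts any object exposing `__dlpack__`, in which case the capsule is requested internally.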
diff --git a/modules/core/misc/python/pyopencv_cuda.hpp b/modules/core/misc/python/pyopencv_cuda.hpp index a424498f27..5fd21ca630 100644 --- a/modules/core/misc/python/pyopencv_cuda.hpp +++ b/modules/core/misc/python/pyopencv_cuda.hpp @@ -21,17 +21,175 @@ template<> struct pyopencvVecConverter { }; CV_PY_TO_CLASS(cuda::GpuMat) +CV_PY_TO_CLASS(cuda::GpuMatND) CV_PY_TO_CLASS(cuda::Stream) CV_PY_TO_CLASS(cuda::Event) CV_PY_TO_CLASS(cuda::HostMem) CV_PY_TO_CLASS_PTR(cuda::GpuMat) +CV_PY_TO_CLASS_PTR(cuda::GpuMatND) CV_PY_TO_CLASS_PTR(cuda::GpuMat::Allocator) CV_PY_FROM_CLASS(cuda::GpuMat) +CV_PY_FROM_CLASS(cuda::GpuMatND) CV_PY_FROM_CLASS(cuda::Stream) CV_PY_FROM_CLASS(cuda::HostMem) CV_PY_FROM_CLASS_PTR(cuda::GpuMat::Allocator) +template<> +bool fillDLPackTensor(const Ptr<cuda::GpuMat>& src, DLManagedTensor* tensor, const DLDevice& device) +{ + if ((device.device_type != -1 && device.device_type != kDLCUDA) || device.device_id != 0) + { + PyErr_SetString(PyExc_BufferError, "GpuMat can be exported only on GPU:0"); + return false; + } + tensor->dl_tensor.data = src->cudaPtr(); + tensor->dl_tensor.device.device_type = kDLCUDA; + tensor->dl_tensor.device.device_id = 0; + tensor->dl_tensor.dtype = GetDLPackType(src->elemSize1(), src->depth()); + tensor->dl_tensor.shape[0] = src->rows; + tensor->dl_tensor.shape[1] = src->cols; + tensor->dl_tensor.shape[2] = src->channels(); + tensor->dl_tensor.strides[0] = src->step1(); + tensor->dl_tensor.strides[1] = src->channels(); + tensor->dl_tensor.strides[2] = 1; + tensor->dl_tensor.byte_offset = 0; + return true; +} + +template<> +bool fillDLPackTensor(const Ptr<cuda::GpuMatND>& src, DLManagedTensor* tensor, const DLDevice& device) +{ + if ((device.device_type != -1 && device.device_type != kDLCUDA) || device.device_id != 0) + { + PyErr_SetString(PyExc_BufferError, "GpuMatND can be exported only on GPU:0"); + return false; + } + tensor->dl_tensor.data = src->getDevicePtr(); + tensor->dl_tensor.device.device_type = kDLCUDA; + tensor->dl_tensor.device.device_id = 0; + tensor->dl_tensor.dtype = GetDLPackType(src->elemSize1(), CV_MAT_DEPTH(src->flags)); + for (int i = 0; i < src->dims; ++i) + tensor->dl_tensor.shape[i] = src->size[i]; + for (int i = 0; i < src->dims; ++i) + tensor->dl_tensor.strides[i] = src->step[i]; + tensor->dl_tensor.byte_offset = 0; + return true; +} + +template<> +bool parseDLPackTensor(DLManagedTensor* tensor, cv::cuda::GpuMat& obj, bool copy) +{ + if (tensor->dl_tensor.byte_offset != 0) + { + PyErr_SetString(PyExc_BufferError, "Unimplemented from_dlpack for GpuMat with memory offset"); + return false; + } + if (tensor->dl_tensor.ndim != 3) + { + PyErr_SetString(PyExc_BufferError, "cuda_GpuMat.from_dlpack expects a 3D tensor. Use cuda_GpuMatND.from_dlpack instead"); + return false; + } + if (tensor->dl_tensor.device.device_type != kDLCUDA) + { + PyErr_SetString(PyExc_BufferError, "cuda_GpuMat.from_dlpack expects a tensor on CUDA device"); + return false; + } + if (tensor->dl_tensor.strides[1] != tensor->dl_tensor.shape[2] || + tensor->dl_tensor.strides[2] != 1) + { + PyErr_SetString(PyExc_BufferError, "Unexpected strides for image. 
Use cuda_GpuMatND.from_dlpack instead"); + return false; + } + int type = DLPackTypeToCVType(tensor->dl_tensor.dtype, (int)tensor->dl_tensor.shape[2]); + if (type == -1) + return false; + + obj = cv::cuda::GpuMat( + static_cast<int>(tensor->dl_tensor.shape[0]), + static_cast<int>(tensor->dl_tensor.shape[1]), + type, + tensor->dl_tensor.data, + tensor->dl_tensor.strides[0] * tensor->dl_tensor.dtype.bits / 8 + ); + if (copy) + obj = obj.clone(); + return true; +} + +template<> +bool parseDLPackTensor(DLManagedTensor* tensor, cv::cuda::GpuMatND& obj, bool copy) +{ + if (tensor->dl_tensor.byte_offset != 0) + { + PyErr_SetString(PyExc_BufferError, "Unimplemented from_dlpack for GpuMatND with memory offset"); + return false; + } + if (tensor->dl_tensor.device.device_type != kDLCUDA) + { + PyErr_SetString(PyExc_BufferError, "cuda_GpuMatND.from_dlpack expects a tensor on CUDA device"); + return false; + } + int type = DLPackTypeToCVType(tensor->dl_tensor.dtype, (int)tensor->dl_tensor.shape[2]); + if (type == -1) + return false; + + std::vector<size_t> steps(tensor->dl_tensor.ndim - 1); + std::vector<int> sizes(tensor->dl_tensor.ndim); + for (int i = 0; i < tensor->dl_tensor.ndim - 1; ++i) + { + steps[i] = tensor->dl_tensor.strides[i] * tensor->dl_tensor.dtype.bits / 8; + sizes[i] = static_cast<int>(tensor->dl_tensor.shape[i]); + } + sizes.back() = static_cast<int>(tensor->dl_tensor.shape[tensor->dl_tensor.ndim - 1]); + obj = cv::cuda::GpuMatND(sizes, type, tensor->dl_tensor.data, steps); + if (copy) + obj = obj.clone(); + return true; +} + +template<> +int GetNumDims(const Ptr<cuda::GpuMat>& src) { return 3; } + +template<> +int GetNumDims(const Ptr<cuda::GpuMatND>& src) { return src->dims; } + +static PyObject* pyDLPackGpuMat(PyObject* self, PyObject* py_args, PyObject* kw) { + Ptr<cuda::GpuMat> * self1 = 0; + if (!pyopencv_cuda_GpuMat_getp(self, self1)) + return failmsgp("Incorrect type of self (must be 'cuda_GpuMat' or its derivative)"); + return to_dlpack(*(self1), self, py_args, kw); +} + +static PyObject* pyDLPackGpuMatND(PyObject* self, PyObject* py_args, PyObject* kw) { + Ptr<cuda::GpuMatND> * self1 = 0; + if (!pyopencv_cuda_GpuMatND_getp(self, self1)) + return failmsgp("Incorrect type of self (must be 'cuda_GpuMatND' or its derivative)"); + return to_dlpack(*(self1), self, py_args, kw); +} + +static PyObject* pyDLPackDeviceCUDA(PyObject*, PyObject*, PyObject*) { + return pyopencv_from(std::tuple<int, int>(kDLCUDA, 0)); +} + +static PyObject* pyGpuMatFromDLPack(PyObject*, PyObject* py_args, PyObject* kw) { + return from_dlpack<cv::cuda::GpuMat>(py_args, kw); +} + +static PyObject* pyGpuMatNDFromDLPack(PyObject*, PyObject* py_args, PyObject* kw) { + return from_dlpack<cv::cuda::GpuMatND>(py_args, kw); +} + +#define PYOPENCV_EXTRA_METHODS_cuda_GpuMat \ + {"__dlpack__", CV_PY_FN_WITH_KW(pyDLPackGpuMat), ""}, \ + {"__dlpack_device__", CV_PY_FN_WITH_KW(pyDLPackDeviceCUDA), ""}, \ + {"from_dlpack", CV_PY_FN_WITH_KW_(pyGpuMatFromDLPack, METH_STATIC), ""}, \ + +#define PYOPENCV_EXTRA_METHODS_cuda_GpuMatND \ + {"__dlpack__", CV_PY_FN_WITH_KW(pyDLPackGpuMatND), ""}, \ + {"__dlpack_device__", CV_PY_FN_WITH_KW(pyDLPackDeviceCUDA), ""}, \ + {"from_dlpack", CV_PY_FN_WITH_KW_(pyGpuMatNDFromDLPack, METH_STATIC), ""}, \ + #endif diff --git a/modules/python/src2/gen2.py b/modules/python/src2/gen2.py index 7d9f75063c..4649b0ebe8 100755 --- a/modules/python/src2/gen2.py +++ b/modules/python/src2/gen2.py @@ -133,6 +133,9 @@ static PyGetSetDef pyopencv_${name}_getseters[] = static PyMethodDef pyopencv_${name}_methods[] = { +#ifdef PYOPENCV_EXTRA_METHODS_${name} + PYOPENCV_EXTRA_METHODS_${name} +#endif ${methods_inits} {NULL, NULL} };
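The `gen2.py` hook above is what splices these extra methods into the generated Python classes. The practical payoff is zero-copy exchange with other DLPack-aware libraries; a sketch (not part of the patch), assuming a CUDA build of OpenCV plus a CUDA-enabled PyTorch as set up in the docker recipe from the description:

```python
import numpy as np
import cv2 as cv
import torch

# OpenCV -> PyTorch: torch.from_dlpack consumes cuda_GpuMat.__dlpack__
gmat = cv.cuda_GpuMat()
gmat.upload((np.random.random((64, 128, 3)) * 255).astype(np.uint8))
t = torch.from_dlpack(gmat)             # CUDA tensor of shape (64, 128, 3)

# PyTorch -> OpenCV: GpuMat.from_dlpack requires a contiguous HWC CUDA tensor,
# since parseDLPackTensor checks strides[1] == channels and strides[2] == 1
t2 = torch.zeros((64, 128, 3), dtype=torch.uint8, device="cuda")
gmat2 = cv.cuda_GpuMat.from_dlpack(t2)
```

With the default `copy=False` the result wraps the producer's allocation in place (`parseDLPackTensor` only clones when `copy` is set), so the producer tensor must outlive the view; pass `copy=True` to get an independent buffer.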
diff --git a/modules/python/test/test_cuda.py b/modules/python/test/test_cuda.py index 90336c1504..c41bfc957d 100644 --- a/modules/python/test/test_cuda.py +++ b/modules/python/test/test_cuda.py @@ -145,5 +145,18 @@ class cuda_test(NewOpenCVTests): self.assertEqual(True, hasattr(cv.cuda, 'fastNlMeansDenoisingColored')) self.assertEqual(True, hasattr(cv.cuda, 'nonLocalMeans')) + def test_dlpack_GpuMat(self): + for dtype in [np.int8, np.uint8, np.int16, np.uint16, np.float16, np.int32, np.float32, np.float64]: + for channels in [2, 3, 5]: + ref = (np.random.random((64, 128, channels)) * 255).astype(dtype) + src = cv.cuda_GpuMat() + src.upload(ref) + dst = cv.cuda_GpuMat.from_dlpack(src) + test = dst.download() + self.assertTrue(np.array_equal(ref, test), + f"Failed test with dtype {dtype} and {channels} channels") + if __name__ == '__main__': NewOpenCVTests.bootstrap()
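The new test covers only the OpenCV-to-OpenCV round trip. An interop check against an external producer could look like the sketch below — again not part of the patch, assuming a CUDA-enabled PyTorch and the usual CUDA-availability skip guards:

```python
import numpy as np
import cv2 as cv
import torch

ref = (np.random.random((64, 128, 3)) * 255).astype(np.uint8)
src = torch.from_numpy(ref).cuda()      # contiguous HWC uint8 tensor on the GPU
dst = cv.cuda_GpuMat.from_dlpack(src)   # wraps the torch allocation
np.testing.assert_array_equal(ref, dst.download())
```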