mirror of
https://github.com/zebrajr/opencv.git
synced 2025-12-06 00:19:46 +01:00
Enabled fp16 conversions, but disabled NEON FP16 arithmetic on Windows for ARM for now #27897

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [ ] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake
48 lines
1.5 KiB
C++
48 lines
1.5 KiB
C++
#include <stdio.h>
|
|
|
|
#if (defined __GNUC__ && (defined __arm__ || defined __aarch64__)) /* || (defined _MSC_VER && (defined _M_ARM64 || defined _M_ARM64EC)) */
|
|
// Windows + ARM64 case disabled: https://github.com/opencv/opencv/issues/25052
|
|
#include "arm_neon.h"
|
|
|
|
// Loads the eight floats src[0..7] and returns them converted to half
// precision, packed into one float16x8_t (src[0..3] in the low half,
// src[4..7] in the high half).
float16x8_t vld1q_as_f16(const float* src)
{
    const float32x4_t lo_f32 = vld1q_f32(src);
    const float32x4_t hi_f32 = vld1q_f32(src + 4);

    // Narrow each group of four single-precision values to half precision.
    const float16x4_t lo_f16 = vcvt_f16_f32(lo_f32);
    const float16x4_t hi_f16 = vcvt_f16_f32(hi_f32);

    return vcombine_f16(lo_f16, hi_f16);
}
|
|
|
|
void vprintreg(const char* name, const float16x8_t& r)
|
|
{
|
|
float data[8];
|
|
vst1q_f32(data, vcvt_f32_f16(vget_low_f16(r)));
|
|
vst1q_f32(data + 4, vcvt_f32_f16(vget_high_f16(r)));
|
|
printf("%s: (%.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f)\n",
|
|
name, data[0], data[1], data[2], data[3],
|
|
data[4], data[5], data[6], data[7]);
|
|
}
|
|
|
|
void test()
|
|
{
|
|
const float src1[] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
|
|
const float src2[] = { 1.f, 3.f, 6.f, 10.f, 15.f, 21.f, 28.f, 36.f };
|
|
float16x8_t s1 = vld1q_as_f16(src1), s2 = vld1q_as_f16(src2);
|
|
float16x8_t d = vsubq_f16(s1, s1);
|
|
d = vfmaq_laneq_f16(d, s1, s2, 0);
|
|
d = vfmaq_laneq_f16(d, s1, s2, 1);
|
|
d = vfmaq_laneq_f16(d, s1, s2, 2);
|
|
d = vfmaq_laneq_f16(d, s1, s2, 3);
|
|
d = vfmaq_laneq_f16(d, s1, s2, 4);
|
|
d = vfmaq_laneq_f16(d, s1, s2, 5);
|
|
d = vfmaq_laneq_f16(d, s1, s2, 6);
|
|
d = vfmaq_laneq_f16(d, s1, s2, 7);
|
|
vprintreg("s1*s2[0]+s1*s2[1] + ... + s1*s2[7]", d);
|
|
}
|
|
#else
|
|
#error "NEON FP16 is not supported"
|
|
#endif
|
|
|
|
int main()
|
|
{
|
|
test();
|
|
return 0;
|
|
}
|