mirror of
https://github.com/zebrajr/opencv.git
synced 2025-12-06 12:19:50 +01:00
Merge pull request #27777 from pratham-mcw:dnn-softmax-loop-unroll
dnn: improve performance of softmax_3d with loop unrolling #27777

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch

- This PR applies loop unrolling in the softmax function.
- The change does not affect functional correctness.

**Performance Improvements**

- The optimization significantly improves the performance of softmax_3d on Windows ARM64 targets.

<img width="703" height="203" alt="image" src="https://github.com/user-attachments/assets/85997c15-f543-432c-95e5-69099d71fe71" />
This commit is contained in:
parent
0ff1452400
commit
15d3c56548
|
|
@ -48,7 +48,16 @@ void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
|
||||||
size_t innerDim = i % innerSize;
|
size_t innerDim = i % innerSize;
|
||||||
size_t srcOffset = outerDim * outerStep + innerDim;
|
size_t srcOffset = outerDim * outerStep + innerDim;
|
||||||
// copy data from src to buf along axis, since the data may not be continuous
|
// copy data from src to buf along axis, since the data may not be continuous
|
||||||
for (size_t _cnDim = 0; _cnDim < axisStep; _cnDim++)
|
size_t _cnDim = 0;
|
||||||
|
#if CV_ENABLE_UNROLLED && defined(_M_ARM64)
|
||||||
|
for (; _cnDim + 3 < axisStep; _cnDim += 4) {
|
||||||
|
axisBuf[_cnDim + 0] = srcPtr[srcOffset + (_cnDim + 0 + axisBias) * cnStep];
|
||||||
|
axisBuf[_cnDim + 1] = srcPtr[srcOffset + (_cnDim + 1 + axisBias) * cnStep];
|
||||||
|
axisBuf[_cnDim + 2] = srcPtr[srcOffset + (_cnDim + 2 + axisBias) * cnStep];
|
||||||
|
axisBuf[_cnDim + 3] = srcPtr[srcOffset + (_cnDim + 3 + axisBias) * cnStep];
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
for (; _cnDim < axisStep; _cnDim++)
|
||||||
axisBuf[_cnDim] = srcPtr[srcOffset + (_cnDim + axisBias) * cnStep];
|
axisBuf[_cnDim] = srcPtr[srcOffset + (_cnDim + axisBias) * cnStep];
|
||||||
|
|
||||||
float maxVal = -FLT_MAX;
|
float maxVal = -FLT_MAX;
|
||||||
|
|
@ -95,7 +104,16 @@ void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
|
||||||
s = 1.f / s;
|
s = 1.f / s;
|
||||||
|
|
||||||
// copy back the result to src
|
// copy back the result to src
|
||||||
for (size_t _cnDim = 0; _cnDim < axisStep; _cnDim++)
|
_cnDim = 0;
|
||||||
|
#if CV_ENABLE_UNROLLED && defined(_M_ARM64)
|
||||||
|
for (; _cnDim + 3 < axisStep; _cnDim += 4) {
|
||||||
|
dstPtr[srcOffset + (_cnDim + 0 + axisBias) * cnStep] = axisBuf[_cnDim + 0] * s;
|
||||||
|
dstPtr[srcOffset + (_cnDim + 1 + axisBias) * cnStep] = axisBuf[_cnDim + 1] * s;
|
||||||
|
dstPtr[srcOffset + (_cnDim + 2 + axisBias) * cnStep] = axisBuf[_cnDim + 2] * s;
|
||||||
|
dstPtr[srcOffset + (_cnDim + 3 + axisBias) * cnStep] = axisBuf[_cnDim + 3] * s;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
for (; _cnDim < axisStep; _cnDim++)
|
||||||
dstPtr[srcOffset + (_cnDim + axisBias) * cnStep] = axisBuf[_cnDim] * s;
|
dstPtr[srcOffset + (_cnDim + axisBias) * cnStep] = axisBuf[_cnDim] * s;
|
||||||
}
|
}
|
||||||
}, nstripes);
|
}, nstripes);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user