Merge pull request #27777 from pratham-mcw:dnn-softmax-loop-unroll

dnn: improve performance of softmax_3d with loop unrolling #27777

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under the GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch

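- This PR manually unrolls, by a factor of 4, the two strided copy loops in the softmax function: the copy from the source into the axis buffer and the scaled copy of the result back to the destination. The unrolled paths are compiled only when `CV_ENABLE_UNROLLED` is enabled and `_M_ARM64` is defined.
- The change does not affect functional correctness: a scalar loop still handles the remaining (fewer than 4) elements, and builds without the unrolled path run only that scalar loop over the full range, as before.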

**Performance Improvements**
- The optimization significantly improves the performance of softmax_3d on Windows ARM64 targets.
<img width="703" height="203" alt="image" src="https://github.com/user-attachments/assets/85997c15-f543-432c-95e5-69099d71fe71" />
This commit is contained in:
pratham-mcw 2025-09-19 19:15:04 +05:30 committed by GitHub
parent 0ff1452400
commit 15d3c56548
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -48,7 +48,16 @@ void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
size_t innerDim = i % innerSize; size_t innerDim = i % innerSize;
size_t srcOffset = outerDim * outerStep + innerDim; size_t srcOffset = outerDim * outerStep + innerDim;
// copy data from src to buf along axis, since the data may not be continuous // copy data from src to buf along axis, since the data may not be continuous
for (size_t _cnDim = 0; _cnDim < axisStep; _cnDim++) size_t _cnDim = 0;
#if CV_ENABLE_UNROLLED && defined(_M_ARM64)
for (; _cnDim + 3 < axisStep; _cnDim += 4) {
axisBuf[_cnDim + 0] = srcPtr[srcOffset + (_cnDim + 0 + axisBias) * cnStep];
axisBuf[_cnDim + 1] = srcPtr[srcOffset + (_cnDim + 1 + axisBias) * cnStep];
axisBuf[_cnDim + 2] = srcPtr[srcOffset + (_cnDim + 2 + axisBias) * cnStep];
axisBuf[_cnDim + 3] = srcPtr[srcOffset + (_cnDim + 3 + axisBias) * cnStep];
}
#endif
for (; _cnDim < axisStep; _cnDim++)
axisBuf[_cnDim] = srcPtr[srcOffset + (_cnDim + axisBias) * cnStep]; axisBuf[_cnDim] = srcPtr[srcOffset + (_cnDim + axisBias) * cnStep];
float maxVal = -FLT_MAX; float maxVal = -FLT_MAX;
@ -95,7 +104,16 @@ void softmax(Mat &dst, const Mat &src, int axis, int axisBias, int axisStep){
s = 1.f / s; s = 1.f / s;
// copy back the result to src // copy back the result to src
for (size_t _cnDim = 0; _cnDim < axisStep; _cnDim++) _cnDim = 0;
#if CV_ENABLE_UNROLLED && defined(_M_ARM64)
for (; _cnDim + 3 < axisStep; _cnDim += 4) {
dstPtr[srcOffset + (_cnDim + 0 + axisBias) * cnStep] = axisBuf[_cnDim + 0] * s;
dstPtr[srcOffset + (_cnDim + 1 + axisBias) * cnStep] = axisBuf[_cnDim + 1] * s;
dstPtr[srcOffset + (_cnDim + 2 + axisBias) * cnStep] = axisBuf[_cnDim + 2] * s;
dstPtr[srcOffset + (_cnDim + 3 + axisBias) * cnStep] = axisBuf[_cnDim + 3] * s;
}
#endif
for (; _cnDim < axisStep; _cnDim++)
dstPtr[srcOffset + (_cnDim + axisBias) * cnStep] = axisBuf[_cnDim] * s; dstPtr[srcOffset + (_cnDim + axisBias) * cnStep] = axisBuf[_cnDim] * s;
} }
}, nstripes); }, nstripes);