Skip to content

Commit

Permalink
avfilter/tonemapx: use fma neon intrinsics
Browse files Browse the repository at this point in the history
No performance difference was observed, and some compilers
even generate the same instructions for the mla and fma intrinsics.

This is just a cleanup so that float32 code always uses fma, for consistency.
  • Loading branch information
gnattu committed Sep 29, 2024
1 parent 5b8c2f3 commit 1e09671
Showing 1 changed file with 12 additions and 12 deletions.
24 changes: 12 additions & 12 deletions debian/patches/0060-add-simd-optimized-tonemapx-filter.patch
Original file line number Diff line number Diff line change
Expand Up @@ -430,9 +430,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
+ float32x4_t luma4 = vdupq_n_f32(0);
+ float32x4_t overbright4;
+ // Group A
+ luma4 = vmlaq_n_f32(luma4, r_linx4a, (float)av_q2d(coeffs->cr));
+ luma4 = vmlaq_n_f32(luma4, g_linx4a, (float)av_q2d(coeffs->cg));
+ luma4 = vmlaq_n_f32(luma4, b_linx4a, (float)av_q2d(coeffs->cb));
+ luma4 = vfmaq_n_f32(luma4, r_linx4a, (float)av_q2d(coeffs->cr));
+ luma4 = vfmaq_n_f32(luma4, g_linx4a, (float)av_q2d(coeffs->cg));
+ luma4 = vfmaq_n_f32(luma4, b_linx4a, (float)av_q2d(coeffs->cb));
+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4));
+ r_linx4a = vmlsq_f32(r_linx4a, r_linx4a, overbright4);
+ r_linx4a = vmlaq_f32(r_linx4a, luma4, overbright4);
Expand All @@ -442,9 +442,9 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
+ b_linx4a = vmlaq_f32(b_linx4a, luma4, overbright4);
+ // Group B
+ luma4 = vdupq_n_f32(0);
+ luma4 = vmlaq_n_f32(luma4, r_linx4b, (float)av_q2d(coeffs->cr));
+ luma4 = vmlaq_n_f32(luma4, g_linx4b, (float)av_q2d(coeffs->cg));
+ luma4 = vmlaq_n_f32(luma4, b_linx4b, (float)av_q2d(coeffs->cb));
+ luma4 = vfmaq_n_f32(luma4, r_linx4b, (float)av_q2d(coeffs->cr));
+ luma4 = vfmaq_n_f32(luma4, g_linx4b, (float)av_q2d(coeffs->cg));
+ luma4 = vfmaq_n_f32(luma4, b_linx4b, (float)av_q2d(coeffs->cb));
+ overbright4 = vdivq_f32(vmaxq_f32(vsubq_f32(luma4, desat4), eps_x4), vmaxq_f32(luma4, eps_x4));
+ r_linx4b = vmlsq_f32(r_linx4b, r_linx4b, overbright4);
+ r_linx4b = vmlaq_f32(r_linx4b, luma4, overbright4);
Expand All @@ -462,12 +462,12 @@ Index: FFmpeg/libavfilter/aarch64/vf_tonemapx_intrin_neon.c
+ g_linx4b = vmulq_f32(g_linx4b, mapvalx4b);
+ b_linx4b = vmulq_f32(b_linx4b, mapvalx4b);
+
+ r_linx4a = vmlaq_n_f32(offset, r_linx4a, 32767);
+ r_linx4b = vmlaq_n_f32(offset, r_linx4b, 32767);
+ g_linx4a = vmlaq_n_f32(offset, g_linx4a, 32767);
+ g_linx4b = vmlaq_n_f32(offset, g_linx4b, 32767);
+ b_linx4a = vmlaq_n_f32(offset, b_linx4a, 32767);
+ b_linx4b = vmlaq_n_f32(offset, b_linx4b, 32767);
+ r_linx4a = vfmaq_n_f32(offset, r_linx4a, 32767);
+ r_linx4b = vfmaq_n_f32(offset, r_linx4b, 32767);
+ g_linx4a = vfmaq_n_f32(offset, g_linx4a, 32767);
+ g_linx4b = vfmaq_n_f32(offset, g_linx4b, 32767);
+ b_linx4a = vfmaq_n_f32(offset, b_linx4a, 32767);
+ b_linx4b = vfmaq_n_f32(offset, b_linx4b, 32767);
+
+ rx4a = vcvtq_s32_f32(r_linx4a);
+ rx4a = vminq_s32(rx4a, output_upper_bound);
Expand Down

0 comments on commit 1e09671

Please sign in to comment.