x86/aacpsdsp: add ps_hybrid_analysis_fma3

This replaces the sse3 version, which was not really faster than the sse one.

Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
James Almer 2022-09-20 16:02:49 -03:00
parent 2bcf86d53d
commit 48615f0a78
2 changed files with 25 additions and 23 deletions

View File

@ -403,10 +403,8 @@ HYBRID_SYNTHESIS_DEINT
%macro PS_HYBRID_ANALYSIS_IN 1 %macro PS_HYBRID_ANALYSIS_IN 1
movu m0, [inq+mmsize*%1] movu m0, [inq+mmsize*%1]
movu m1, [inq+mmsize*(5-%1)+8] movu m1, [inq+mmsize*(5-%1)+8]
mova m3, m0 shufps m3, m0, m0, q2301
mova m4, m1 shufps m4, m1, m1, q0123
shufps m3, m3, q2301
shufps m4, m4, q0123
shufps m1, m1, q1032 shufps m1, m1, q1032
%if cpuflag(sse3) %if cpuflag(sse3)
addsubps m3, m4 addsubps m3, m4
@ -424,6 +422,15 @@ HYBRID_SYNTHESIS_DEINT
%macro PS_HYBRID_ANALYSIS_LOOP 3 %macro PS_HYBRID_ANALYSIS_LOOP 3
mova m2, [filterq+nq+mmsize*%3] mova m2, [filterq+nq+mmsize*%3]
shufps m2, m2, q2301 shufps m2, m2, q2301
%if cpuflag(fma3)
%if %3
fmaddps m3, m2, [rsp+mmsize*%3*2], m3
fmaddps m0, m2, [rsp+mmsize+mmsize*%3*2], m0
%else
mulps m3, m2, [rsp]
mulps m0, m2, [rsp+mmsize]
%endif
%else ; cpuflag(sse)
mova %2, [rsp+mmsize*%3*2] mova %2, [rsp+mmsize*%3*2]
mova %1, [rsp+mmsize+mmsize*%3*2] mova %1, [rsp+mmsize+mmsize*%3*2]
mulps %2, m2 mulps %2, m2
@ -432,20 +439,21 @@ HYBRID_SYNTHESIS_DEINT
addps m3, %2 addps m3, %2
addps m0, %1 addps m0, %1
%endif %endif
%endif
%endmacro %endmacro
%macro PS_HYBRID_ANALYSIS 0 %macro PS_HYBRID_ANALYSIS 0
cglobal ps_hybrid_analysis, 5, 5, 8, 24 * 4, out, in, filter, stride, n cglobal ps_hybrid_analysis, 5, 5, 5 + notcpuflag(fma3) * 3, 24 * 4, out, in, filter, stride, n
%if cpuflag(sse3) %if cpuflag(sse3)
%define MOVH movsd %define MOVH movsd
%else %else
%define MOVH movlps %define MOVH movlps
mova m7, [ps_p1m1p1m1]
%endif %endif
shl strideq, 3 shl strideq, 3
shl nd, 6 shl nd, 6
add filterq, nq add filterq, nq
neg nq neg nq
mova m7, [ps_p1m1p1m1]
PS_HYBRID_ANALYSIS_IN 0 PS_HYBRID_ANALYSIS_IN 0
PS_HYBRID_ANALYSIS_IN 1 PS_HYBRID_ANALYSIS_IN 1
PS_HYBRID_ANALYSIS_IN 2 PS_HYBRID_ANALYSIS_IN 2
@ -456,30 +464,22 @@ align 16
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1 PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2 PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
%if cpuflag(sse3) shufps m1, m3, m3, q2301
pshufd m3, m3, q2301 shufps m2, m0, m0, q2301
xorps m0, m7
hsubps m3, m0
pshufd m1, m3, q0020
pshufd m3, m3, q0031
addps m1, m3
movsd m2, [inq+6*8]
%else
mova m1, m3
mova m2, m0
shufps m1, m1, q2301
shufps m2, m2, q2301
subps m1, m3 subps m1, m3
addps m2, m0 addps m2, m0
unpcklps m3, m1, m2 unpcklps m3, m1, m2
unpckhps m1, m2 unpckhps m1, m2
addps m1, m3 addps m1, m3
movu m2, [inq+6*8] ; faster than movlps and no risk of overread movu m2, [inq+6*8] ; faster than movlps and no risk of overread
%endif
movss m3, [filterq+nq+8*6] movss m3, [filterq+nq+8*6]
SPLATD m3 SPLATD m3
%if cpuflag(fma3)
fmaddps m1, m2, m3, m1
%else
mulps m2, m3 mulps m2, m3
addps m1, m2 addps m1, m2
%endif
MOVH [outq], m1 MOVH [outq], m1
add outq, strideq add outq, strideq
add nq, 64 add nq, 64
@ -489,5 +489,5 @@ align 16
INIT_XMM sse INIT_XMM sse
PS_HYBRID_ANALYSIS PS_HYBRID_ANALYSIS
INIT_XMM sse3 INIT_XMM fma3
PS_HYBRID_ANALYSIS PS_HYBRID_ANALYSIS

View File

@ -33,7 +33,7 @@ void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2], void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
const float (*filter)[8][2], const float (*filter)[8][2],
ptrdiff_t stride, int n); ptrdiff_t stride, int n);
void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2], void ff_ps_hybrid_analysis_fma3(float (*out)[2], float (*in)[2],
const float (*filter)[8][2], const float (*filter)[8][2],
ptrdiff_t stride, int n); ptrdiff_t stride, int n);
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2], void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
@ -64,9 +64,11 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
s->add_squares = ff_ps_add_squares_sse3; s->add_squares = ff_ps_add_squares_sse3;
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3; s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3; s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
} }
if (EXTERNAL_SSE4(cpu_flags)) { if (EXTERNAL_SSE4(cpu_flags)) {
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4; s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
} }
if (EXTERNAL_FMA3(cpu_flags)) {
s->hybrid_analysis = ff_ps_hybrid_analysis_fma3;
}
} }