x86/aacpsdsp: add ps_hybrid_analysis_fma3
This replaces the sse3 version, which was not really faster than the sse one. Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
parent
2bcf86d53d
commit
48615f0a78
@ -403,10 +403,8 @@ HYBRID_SYNTHESIS_DEINT
|
|||||||
%macro PS_HYBRID_ANALYSIS_IN 1
|
%macro PS_HYBRID_ANALYSIS_IN 1
|
||||||
movu m0, [inq+mmsize*%1]
|
movu m0, [inq+mmsize*%1]
|
||||||
movu m1, [inq+mmsize*(5-%1)+8]
|
movu m1, [inq+mmsize*(5-%1)+8]
|
||||||
mova m3, m0
|
shufps m3, m0, m0, q2301
|
||||||
mova m4, m1
|
shufps m4, m1, m1, q0123
|
||||||
shufps m3, m3, q2301
|
|
||||||
shufps m4, m4, q0123
|
|
||||||
shufps m1, m1, q1032
|
shufps m1, m1, q1032
|
||||||
%if cpuflag(sse3)
|
%if cpuflag(sse3)
|
||||||
addsubps m3, m4
|
addsubps m3, m4
|
||||||
@ -424,6 +422,15 @@ HYBRID_SYNTHESIS_DEINT
|
|||||||
%macro PS_HYBRID_ANALYSIS_LOOP 3
|
%macro PS_HYBRID_ANALYSIS_LOOP 3
|
||||||
mova m2, [filterq+nq+mmsize*%3]
|
mova m2, [filterq+nq+mmsize*%3]
|
||||||
shufps m2, m2, q2301
|
shufps m2, m2, q2301
|
||||||
|
%if cpuflag(fma3)
|
||||||
|
%if %3
|
||||||
|
fmaddps m3, m2, [rsp+mmsize*%3*2], m3
|
||||||
|
fmaddps m0, m2, [rsp+mmsize+mmsize*%3*2], m0
|
||||||
|
%else
|
||||||
|
mulps m3, m2, [rsp]
|
||||||
|
mulps m0, m2, [rsp+mmsize]
|
||||||
|
%endif
|
||||||
|
%else ; cpuflag(sse)
|
||||||
mova %2, [rsp+mmsize*%3*2]
|
mova %2, [rsp+mmsize*%3*2]
|
||||||
mova %1, [rsp+mmsize+mmsize*%3*2]
|
mova %1, [rsp+mmsize+mmsize*%3*2]
|
||||||
mulps %2, m2
|
mulps %2, m2
|
||||||
@ -432,20 +439,21 @@ HYBRID_SYNTHESIS_DEINT
|
|||||||
addps m3, %2
|
addps m3, %2
|
||||||
addps m0, %1
|
addps m0, %1
|
||||||
%endif
|
%endif
|
||||||
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro PS_HYBRID_ANALYSIS 0
|
%macro PS_HYBRID_ANALYSIS 0
|
||||||
cglobal ps_hybrid_analysis, 5, 5, 8, 24 * 4, out, in, filter, stride, n
|
cglobal ps_hybrid_analysis, 5, 5, 5 + notcpuflag(fma3) * 3, 24 * 4, out, in, filter, stride, n
|
||||||
%if cpuflag(sse3)
|
%if cpuflag(sse3)
|
||||||
%define MOVH movsd
|
%define MOVH movsd
|
||||||
%else
|
%else
|
||||||
%define MOVH movlps
|
%define MOVH movlps
|
||||||
|
mova m7, [ps_p1m1p1m1]
|
||||||
%endif
|
%endif
|
||||||
shl strideq, 3
|
shl strideq, 3
|
||||||
shl nd, 6
|
shl nd, 6
|
||||||
add filterq, nq
|
add filterq, nq
|
||||||
neg nq
|
neg nq
|
||||||
mova m7, [ps_p1m1p1m1]
|
|
||||||
PS_HYBRID_ANALYSIS_IN 0
|
PS_HYBRID_ANALYSIS_IN 0
|
||||||
PS_HYBRID_ANALYSIS_IN 1
|
PS_HYBRID_ANALYSIS_IN 1
|
||||||
PS_HYBRID_ANALYSIS_IN 2
|
PS_HYBRID_ANALYSIS_IN 2
|
||||||
@ -456,30 +464,22 @@ align 16
|
|||||||
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
|
PS_HYBRID_ANALYSIS_LOOP m5, m6, 1
|
||||||
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
|
PS_HYBRID_ANALYSIS_LOOP m5, m6, 2
|
||||||
|
|
||||||
%if cpuflag(sse3)
|
shufps m1, m3, m3, q2301
|
||||||
pshufd m3, m3, q2301
|
shufps m2, m0, m0, q2301
|
||||||
xorps m0, m7
|
|
||||||
hsubps m3, m0
|
|
||||||
pshufd m1, m3, q0020
|
|
||||||
pshufd m3, m3, q0031
|
|
||||||
addps m1, m3
|
|
||||||
movsd m2, [inq+6*8]
|
|
||||||
%else
|
|
||||||
mova m1, m3
|
|
||||||
mova m2, m0
|
|
||||||
shufps m1, m1, q2301
|
|
||||||
shufps m2, m2, q2301
|
|
||||||
subps m1, m3
|
subps m1, m3
|
||||||
addps m2, m0
|
addps m2, m0
|
||||||
unpcklps m3, m1, m2
|
unpcklps m3, m1, m2
|
||||||
unpckhps m1, m2
|
unpckhps m1, m2
|
||||||
addps m1, m3
|
addps m1, m3
|
||||||
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
|
movu m2, [inq+6*8] ; faster than movlps and no risk of overread
|
||||||
%endif
|
|
||||||
movss m3, [filterq+nq+8*6]
|
movss m3, [filterq+nq+8*6]
|
||||||
SPLATD m3
|
SPLATD m3
|
||||||
|
%if cpuflag(fma3)
|
||||||
|
fmaddps m1, m2, m3, m1
|
||||||
|
%else
|
||||||
mulps m2, m3
|
mulps m2, m3
|
||||||
addps m1, m2
|
addps m1, m2
|
||||||
|
%endif
|
||||||
MOVH [outq], m1
|
MOVH [outq], m1
|
||||||
add outq, strideq
|
add outq, strideq
|
||||||
add nq, 64
|
add nq, 64
|
||||||
@ -489,5 +489,5 @@ align 16
|
|||||||
|
|
||||||
INIT_XMM sse
|
INIT_XMM sse
|
||||||
PS_HYBRID_ANALYSIS
|
PS_HYBRID_ANALYSIS
|
||||||
INIT_XMM sse3
|
INIT_XMM fma3
|
||||||
PS_HYBRID_ANALYSIS
|
PS_HYBRID_ANALYSIS
|
||||||
|
@ -33,7 +33,7 @@ void ff_ps_mul_pair_single_sse (float (*dst)[2], float (*src0)[2],
|
|||||||
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
|
void ff_ps_hybrid_analysis_sse (float (*out)[2], float (*in)[2],
|
||||||
const float (*filter)[8][2],
|
const float (*filter)[8][2],
|
||||||
ptrdiff_t stride, int n);
|
ptrdiff_t stride, int n);
|
||||||
void ff_ps_hybrid_analysis_sse3(float (*out)[2], float (*in)[2],
|
void ff_ps_hybrid_analysis_fma3(float (*out)[2], float (*in)[2],
|
||||||
const float (*filter)[8][2],
|
const float (*filter)[8][2],
|
||||||
ptrdiff_t stride, int n);
|
ptrdiff_t stride, int n);
|
||||||
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
|
void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
|
||||||
@ -64,9 +64,11 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
|
|||||||
s->add_squares = ff_ps_add_squares_sse3;
|
s->add_squares = ff_ps_add_squares_sse3;
|
||||||
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
|
s->stereo_interpolate[0] = ff_ps_stereo_interpolate_sse3;
|
||||||
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
|
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
|
||||||
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
|
|
||||||
}
|
}
|
||||||
if (EXTERNAL_SSE4(cpu_flags)) {
|
if (EXTERNAL_SSE4(cpu_flags)) {
|
||||||
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
|
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
|
||||||
}
|
}
|
||||||
|
if (EXTERNAL_FMA3(cpu_flags)) {
|
||||||
|
s->hybrid_analysis = ff_ps_hybrid_analysis_fma3;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user