avfilter/bwdif: add avx2 filter_line function

8-bit:
2.24x faster than ssse3 (ssse3: 1925±1.3 vs. avx2: 859±2.2 decicycles)
10-bit:
2.00x faster than ssse3 (ssse3: 1703±1.7 vs. avx2: 853±2.0 decicycles)
This commit is contained in:
James Darnley 2023-02-20 20:55:08 +01:00
parent a937723ca9
commit 073ec3b9da
2 changed files with 36 additions and 5 deletions

View File

@ -26,18 +26,22 @@
%include "libavutil/x86/x86util.asm" %include "libavutil/x86/x86util.asm"
; Read-only tables of 16-bit (dw) constants.
; In the widened version the section is requested with 32-byte alignment and
; every table is 32 bytes long (8 repetitions of a word pair, or 16 single
; words), i.e. the size of one ymm register; previously each was 16 bytes.
; NOTE(review): the meaning of the individual coefficients is not visible in
; this hunk -- see the users inside the BWDIF macro.
SECTION_RODATA SECTION_RODATA 32
pw_coefhf: times 4 dw 1016, 5570 pw_coefhf: times 8 dw 1016, 5570
pw_coefhf1: times 8 dw -3801 pw_coefhf1: times 16 dw -3801
pw_coefsp: times 4 dw 5077, -981 pw_coefsp: times 8 dw 5077, -981
pw_splfdif: times 4 dw -768, 768 pw_splfdif: times 8 dw -768, 768
SECTION .text SECTION .text
; LOAD8 dst, src
;   %1 = destination vector register, %2 = source operand.
;   Loads packed bytes from %2 and widens them to packed words in %1.
%macro LOAD8 2 %macro LOAD8 2
; 32-byte (ymm) build: a single pmovzxbw zero-extends each byte to a word.
%if mmsize == 32
pmovzxbw %1, %2
%else
; Narrower build: load with movh, then interleave the bytes of %1 with the
; bytes of m7. This yields zero-extended words only if m7 is all zeros --
; presumably the caller clears m7 beforehand; not visible in this hunk.
movh %1, %2 movh %1, %2
punpcklbw %1, m7 punpcklbw %1, m7
%endif
%endmacro %endmacro
%macro LOAD12 2 %macro LOAD12 2
@ -45,8 +49,14 @@ SECTION .text
%endmacro %endmacro
; DISP8 (no arguments)
;   Narrows the packed words in m2 to bytes with unsigned saturation
;   (packuswb) and stores them at [dstq].
%macro DISP8 0 %macro DISP8 0
%if mmsize == 32
; ymm build: packuswb works per 128-bit lane, so the upper lane of m2 is
; first extracted into xm1 and then packed together with the lower lane,
; keeping the 16 result bytes in source order. Overwrites xm1 as scratch.
vextracti128 xm1, m2, 1
packuswb xm2, xm1
; Store all 16 bytes of xm2 with an unaligned move.
movu [dstq], xm2
%else
; Narrower build: pack m2 with itself and store via movh (x86inc helper;
; presumably writes only the low half of the register -- confirm in x86inc).
packuswb m2, m2 packuswb m2, m2
movh [dstq], m2 movh [dstq], m2
%endif
%endmacro %endmacro
%macro DISP12 0 %macro DISP12 0
@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
prefs, mrefs, prefs2, mrefs2, \ prefs, mrefs, prefs2, mrefs2, \
prefs3, mrefs3, prefs4, \ prefs3, mrefs3, prefs4, \
mrefs4, parity, clip_max mrefs4, parity, clip_max
%if mmsize == 32
vpbroadcastw m12, WORD clip_maxm
%else
movd m12, DWORD clip_maxm movd m12, DWORD clip_maxm
SPLATW m12, m12, 0 SPLATW m12, m12, 0
%endif
%else %else
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \ cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
prefs, mrefs, prefs2, mrefs2, \ prefs, mrefs, prefs2, mrefs2, \
@ -264,3 +278,8 @@ INIT_XMM ssse3
BWDIF BWDIF
INIT_XMM sse2 INIT_XMM sse2
BWDIF BWDIF
; Expand the BWDIF macro once more with INIT_YMM avx2 so that the
; `mmsize == 32` branches of LOAD8/DISP8 and of the 12-bit prologue are used.
; Assembled only when HAVE_AVX2_EXTERNAL and ARCH_X86_64 are both set; the C
; side guards its use of the avx2 functions with ARCH_X86_64 as well.
%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
INIT_YMM avx2
BWDIF
%endif

View File

@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2, int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max); int mrefs4, int parity, int clip_max);
/* AVX2 variant of the line filter; takes the same parameter list as
 * ff_bwdif_filter_line_ssse3(). ff_bwdif_init_x86() installs it as
 * bwdif->filter_line only when ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags).
 * NOTE(review): presumably emitted by the BWDIF macro under INIT_YMM avx2
 * in the accompanying assembly file -- confirm the symbol name there. */
void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next, void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2, int w, int prefs, int mrefs, int prefs2,
@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne
int w, int prefs, int mrefs, int prefs2, int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4, int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max); int mrefs4, int parity, int clip_max);
/* AVX2 variant of the 12-bit line filter; takes the same parameter list as
 * ff_bwdif_filter_line_12bit_ssse3(). ff_bwdif_init_x86() installs it in the
 * `bit_depth <= 12` branch only when ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags).
 * NOTE(review): presumably emitted by the BWDIF macro under INIT_YMM avx2
 * in the accompanying assembly file -- confirm the symbol name there. */
void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void *next,
int w, int prefs, int mrefs, int prefs2,
int mrefs2, int prefs3, int mrefs3, int prefs4,
int mrefs4, int parity, int clip_max);
av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth) av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
{ {
@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
bwdif->filter_line = ff_bwdif_filter_line_sse2; bwdif->filter_line = ff_bwdif_filter_line_sse2;
if (EXTERNAL_SSSE3(cpu_flags)) if (EXTERNAL_SSSE3(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_ssse3; bwdif->filter_line = ff_bwdif_filter_line_ssse3;
if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_avx2;
} else if (bit_depth <= 12) { } else if (bit_depth <= 12) {
if (EXTERNAL_SSE2(cpu_flags)) if (EXTERNAL_SSE2(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2; bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
if (EXTERNAL_SSSE3(cpu_flags)) if (EXTERNAL_SSSE3(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3; bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
} }
} }