avfilter/bwdif: add avx2 filter_line function
8-bit: 2.24x faster (1925±1.3 vs. 859±2.2 decicycles) compared with ssse3
10-bit: 2.00x faster (1703±1.7 vs. 853±2.0 decicycles) compared with ssse3
This commit is contained in:
parent
a937723ca9
commit
073ec3b9da
@ -26,18 +26,22 @@
|
|||||||
|
|
||||||
%include "libavutil/x86/x86util.asm"
|
%include "libavutil/x86/x86util.asm"
|
||||||
|
|
||||||
SECTION_RODATA
|
SECTION_RODATA 32
|
||||||
|
|
||||||
pw_coefhf: times 4 dw 1016, 5570
|
pw_coefhf: times 8 dw 1016, 5570
|
||||||
pw_coefhf1: times 8 dw -3801
|
pw_coefhf1: times 16 dw -3801
|
||||||
pw_coefsp: times 4 dw 5077, -981
|
pw_coefsp: times 8 dw 5077, -981
|
||||||
pw_splfdif: times 4 dw -768, 768
|
pw_splfdif: times 8 dw -768, 768
|
||||||
|
|
||||||
SECTION .text
|
SECTION .text
|
||||||
|
|
||||||
%macro LOAD8 2
|
%macro LOAD8 2
|
||||||
|
%if mmsize == 32
|
||||||
|
pmovzxbw %1, %2
|
||||||
|
%else
|
||||||
movh %1, %2
|
movh %1, %2
|
||||||
punpcklbw %1, m7
|
punpcklbw %1, m7
|
||||||
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro LOAD12 2
|
%macro LOAD12 2
|
||||||
@ -45,8 +49,14 @@ SECTION .text
|
|||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro DISP8 0
|
%macro DISP8 0
|
||||||
|
%if mmsize == 32
|
||||||
|
vextracti128 xm1, m2, 1
|
||||||
|
packuswb xm2, xm1
|
||||||
|
movu [dstq], xm2
|
||||||
|
%else
|
||||||
packuswb m2, m2
|
packuswb m2, m2
|
||||||
movh [dstq], m2
|
movh [dstq], m2
|
||||||
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro DISP12 0
|
%macro DISP12 0
|
||||||
@ -244,8 +254,12 @@ cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
|
|||||||
prefs, mrefs, prefs2, mrefs2, \
|
prefs, mrefs, prefs2, mrefs2, \
|
||||||
prefs3, mrefs3, prefs4, \
|
prefs3, mrefs3, prefs4, \
|
||||||
mrefs4, parity, clip_max
|
mrefs4, parity, clip_max
|
||||||
|
%if mmsize == 32
|
||||||
|
vpbroadcastw m12, WORD clip_maxm
|
||||||
|
%else
|
||||||
movd m12, DWORD clip_maxm
|
movd m12, DWORD clip_maxm
|
||||||
SPLATW m12, m12, 0
|
SPLATW m12, m12, 0
|
||||||
|
%endif
|
||||||
%else
|
%else
|
||||||
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
|
cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
|
||||||
prefs, mrefs, prefs2, mrefs2, \
|
prefs, mrefs, prefs2, mrefs2, \
|
||||||
@ -264,3 +278,8 @@ INIT_XMM ssse3
|
|||||||
BWDIF
|
BWDIF
|
||||||
INIT_XMM sse2
|
INIT_XMM sse2
|
||||||
BWDIF
|
BWDIF
|
||||||
|
|
||||||
|
%if HAVE_AVX2_EXTERNAL && ARCH_X86_64
|
||||||
|
INIT_YMM avx2
|
||||||
|
BWDIF
|
||||||
|
%endif
|
||||||
|
@ -32,6 +32,10 @@ void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next,
|
|||||||
int w, int prefs, int mrefs, int prefs2,
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||||
int mrefs4, int parity, int clip_max);
|
int mrefs4, int parity, int clip_max);
|
||||||
|
void ff_bwdif_filter_line_avx2(void *dst, void *prev, void *cur, void *next,
|
||||||
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
|
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||||
|
int mrefs4, int parity, int clip_max);
|
||||||
|
|
||||||
void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next,
|
void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next,
|
||||||
int w, int prefs, int mrefs, int prefs2,
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
@ -41,6 +45,10 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne
|
|||||||
int w, int prefs, int mrefs, int prefs2,
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||||
int mrefs4, int parity, int clip_max);
|
int mrefs4, int parity, int clip_max);
|
||||||
|
void ff_bwdif_filter_line_12bit_avx2(void *dst, void *prev, void *cur, void *next,
|
||||||
|
int w, int prefs, int mrefs, int prefs2,
|
||||||
|
int mrefs2, int prefs3, int mrefs3, int prefs4,
|
||||||
|
int mrefs4, int parity, int clip_max);
|
||||||
|
|
||||||
av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
|
av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
|
||||||
{
|
{
|
||||||
@ -51,10 +59,14 @@ av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
|
|||||||
bwdif->filter_line = ff_bwdif_filter_line_sse2;
|
bwdif->filter_line = ff_bwdif_filter_line_sse2;
|
||||||
if (EXTERNAL_SSSE3(cpu_flags))
|
if (EXTERNAL_SSSE3(cpu_flags))
|
||||||
bwdif->filter_line = ff_bwdif_filter_line_ssse3;
|
bwdif->filter_line = ff_bwdif_filter_line_ssse3;
|
||||||
|
if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
|
||||||
|
bwdif->filter_line = ff_bwdif_filter_line_avx2;
|
||||||
} else if (bit_depth <= 12) {
|
} else if (bit_depth <= 12) {
|
||||||
if (EXTERNAL_SSE2(cpu_flags))
|
if (EXTERNAL_SSE2(cpu_flags))
|
||||||
bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
|
bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
|
||||||
if (EXTERNAL_SSSE3(cpu_flags))
|
if (EXTERNAL_SSSE3(cpu_flags))
|
||||||
bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
|
bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
|
||||||
|
if (ARCH_X86_64 && EXTERNAL_AVX2(cpu_flags))
|
||||||
|
bwdif->filter_line = ff_bwdif_filter_line_12bit_avx2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user