avcodec/h264: add avx 8-bit h264_idct_add
Haswell: 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext.
Skylake-U: 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext.
This commit is contained in:
parent
b5325c6711
commit
f61d454ca1
@ -65,7 +65,15 @@ SECTION .text
|
|||||||
|
|
||||||
IDCT4_1D w, 0, 1, 2, 3, 4, 5
|
IDCT4_1D w, 0, 1, 2, 3, 4, 5
|
||||||
mova m6, [pw_32]
|
mova m6, [pw_32]
|
||||||
|
%if mmsize == 8
|
||||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||||
|
%else
|
||||||
|
punpcklwd m0, m1
|
||||||
|
punpcklwd m2, m3
|
||||||
|
SBUTTERFLY dq, 0, 2, 4
|
||||||
|
MOVHL m1, m0
|
||||||
|
MOVHL m3, m2
|
||||||
|
%endif
|
||||||
paddw m0, m6
|
paddw m0, m6
|
||||||
IDCT4_1D w, 0, 1, 2, 3, 4, 5
|
IDCT4_1D w, 0, 1, 2, 3, 4, 5
|
||||||
pxor m7, m7
|
pxor m7, m7
|
||||||
@ -1131,3 +1139,26 @@ INIT_MMX mmx
|
|||||||
IDCT_DC_DEQUANT 0
|
IDCT_DC_DEQUANT 0
|
||||||
INIT_MMX sse2
|
INIT_MMX sse2
|
||||||
IDCT_DC_DEQUANT 7
|
IDCT_DC_DEQUANT 7
|
||||||
|
|
||||||
|
INIT_XMM avx
|
||||||
|
|
||||||
|
; %unmacro STORE_DIFFx2 8 ; would remove the x86util.asm definition of this macro, but yasm does not support %unmacro yet
|
||||||
|
; STORE_DIFFx2 (8-operand form): processes two 4-byte rows, one at [%7] and
; one at [%7+%8]. Each row is loaded, combined with the word values in
; %1 / %2 (after shifting them right by %6), packed back to bytes with
; unsigned saturation, and stored to the address it was loaded from.
; %1 and %2 are modified in place by the shift; %3 and %4 are overwritten.
; NOTE(review): %5 is named "zero" and is presumably an all-zero register
; supplied by the caller -- confirm at the call sites.
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
|
||||||
|
movd %3, [%7] ; reg1 = 4 bytes from the first row
|
||||||
|
movd %4, [%7+%8] ; reg2 = 4 bytes from the row one stride below
|
||||||
|
psraw %1, %6 ; add1 >>= shift (arithmetic, per 16-bit word)
|
||||||
|
psraw %2, %6 ; add2 >>= shift (arithmetic, per 16-bit word)
|
||||||
|
punpcklbw %3, %5 ; interleave reg1 bytes with %5: bytes -> words when %5 is zero
|
||||||
|
punpcklbw %4, %5 ; same widening for the second row
|
||||||
|
paddw %3, %1 ; first row += shifted add1 (16-bit lanes)
|
||||||
|
paddw %4, %2 ; second row += shifted add2 (16-bit lanes)
|
||||||
|
packuswb %3, %5 ; words -> bytes with unsigned saturation
|
||||||
|
packuswb %4, %5 ; same narrowing for the second row
|
||||||
|
movd [%7], %3 ; write the 4 result bytes back to the first row
|
||||||
|
movd [%7+%8], %4 ; write the 4 result bytes back to the second row
|
||||||
|
%endmacro
|
||||||
|
|
||||||
|
; h264_idct_add_8: entry point that expands IDCT4_ADD on its three arguments
; (dst_, block_, stride_). It is declared after "INIT_XMM avx", and the C side
; of this commit references it as ff_h264_idct_add_8_avx.
; NOTE(review): per the x86inc cglobal convention the numbers "3, 3, 8" are
; presumably arg count, GPR count and vector-register count -- confirm
; against x86inc.asm.
cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
|
||||||
|
movsxdifnidn stride_q, stride_d ; presumably sign-extends the 32-bit stride to pointer width where needed (x86inc helper) -- confirm
|
||||||
|
IDCT4_ADD dst_q, block_q, stride_q ; macro defined elsewhere in this file; it does the transform and the add to dst
|
||||||
|
RET
|
||||||
|
|||||||
@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
|
|||||||
int stride);
|
int stride);
|
||||||
|
|
||||||
IDCT_ADD_FUNC(, 8, mmx)
|
IDCT_ADD_FUNC(, 8, mmx)
|
||||||
|
IDCT_ADD_FUNC(, 8, avx)
|
||||||
IDCT_ADD_FUNC(, 10, sse2)
|
IDCT_ADD_FUNC(, 10, sse2)
|
||||||
IDCT_ADD_FUNC(_dc, 8, mmxext)
|
IDCT_ADD_FUNC(_dc, 8, mmxext)
|
||||||
IDCT_ADD_FUNC(_dc, 10, mmxext)
|
IDCT_ADD_FUNC(_dc, 10, mmxext)
|
||||||
@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
|
|||||||
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
|
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx;
|
||||||
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
|
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
c->h264_idct_add = ff_h264_idct_add_8_avx;
|
||||||
}
|
}
|
||||||
} else if (bit_depth == 10) {
|
} else if (bit_depth == 10) {
|
||||||
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
if (EXTERNAL_MMXEXT(cpu_flags)) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user