avcodec/h264: add avx 8-bit h264_idct_add
Haswell: 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext.
Skylake-U: 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext.
This commit is contained in:
		
							parent
							
								
									b5325c6711
								
							
						
					
					
						commit
						f61d454ca1
					
				| @ -65,7 +65,15 @@ SECTION .text | |||||||
| 
 | 
 | ||||||
|     IDCT4_1D      w, 0, 1, 2, 3, 4, 5 |     IDCT4_1D      w, 0, 1, 2, 3, 4, 5 | ||||||
|     mova         m6, [pw_32] |     mova         m6, [pw_32] | ||||||
|  |     %if mmsize == 8 | ||||||
|         TRANSPOSE4x4W 0, 1, 2, 3, 4 |         TRANSPOSE4x4W 0, 1, 2, 3, 4 | ||||||
|  |     %else | ||||||
|  |         punpcklwd m0, m1 | ||||||
|  |         punpcklwd m2, m3 | ||||||
|  |         SBUTTERFLY dq, 0, 2, 4 | ||||||
|  |         MOVHL m1, m0 | ||||||
|  |         MOVHL m3, m2 | ||||||
|  |     %endif | ||||||
|     paddw        m0, m6 |     paddw        m0, m6 | ||||||
|     IDCT4_1D      w, 0, 1, 2, 3, 4, 5 |     IDCT4_1D      w, 0, 1, 2, 3, 4, 5 | ||||||
|     pxor         m7, m7 |     pxor         m7, m7 | ||||||
| @ -1131,3 +1139,26 @@ INIT_MMX mmx | |||||||
| IDCT_DC_DEQUANT 0 | IDCT_DC_DEQUANT 0 | ||||||
| INIT_MMX sse2 | INIT_MMX sse2 | ||||||
| IDCT_DC_DEQUANT 7 | IDCT_DC_DEQUANT 7 | ||||||
|  | 
 | ||||||
|  | INIT_XMM avx | ||||||
|  | 
 | ||||||
|  | ; %unmacro STORE_DIFFx2 8 ; would drop the x86util.asm definition of this macro before it is redefined below; left commented out because yasm does not support %unmacro yet | ||||||
|  | %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride: shifts add1/add2 right by shift (in place), adds them to the 4 bytes loaded from [source] and [source+stride] (widened to words via reg1/reg2 against zero, which is expected to be all-zero), and stores the results back with unsigned saturation | ||||||
|  |     movd       %3, [%7] | ||||||
|  |     movd       %4, [%7+%8] | ||||||
|  |     psraw      %1, %6 | ||||||
|  |     psraw      %2, %6 | ||||||
|  |     punpcklbw  %3, %5 | ||||||
|  |     punpcklbw  %4, %5 | ||||||
|  |     paddw      %3, %1 | ||||||
|  |     paddw      %4, %2 | ||||||
|  |     packuswb   %3, %5 | ||||||
|  |     packuswb   %4, %5 | ||||||
|  |     movd     [%7], %3 | ||||||
|  |     movd  [%7+%8], %4 | ||||||
|  | %endmacro | ||||||
|  | 
 | ||||||
; h264_idct_add_8: entry point built under INIT_XMM avx; the C side declares it
; through IDCT_ADD_FUNC(, 8, avx) and installs it as c->h264_idct_add.
; It normalises the stride argument (movsxdifnidn stride_q, stride_d --
; presumably a sign-extension of the dword stride where the two registers
; differ; confirm in x86inc/x86util) and then expands IDCT4_ADD on
; (dst, block, stride).
; NOTE(review): the "3, 3, 8" after the name is assumed to be the x86inc
; cglobal convention (arguments, GPRs, XMM registers) -- confirm.
|  | cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ | ||||||
|  |     movsxdifnidn stride_q, stride_d | ||||||
|  |     IDCT4_ADD    dst_q, block_q, stride_q | ||||||
|  | RET | ||||||
|  | |||||||
| @ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst,    \ | |||||||
|                                                        int stride); |                                                        int stride); | ||||||
| 
 | 
 | ||||||
| IDCT_ADD_FUNC(, 8, mmx) | IDCT_ADD_FUNC(, 8, mmx) | ||||||
|  | IDCT_ADD_FUNC(, 8, avx) | ||||||
| IDCT_ADD_FUNC(, 10, sse2) | IDCT_ADD_FUNC(, 10, sse2) | ||||||
| IDCT_ADD_FUNC(_dc, 8, mmxext) | IDCT_ADD_FUNC(_dc, 8, mmxext) | ||||||
| IDCT_ADD_FUNC(_dc, 10, mmxext) | IDCT_ADD_FUNC(_dc, 10, mmxext) | ||||||
| @ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, | |||||||
|                 c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx; |                 c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma422_8_avx; | ||||||
|                 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx; |                 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx; | ||||||
|             } |             } | ||||||
|  | 
 | ||||||
|  |             c->h264_idct_add        = ff_h264_idct_add_8_avx; | ||||||
|         } |         } | ||||||
|     } else if (bit_depth == 10) { |     } else if (bit_depth == 10) { | ||||||
|         if (EXTERNAL_MMXEXT(cpu_flags)) { |         if (EXTERNAL_MMXEXT(cpu_flags)) { | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user