Various VP8 x86 deblocking speedups
Add SSSE3 versions and improve the SSE2 versions a bit. The SSE2/SSSE3 mbedge h functions are currently broken, so explicitly disable them. Originally committed as revision 24403 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
		
							parent
							
								
									8a810ccbba
								
							
						
					
					
						commit
						7dd224a42d
					
				| @ -223,64 +223,31 @@ extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16], int stride) | ||||
| extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]); | ||||
| extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride); | ||||
| 
 | ||||
| extern void ff_vp8_v_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim); | ||||
| extern void ff_vp8_v_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); | ||||
| extern void ff_vp8_v_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim); | ||||
| extern void ff_vp8_h_loop_filter_simple_mmx   (uint8_t *dst, int stride, int flim); | ||||
| extern void ff_vp8_h_loop_filter_simple_mmxext(uint8_t *dst, int stride, int flim); | ||||
| extern void ff_vp8_h_loop_filter_simple_sse2  (uint8_t *dst, int stride, int flim); | ||||
| #define DECLARE_LOOP_FILTER(NAME)\ | ||||
| extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\ | ||||
| extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, int stride, int flim);\ | ||||
| extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\ | ||||
|                                                     int e, int i, int hvt);\ | ||||
| extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, int stride,\ | ||||
|                                                     int e, int i, int hvt);\ | ||||
| extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\ | ||||
|                                                     int s, int e, int i, int hvt);\ | ||||
| extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, uint8_t *dstV,\ | ||||
|                                                     int s, int e, int i, int hvt);\ | ||||
| extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\ | ||||
|                                                     int e, int i, int hvt);\ | ||||
| extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, int stride,\ | ||||
|                                                     int e, int i, int hvt);\ | ||||
| extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\ | ||||
|                                                     int s, int e, int i, int hvt);\ | ||||
| extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, uint8_t *dstV,\ | ||||
|                                                     int s, int e, int i, int hvt); | ||||
| 
 | ||||
| extern void ff_vp8_v_loop_filter16y_inner_mmx   (uint8_t *dst, int stride, | ||||
|                                                  int e, int i, int hvt); | ||||
| extern void ff_vp8_v_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, | ||||
|                                                  int e, int i, int hvt); | ||||
| extern void ff_vp8_v_loop_filter16y_inner_sse2  (uint8_t *dst, int stride, | ||||
|                                                  int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter16y_inner_mmx   (uint8_t *dst, int stride, | ||||
|                                                  int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter16y_inner_mmxext(uint8_t *dst, int stride, | ||||
|                                                  int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter16y_inner_sse2  (uint8_t *dst, int stride, | ||||
|                                                  int e, int i, int hvt); | ||||
| DECLARE_LOOP_FILTER(mmx) | ||||
| DECLARE_LOOP_FILTER(mmxext) | ||||
| DECLARE_LOOP_FILTER(sse2) | ||||
| DECLARE_LOOP_FILTER(ssse3) | ||||
| 
 | ||||
| extern void ff_vp8_v_loop_filter8uv_inner_mmx   (uint8_t *dstU, uint8_t *dstV, | ||||
|                                                  int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_v_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV, | ||||
|                                                  int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_v_loop_filter8uv_inner_sse2  (uint8_t *dstU, uint8_t *dstV, | ||||
|                                                  int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter8uv_inner_mmx   (uint8_t *dstU, uint8_t *dstV, | ||||
|                                                  int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter8uv_inner_mmxext(uint8_t *dstU, uint8_t *dstV, | ||||
|                                                  int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter8uv_inner_sse2  (uint8_t *dstU, uint8_t *dstV, | ||||
|                                                  int s, int e, int i, int hvt); | ||||
| 
 | ||||
| extern void ff_vp8_v_loop_filter16y_mbedge_mmx   (uint8_t *dst, int stride, | ||||
|                                                   int e, int i, int hvt); | ||||
| extern void ff_vp8_v_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride, | ||||
|                                                   int e, int i, int hvt); | ||||
| extern void ff_vp8_v_loop_filter16y_mbedge_sse2  (uint8_t *dst, int stride, | ||||
|                                                   int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter16y_mbedge_mmx   (uint8_t *dst, int stride, | ||||
|                                                   int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter16y_mbedge_mmxext(uint8_t *dst, int stride, | ||||
|                                                   int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter16y_mbedge_sse2  (uint8_t *dst, int stride, | ||||
|                                                   int e, int i, int hvt); | ||||
| 
 | ||||
| extern void ff_vp8_v_loop_filter8uv_mbedge_mmx   (uint8_t *dstU, uint8_t *dstV, | ||||
|                                                   int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_v_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV, | ||||
|                                                   int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_v_loop_filter8uv_mbedge_sse2  (uint8_t *dstU, uint8_t *dstV, | ||||
|                                                   int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter8uv_mbedge_mmx   (uint8_t *dstU, uint8_t *dstV, | ||||
|                                                   int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter8uv_mbedge_mmxext(uint8_t *dstU, uint8_t *dstV, | ||||
|                                                   int s, int e, int i, int hvt); | ||||
| extern void ff_vp8_h_loop_filter8uv_mbedge_sse2  (uint8_t *dstU, uint8_t *dstV, | ||||
|                                                   int s, int e, int i, int hvt); | ||||
| #endif | ||||
| 
 | ||||
| #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ | ||||
| @ -384,8 +351,8 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | ||||
|         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; | ||||
|         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; | ||||
| 
 | ||||
|         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmxext; | ||||
|         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmxext; | ||||
|         //c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse2;
 | ||||
|         //c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse2;
 | ||||
|     } | ||||
| 
 | ||||
|     if (mm_flags & FF_MM_SSSE3) { | ||||
| @ -395,6 +362,19 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | ||||
|         VP8_BILINEAR_MC_FUNC(0, 16, ssse3); | ||||
|         VP8_BILINEAR_MC_FUNC(1, 8, ssse3); | ||||
|         VP8_BILINEAR_MC_FUNC(2, 4, ssse3); | ||||
| 
 | ||||
|         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; | ||||
|         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; | ||||
| 
 | ||||
|         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; | ||||
|         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; | ||||
|         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; | ||||
|         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3; | ||||
| 
 | ||||
|         c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_ssse3; | ||||
|         //c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_ssse3;
 | ||||
|         c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_ssse3; | ||||
|         //c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
 | ||||
|     } | ||||
| 
 | ||||
|     if (mm_flags & FF_MM_SSE4) { | ||||
|  | ||||
| @ -1229,18 +1229,22 @@ cglobal vp8_luma_dc_wht_mmx, 2,3 | ||||
|     movd    [%7+%9*2], m%4 | ||||
| %endmacro | ||||
| 
 | ||||
| %macro SPLATB_REG 3 | ||||
| %macro SPLATB_REG 3-4 | ||||
|     movd           %1, %2 | ||||
| %ifidn %3, ssse3 | ||||
|     pshufb         %1, %4 | ||||
| %else | ||||
|     punpcklbw      %1, %1 | ||||
| %if mmsize == 16 ; sse2 | ||||
|     punpcklwd      %1, %1 | ||||
|     pshufd         %1, %1, 0x0 | ||||
|     pshuflw        %1, %1, 0x0 | ||||
|     punpcklqdq     %1, %1 | ||||
| %elifidn %3, mmx | ||||
|     punpcklwd      %1, %1 | ||||
|     punpckldq      %1, %1 | ||||
| %else ; mmxext | ||||
|     pshufw         %1, %1, 0x0 | ||||
| %endif | ||||
| %endif | ||||
| %endmacro | ||||
| 
 | ||||
| %macro SIMPLE_LOOPFILTER 3 | ||||
| @ -1252,7 +1256,10 @@ cglobal vp8_%2_loop_filter_simple_%1, 3, %3 | ||||
| %if mmsize == 8 ; mmx/mmxext | ||||
|     mov            r3, 2 | ||||
| %endif | ||||
|     SPLATB_REG     m7, r2, %1       ; splat "flim" into register | ||||
| %ifidn %1, ssse3 | ||||
|     pxor           m0, m0 | ||||
| %endif | ||||
|     SPLATB_REG     m7, r2, %1, m0   ; splat "flim" into register | ||||
| 
 | ||||
|     ; set up indexes to address 4 rows | ||||
|     mov            r2, r1 | ||||
| @ -1398,6 +1405,8 @@ SIMPLE_LOOPFILTER mmxext, h, 6 | ||||
| INIT_XMM | ||||
| SIMPLE_LOOPFILTER sse2,   v, 3 | ||||
| SIMPLE_LOOPFILTER sse2,   h, 6 | ||||
| SIMPLE_LOOPFILTER ssse3,  v, 3 | ||||
| SIMPLE_LOOPFILTER ssse3,  h, 6 | ||||
| 
 | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, | ||||
| @ -1433,11 +1442,15 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 | ||||
| %define stack_reg   hev_thr_reg | ||||
| %endif | ||||
| 
 | ||||
| %ifidn %1, ssse3 | ||||
|     pxor             m7, m7 | ||||
| %endif | ||||
| 
 | ||||
| %ifndef m8 ; mmx/mmxext or sse2/ssse3 on x86-32 | ||||
|     ; splat function arguments | ||||
|     SPLATB_REG       m0, E_reg, %1   ; E | ||||
|     SPLATB_REG       m1, I_reg, %1   ; I | ||||
|     SPLATB_REG       m2, hev_thr_reg, %1 ; hev_thresh | ||||
|     SPLATB_REG       m0, E_reg, %1, m7 ; E | ||||
|     SPLATB_REG       m1, I_reg, %1, m7 ; I | ||||
|     SPLATB_REG       m2, hev_thr_reg, %1, m7 ; hev_thresh | ||||
| 
 | ||||
|     ; align stack | ||||
|     mov       stack_reg, rsp         ; backup stack pointer | ||||
| @ -1470,9 +1483,9 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 | ||||
| %define q0backup m8 | ||||
| 
 | ||||
|     ; splat function arguments | ||||
|     SPLATB_REG   flim_E, E_reg, %1   ; E | ||||
|     SPLATB_REG   flim_I, I_reg, %1   ; I | ||||
|     SPLATB_REG  hev_thr, hev_thr_reg, %1 ; hev_thresh | ||||
|     SPLATB_REG   flim_E, E_reg, %1, m7 ; E | ||||
|     SPLATB_REG   flim_I, I_reg, %1, m7 ; I | ||||
|     SPLATB_REG  hev_thr, hev_thr_reg, %1, m7 ; hev_thresh | ||||
| %endif | ||||
| 
 | ||||
| %if mmsize == 8 && %4 == 16 ; mmx/mmxext | ||||
| @ -1884,15 +1897,15 @@ cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 | ||||
| %endmacro | ||||
| 
 | ||||
| INIT_MMX | ||||
| INNER_LOOPFILTER mmx,    v, 6, 16, 8 | ||||
| INNER_LOOPFILTER mmx,    h, 6, 16, 8 | ||||
| INNER_LOOPFILTER mmxext, v, 6, 16, 8 | ||||
| INNER_LOOPFILTER mmxext, h, 6, 16, 8 | ||||
| INNER_LOOPFILTER mmx,    v, 6, 16, 0 | ||||
| INNER_LOOPFILTER mmx,    h, 6, 16, 0 | ||||
| INNER_LOOPFILTER mmxext, v, 6, 16, 0 | ||||
| INNER_LOOPFILTER mmxext, h, 6, 16, 0 | ||||
| 
 | ||||
| INNER_LOOPFILTER mmx,    v, 6,  8, 8 | ||||
| INNER_LOOPFILTER mmx,    h, 6,  8, 8 | ||||
| INNER_LOOPFILTER mmxext, v, 6,  8, 8 | ||||
| INNER_LOOPFILTER mmxext, h, 6,  8, 8 | ||||
| INNER_LOOPFILTER mmx,    v, 6,  8, 0 | ||||
| INNER_LOOPFILTER mmx,    h, 6,  8, 0 | ||||
| INNER_LOOPFILTER mmxext, v, 6,  8, 0 | ||||
| INNER_LOOPFILTER mmxext, h, 6,  8, 0 | ||||
| 
 | ||||
| INIT_XMM | ||||
| INNER_LOOPFILTER sse2,   v, 5, 16, 13 | ||||
| @ -1904,6 +1917,15 @@ INNER_LOOPFILTER sse2,   h, 6, 16, 13 | ||||
| INNER_LOOPFILTER sse2,   v, 6,  8, 13 | ||||
| INNER_LOOPFILTER sse2,   h, 6,  8, 13 | ||||
| 
 | ||||
| INNER_LOOPFILTER ssse3,  v, 5, 16, 13 | ||||
| %ifdef m8 | ||||
| INNER_LOOPFILTER ssse3,  h, 5, 16, 13 | ||||
| %else | ||||
| INNER_LOOPFILTER ssse3,  h, 6, 16, 13 | ||||
| %endif | ||||
| INNER_LOOPFILTER ssse3,  v, 6,  8, 13 | ||||
| INNER_LOOPFILTER ssse3,  h, 6,  8, 13 | ||||
| 
 | ||||
| ;----------------------------------------------------------------------------- | ||||
| ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, | ||||
| ;                                            int flimE, int flimI, int hev_thr); | ||||
| @ -1984,11 +2006,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 | ||||
| %define stack_reg   hev_thr_reg | ||||
| %endif | ||||
| 
 | ||||
| %ifidn %1, ssse3 | ||||
|     pxor             m7, m7 | ||||
| %endif | ||||
| 
 | ||||
| %ifndef m8 ; mmx/mmxext or sse2/ssse3 on x86-32 | ||||
|     ; splat function arguments | ||||
|     SPLATB_REG       m0, E_reg, %1   ; E | ||||
|     SPLATB_REG       m1, I_reg, %1   ; I | ||||
|     SPLATB_REG       m2, hev_thr_reg, %1 ; hev_thresh | ||||
|     SPLATB_REG       m0, E_reg, %1, m7 ; E | ||||
|     SPLATB_REG       m1, I_reg, %1, m7 ; I | ||||
|     SPLATB_REG       m2, hev_thr_reg, %1, m7 ; hev_thresh | ||||
| 
 | ||||
|     ; align stack | ||||
|     mov       stack_reg, rsp         ; backup stack pointer | ||||
| @ -2028,9 +2054,9 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 | ||||
| %define lim_sign m15 | ||||
| 
 | ||||
|     ; splat function arguments | ||||
|     SPLATB_REG   flim_E, E_reg, %1   ; E | ||||
|     SPLATB_REG   flim_I, I_reg, %1   ; I | ||||
|     SPLATB_REG  hev_thr, hev_thr_reg, %1 ; hev_thresh | ||||
|     SPLATB_REG   flim_E, E_reg, %1, m7 ; E | ||||
|     SPLATB_REG   flim_I, I_reg, %1, m7 ; I | ||||
|     SPLATB_REG  hev_thr, hev_thr_reg, %1, m7 ; hev_thresh | ||||
| %endif | ||||
| 
 | ||||
| %if mmsize == 8 && %4 == 16 ; mmx/mmxext | ||||
| @ -2521,15 +2547,15 @@ cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 | ||||
| %endmacro | ||||
| 
 | ||||
| INIT_MMX | ||||
| MBEDGE_LOOPFILTER mmx,    v, 6, 16, 8 | ||||
| MBEDGE_LOOPFILTER mmx,    h, 6, 16, 8 | ||||
| MBEDGE_LOOPFILTER mmxext, v, 6, 16, 8 | ||||
| MBEDGE_LOOPFILTER mmxext, h, 6, 16, 8 | ||||
| MBEDGE_LOOPFILTER mmx,    v, 6, 16, 0 | ||||
| MBEDGE_LOOPFILTER mmx,    h, 6, 16, 0 | ||||
| MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 | ||||
| MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 | ||||
| 
 | ||||
| MBEDGE_LOOPFILTER mmx,    v, 6,  8, 8 | ||||
| MBEDGE_LOOPFILTER mmx,    h, 6,  8, 8 | ||||
| MBEDGE_LOOPFILTER mmxext, v, 6,  8, 8 | ||||
| MBEDGE_LOOPFILTER mmxext, h, 6,  8, 8 | ||||
| MBEDGE_LOOPFILTER mmx,    v, 6,  8, 0 | ||||
| MBEDGE_LOOPFILTER mmx,    h, 6,  8, 0 | ||||
| MBEDGE_LOOPFILTER mmxext, v, 6,  8, 0 | ||||
| MBEDGE_LOOPFILTER mmxext, h, 6,  8, 0 | ||||
| 
 | ||||
| INIT_XMM | ||||
| MBEDGE_LOOPFILTER sse2,   v, 5, 16, 16 | ||||
| @ -2540,3 +2566,12 @@ MBEDGE_LOOPFILTER sse2,   h, 6, 16, 16 | ||||
| %endif | ||||
| MBEDGE_LOOPFILTER sse2,   v, 6,  8, 16 | ||||
| MBEDGE_LOOPFILTER sse2,   h, 6,  8, 16 | ||||
| 
 | ||||
| MBEDGE_LOOPFILTER ssse3,  v, 5, 16, 16 | ||||
| %ifdef m8 | ||||
| MBEDGE_LOOPFILTER ssse3,  h, 5, 16, 16 | ||||
| %else | ||||
| MBEDGE_LOOPFILTER ssse3,  h, 6, 16, 16 | ||||
| %endif | ||||
| MBEDGE_LOOPFILTER ssse3,  v, 6,  8, 16 | ||||
| MBEDGE_LOOPFILTER ssse3,  h, 6,  8, 16 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user