The only systems which benefit from these are truly ancient 32-bit x86 CPUs; all other systems use at least the SSE2 versions (this includes every x64 CPU, which is why this code is restricted to x86-32).

Signed-off-by: Andreas Rheinhardt <andreas.rheinhardt@outlook.com>
;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%include "libavcodec/x86/huffyuvdsp_template.asm"

;------------------------------------------------------------------------------
; void (*add_int16)(uint16_t *dst, const uint16_t *src, unsigned mask, int w);
;------------------------------------------------------------------------------

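; INT16_LOOP comes from the included huffyuvdsp_template.asm; it adds the
; 16-bit samples of src to dst and masks the result to the active bit depth,
; roughly
;     dst[i] = (dst[i] + src[i]) & mask;
; Its first argument selects aligned ("a") or unaligned ("u") loads and
; stores, so the aligned path below is only taken when both src and dst are
; aligned to mmsize.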
%macro ADD_INT16 0
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    test srcq, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, add
.unaligned:
    INT16_LOOP u, add
%endmacro

INIT_XMM sse2
ADD_INT16

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_INT16
%endif

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
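; Left prediction over 32-bit BGRA pixels: each output pixel is the byte-wise
; sum of the corresponding src pixel and all pixels to its left. Within one
; register this is a byte prefix sum built from two shift+add steps (by one
; pixel, then by two pixels); the last pixel reconstructed so far is then
; broadcast with pshufd q3333 and added as the running "left" value carried
; across iterations. The final pixel is written back to *left.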
INIT_XMM sse2
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    shl           wq, 2
    movd          m0, [leftq]
    lea         dstq, [dstq + wq]
    lea         srcq, [srcq + wq]
    LSHIFT        m0, mmsize-4
    neg           wq
.loop:
    movu          m1, [srcq+wq]
    mova          m2, m1
    LSHIFT        m1, 4
    paddb         m1, m2
    pshufd        m0, m0, q3333
    mova          m2, m1
    LSHIFT        m1, 8
    paddb         m1, m2
    paddb         m0, m1
    movu   [dstq+wq], m0
    add           wq, mmsize
    jl         .loop
    movd          m0, [dstq-4]
    movd     [leftq], m0
    REP_RET


; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top, const uint16_t *diff,
;                                 unsigned mask, int w, int *left, int *left_top)
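; Median prediction on 16-bit samples: the predictor for each sample is the
; median of the left neighbour (l), the top neighbour (t) and the gradient
; l + t - tl (tl = top-left), and the coded residual is added on top, in
; effect
;     pred   = mid(l, t, (l + t - tl) & mask);
;     dst[i] = (pred + diff[i]) & mask;
; Because each sample depends on the one just reconstructed to its left, the
; %rep block below walks through the four samples of an MMX register serially
; rather than in parallel.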
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add      wd, wd
    movd    mm6, maskd
    SPLATW  mm6, mm6
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubw   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, word [dstq-2]
    mov [leftq], r2d
    movzx   r2d, word [topq-2]
    mov [left_topq], r2d
    RET