From x86inc:

> On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
> a branch or a branch target. So switch to a 2-byte form of ret in that case.
> We can automatically detect "follows a branch", but not a branch target.
> (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)

x86inc can automatically determine whether to use REP_RET rather than RET in most of these cases, so the impact is minimal. Additionally, a few REP_RETs were used unnecessarily, despite the return being nowhere near a branch.

The only CPUs affected were AMD K10s, made between 2007 and 2011, i.e. 16 and 12 years ago, respectively.

In the future, everyone involved with x86inc should consider dropping REP_RETs altogether.
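For reference, the "2-byte form" is rep ret (f3 c3 instead of the single-byte c3): the rep prefix is architecturally ignored on a ret, but padding the return to two bytes avoids the slow path on the affected AMD predictors. A rough sketch of the x86inc logic, simplified from the REP_RET macro in x86inc.asm (the real definition also annotates the function size):

    %macro REP_RET 0
        %if has_epilogue || cpuflag(ssse3)
            RET        ; an epilogue is emitted anyway, or the CPU is known good
        %else
            rep ret    ; 2-byte return (f3 c3)
        %endif
    %endmacro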
;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_row_coeffs:  times 4 dw 13
                times 4 dw 17
                times 4 dw  7
pd_512: times 2 dd 0x200
pw_col_coeffs:  dw 13,  13,  13, -13
                dw 17,   7,   7, -17
                dw 13, -13,  13,  13
                dw -7,  17, -17,  -7

SECTION .text

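; The two helpers below apply the RV34 DC scaling to a scalar register. They
; are intended to match the C reference in rv34dsp.c (an assumption worth
; re-checking against the tree): (13*13*3*dc) >> 11 for the no-round variant
; and (13*13*dc + 0x200) >> 10 for the rounding one; the factors of 13 are the
; DC gains of the row and column transforms.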
%macro IDCT_DC_NOROUND 1
    imul   %1, 13*13*3
    sar    %1, 11
%endmacro

%macro IDCT_DC_ROUND 1
    imul   %1, 13*13
    add    %1, 0x200
    sar    %1, 10
%endmacro

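; With all AC coefficients zero the 4x4 block collapses to a single scaled DC,
; so rv34_idct_dc_noround broadcasts it with pshufw and stores it over all 16
; int16_t coefficients of the block.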
INIT_MMX mmxext
cglobal rv34_idct_dc_noround, 1, 2, 0
    movsx   r1, word [r0]
    IDCT_DC_NOROUND r1
    movd    m0, r1d
    pshufw  m0, m0, 0
    movq    [r0+ 0], m0
    movq    [r0+ 8], m0
    movq    [r0+16], m0
    movq    [r0+24], m0
    RET

; Load coeffs and perform row transform
; Output: coeffs in mm[0467], rounder in mm5
%macro ROW_TRANSFORM  1
    pxor        mm7, mm7
    mova        mm0, [%1+ 0*8]
    mova        mm1, [%1+ 1*8]
    mova        mm2, [%1+ 2*8]
    mova        mm3, [%1+ 3*8]
    mova  [%1+ 0*8], mm7
    mova  [%1+ 1*8], mm7
    mova  [%1+ 2*8], mm7
    mova  [%1+ 3*8], mm7
    mova        mm4, mm0
    mova        mm6, [pw_row_coeffs+ 0]
    paddsw      mm0, mm2                ; b0 + b2
    psubsw      mm4, mm2                ; b0 - b2
    pmullw      mm0, mm6                ; *13 = z0
    pmullw      mm4, mm6                ; *13 = z1
    mova        mm5, mm1
    pmullw      mm1, [pw_row_coeffs+ 8] ; b1*17
    pmullw      mm5, [pw_row_coeffs+16] ; b1* 7
    mova        mm7, mm3
    pmullw      mm3, [pw_row_coeffs+ 8] ; b3*17
    pmullw      mm7, [pw_row_coeffs+16] ; b3* 7
    paddsw      mm1, mm7                ; z3 = b1*17 + b3* 7
    psubsw      mm5, mm3                ; z2 = b1* 7 - b3*17
    mova        mm7, mm0
    mova        mm6, mm4
    paddsw      mm0, mm1                ; z0 + z3
    psubsw      mm7, mm1                ; z0 - z3
    paddsw      mm4, mm5                ; z1 + z2
    psubsw      mm6, mm5                ; z1 - z2
    mova        mm5, [pd_512]           ; 0x200
%endmacro

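; In the column pass, pmaddwd does most of the work: it multiplies four packed
; words pairwise and sums adjacent products, so with the coefficient pairs
; interleaved in pw_col_coeffs a single instruction yields both dwords of a
; butterfly (z0 | z1, or z3 | z2).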
; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
%macro COL_TRANSFORM  4
    pshufw      mm3, %2, 0xDD        ; col. 1,3,1,3
    pshufw       %2, %2, 0x88        ; col. 0,2,0,2
    pmaddwd      %2, %3              ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
    pmaddwd     mm3, %4              ; 17*c1+ 7*c3 |  7*c1-17*c3 = z3 | z2
    paddd        %2, mm5
    pshufw      mm1,  %2, 01001110b  ;    z1 | z0
    pshufw      mm2, mm3, 01001110b  ;    z2 | z3
    paddd        %2, mm3             ; z0+z3 | z1+z2
    psubd       mm1, mm2             ; z1-z2 | z0-z3
    movd        mm3, %1
    psrad        %2, 10
    pxor        mm2, mm2
    psrad       mm1, 10
    punpcklbw   mm3, mm2
    packssdw     %2, mm1
    paddw        %2, mm3
    packuswb     %2, %2
    movd         %1, %2
%endmacro
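
; Note the register recycling below: mm0 and mm4 first hold rows produced by
; ROW_TRANSFORM, and once COL_TRANSFORM has consumed each one the register is
; reloaded with a pw_col_coeffs row, so the last two column transforms take
; both coefficient operands from registers rather than memory.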
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
    ROW_TRANSFORM       bq
    COL_TRANSFORM     [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
    mova               mm0, [pw_col_coeffs+ 0]
    COL_TRANSFORM  [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
    mova               mm4, [pw_col_coeffs+ 8]
    lea                 dq, [dq + 2*sq]
    COL_TRANSFORM     [dq], mm6, mm0, mm4
    COL_TRANSFORM  [dq+sq], mm7, mm0, mm4
    ret

; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
%macro RV34_IDCT_DC_ADD 0
cglobal rv34_idct_dc_add, 3, 3, 6
    ; load data
    IDCT_DC_ROUND r2
    pxor       m1, m1

    ; calculate DC
    movd       m0, r2d
    lea        r2, [r0+r1*2]
    movd       m2, [r0]
    movd       m3, [r0+r1]
    pshuflw    m0, m0, 0
    movd       m4, [r2]
    movd       m5, [r2+r1]
    punpcklqdq m0, m0
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpcklbw  m2, m1
    punpcklbw  m4, m1
    paddw      m2, m0
    paddw      m4, m0
    packuswb   m2, m4
    movd      [r0], m2
    ; pextrd requires SSE4.1; the SSE2 path instead shifts the register down
    ; and stores the result one dword at a time
%if cpuflag(sse4)
    pextrd [r0+r1], m2, 1
    pextrd    [r2], m2, 2
    pextrd [r2+r1], m2, 3
%else
    psrldq     m2, 4
    movd   [r0+r1], m2
    psrldq     m2, 4
    movd      [r2], m2
    psrldq     m2, 4
    movd   [r2+r1], m2
%endif
    RET
%endmacro

INIT_XMM sse2
RV34_IDCT_DC_ADD
INIT_XMM sse4
RV34_IDCT_DC_ADD