160 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
			
		
		
	
	
			160 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
| ;******************************************************************************
 | |
| ;* optimized bswap buffer functions
 | |
| ;* Copyright (c) 2008 Loren Merritt
 | |
| ;* Copyright (c) 2003-2013 Michael Niedermayer
 | |
| ;* Copyright (c) 2013 Daniel Kang
 | |
| ;*
 | |
| ;* This file is part of FFmpeg.
 | |
| ;*
 | |
| ;* FFmpeg is free software; you can redistribute it and/or
 | |
| ;* modify it under the terms of the GNU Lesser General Public
 | |
| ;* License as published by the Free Software Foundation; either
 | |
| ;* version 2.1 of the License, or (at your option) any later version.
 | |
| ;*
 | |
| ;* FFmpeg is distributed in the hope that it will be useful,
 | |
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
| ;* Lesser General Public License for more details.
 | |
| ;*
 | |
| ;* You should have received a copy of the GNU Lesser General Public
 | |
| ;* License along with FFmpeg; if not, write to the Free Software
 | |
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | |
| ;******************************************************************************
 | |
| 
 | |
| %include "libavutil/x86/x86util.asm"
 | |
| 
 | |
SECTION_RODATA

; pshufb control mask: within every group of four bytes the source indices
; run 3,2,1,0, i.e. each 32-bit element has its byte order reversed.
; One row per 32-bit element.
pb_bswap32: db  3,  2,  1,  0
            db  7,  6,  5,  4
            db 11, 10,  9,  8
            db 15, 14, 13, 12

; NOTE(review): pb_80 is not referenced by any code visible in this file -
; confirm nothing else depends on the declaration before dropping it.
cextern pb_80

SECTION .text
 | |
| 
 | |
;------------------------------------------------------------------------------
; BSWAP_LOOPS %1
;   %1 = a (aligned) or u (unaligned). It is appended to "mov" for every
;        vector load/store and to the local labels, so both variants can be
;        expanded inside the same function.
;
; Byte-swaps all whole groups of four 32-bit elements.
;
; In:   r0  = destination pointer
;       r1  = source pointer
;       r2d = number of 32-bit elements
;       m2  = pshufb control mask (SSSE3 and later only)
; Out:  r0, r1 advanced past everything that was processed
;       r2d = r3d = original element count; only (count & 3) elements remain
; Clobbers: m0, m1, flags; the SSE2 path also uses m2 and m3 as scratch.
;
; The macro exits either by falling off its end or by jumping to .left,
; which has to be defined by the enclosing function.
;------------------------------------------------------------------------------
%macro BSWAP_LOOPS  1
    mov      r3d, r2d                   ; keep the full count for the tail tests
    sar      r2d, 3                     ; r2d = count / 8
    jz       .left4_%1                  ; fewer than 8 elements: no main loop
%if cpuflag(avx2)
    sar      r2d, 1                     ; r2d = count / 16 (two ymm per pass)
    jz       .left8_%1                  ; 8..15 elements: no main loop
%endif

    ; Main loop: two full vectors (2 * mmsize bytes) per iteration.
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + mmsize]
%if cpuflag(ssse3)
    ; avx2 implies ssse3 for cpuflag(), as the tail code below already assumes.
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + mmsize], m1
%else
    ; SSE2 has no byte shuffle: first swap the two 16-bit halves of every
    ; dword, then swap the two bytes of every 16-bit word with shifts + or.
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + mmsize], m3
%endif
    add      r0, mmsize*2
    add      r1, mmsize*2
    dec      r2d
    jnz      .loop8_%1

%if cpuflag(avx2)
    ; One more full ymm vector if bit 3 of the count is set (8 elements).
.left8_%1:
    mov      r2d, r3d
    test     r3d, 8
    jz       .left4_%1
    mov%1    m0, [r1]
    pshufb   m0, m2
    mov%1    [r0 +  0], m0
    add      r1, mmsize
    add      r0, mmsize
%endif

    ; One xmm vector if bit 2 of the count is set (4 elements). This step is
    ; always 16 bytes wide, even when mmsize is 32.
.left4_%1:
    mov      r2d, r3d                   ; hand the original count back in r2d
    test     r3d, 4
    jz       .left
    mov%1    xm0, [r1]
%if cpuflag(ssse3)
    pshufb   xm0, xm2
    mov%1    [r0], xm0
%else
    ; Same word-swap + byte-swap sequence as in the SSE2 main loop.
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro
 | |
| 
 | |
;------------------------------------------------------------------------------
; void ff_bswap32_buf_<cpu>(uint32_t *dst, const uint32_t *src, int w);
;
; Declared through cglobal as bswap32_buf; the prefix and the CPU suffix of
; the exported symbol are added by x86inc.
;
; Stores the byte-swapped value of each of the w 32-bit elements at src to
; dst. Nothing is read or written when w <= 0.
;
; r0 = dst, r1 = src, r2d = w, r3 = scratch.
; m2 holds the pshufb mask on SSSE3 and later; the SSE2 version uses m0-m3
; as plain scratch registers.
;------------------------------------------------------------------------------
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    VBROADCASTI128  m2, [pb_bswap32]    ; byte-reversal mask in every 128-bit lane
%else
cglobal bswap32_buf, 3,4,4
%endif
    ; w is a signed int: without this check a negative count would survive
    ; the "sar / jz" tests in BSWAP_LOOPS as a non-zero loop counter.
    test     r2d, r2d
    jle      .end

    ; Use the aligned loads/stores only if both pointers are mmsize-aligned.
    mov      r3, r1
    or       r3, r0
    test     r3, mmsize - 1
    jz       .start_align
    BSWAP_LOOPS  u
    jmp      .left
.start_align:
    BSWAP_LOOPS  a

    ; Tail: r2d holds the original count again, (r2d & 3) elements are left.
.left:
%if cpuflag(ssse3)
    test     r2d, 2
    jz       .left1
    movq     xm0, [r1]                  ; two elements at once
    pshufb   xm0, xm2
    movq     [r0], xm0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1
    jz       .end
    mov      r2d, [r1]                  ; last element; the count is no longer needed
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3
    jz       .end
.loop2:
    mov      r3d, [r1]                  ; one element per iteration
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro
 | |
| 
 | |
; Expand BSWAP32_BUF once per instruction set. INIT_XMM / INIT_YMM choose the
; vector width and the CPU flags that the cpuflag() tests in the macros see.

; 128-bit, SSE2
INIT_XMM sse2
BSWAP32_BUF

; 128-bit, SSSE3
INIT_XMM ssse3
BSWAP32_BUF

; 256-bit, AVX2 - only assembled when HAVE_AVX2_EXTERNAL is non-zero
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
BSWAP32_BUF
%endif
 |