;******************************************************************************
;* SIMD-optimized functions for the DCA decoder
;* Copyright (C) 2016 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%define sizeof_float 4
%define FMA3_OFFSET (8 * cpuflag(fma3))

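; lfe_fir0_float(samples, lfe, coeff, nblocks)
;
; LFE FIR interpolation, 64 PCM samples per decimated LFE sample.  The LFE
; input is stored as int32 and converted to float on load.  Each output is an
; 8-tap dot product against lfe[0..-7]: the first 32 outputs of a block walk
; forward through the coefficient table (indexed by cnt1), the last 32 reuse
; the same coefficient vectors with the sample order swapped and are written
; backwards from the end of the block (indexed by cnt2).  FMA3 builds get 4
; extra XMM registers and process twice as many outputs per inner iteration,
; hence FMA3_OFFSET above.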
%macro LFE_FIR0_FLOAT 0
cglobal lfe_fir0_float, 4, 6, 12 + cpuflag(fma3)*4, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr nblocksd, 1
    sub     lfeq, 7*sizeof_float
    mov    cnt1d, 32*sizeof_float
    mov    cnt2d, 32*sizeof_float-8-FMA3_OFFSET
    lea   coeffq, [coeffq+cnt1q*8]
    add samplesq, cnt1q
    neg    cnt1q

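; Convert the current LFE sample and the 7 preceding ones (int32) to float:
; m5 = lfe[-7..-4] and m4 = lfe[-3..0] in memory order, with m6/m7 as their
; element-reversed copies for the first half of the outputs.  The SSE version
; (x86_32 only) has no cvtdq2ps, so it converts two samples at a time with
; cvtpi2ps.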
.loop:
%if cpuflag(avx)
    cvtdq2ps  m4, [lfeq+16]
    cvtdq2ps  m5, [lfeq   ]
    shufps    m7, m4, m4, q0123
    shufps    m6, m5, m5, q0123
%elif cpuflag(sse2)
    movu      m4, [lfeq+16]
    movu      m5, [lfeq   ]
    cvtdq2ps  m4, m4
    cvtdq2ps  m5, m5
    pshufd    m7, m4, q0123
    pshufd    m6, m5, q0123
%else
    cvtpi2ps  m4, [lfeq+16]
    cvtpi2ps  m0, [lfeq+24]
    cvtpi2ps  m5, [lfeq   ]
    cvtpi2ps  m1, [lfeq+8 ]
    shufps    m4, m0, q1010
    shufps    m5, m1, q1010
    shufps    m7, m4, m4, q0123
    shufps    m6, m5, m5, q0123
%endif

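; First half of the 64-sample block: each iteration loads 2 (4 with FMA3)
; groups of 8 coefficients and reduces the 8-tap products against m7/m6.
; The FMA3 paths reduce with haddps, the others with unpack/add.  x86_32 has
; only 8 XMM registers, so the coefficients stay in memory there.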
.inner_loop:
%if ARCH_X86_64
    movaps    m8, [coeffq+cnt1q*8   ]
    movaps    m9, [coeffq+cnt1q*8+16]
    movaps   m10, [coeffq+cnt1q*8+32]
    movaps   m11, [coeffq+cnt1q*8+48]
%if cpuflag(fma3)
    movaps   m12, [coeffq+cnt1q*8+64]
    movaps   m13, [coeffq+cnt1q*8+80]
    movaps   m14, [coeffq+cnt1q*8+96]
    movaps   m15, [coeffq+cnt1q*8+112]
    mulps     m0, m7, m8
    mulps     m1, m7, m10
    mulps     m2, m7, m12
    mulps     m3, m7, m14
    fmaddps   m0, m6, m9, m0
    fmaddps   m1, m6, m11, m1
    fmaddps   m2, m6, m13, m2
    fmaddps   m3, m6, m15, m3

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps     m0, m7, m8
    mulps     m1, m6, m9
    mulps     m2, m7, m10
    mulps     m3, m6, m11
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m0, m2
    unpcklps  m0, m2
    addps     m3, m0
    movhlps   m2, m3
    addps     m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
    mulps     m0, m7, [coeffq+cnt1q*8    ]
    mulps     m1, m7, [coeffq+cnt1q*8+32 ]
    mulps     m2, m7, [coeffq+cnt1q*8+64 ]
    mulps     m3, m7, [coeffq+cnt1q*8+96 ]
    fmaddps   m0, m6, [coeffq+cnt1q*8+16 ], m0
    fmaddps   m1, m6, [coeffq+cnt1q*8+48 ], m1
    fmaddps   m2, m6, [coeffq+cnt1q*8+80 ], m2
    fmaddps   m3, m6, [coeffq+cnt1q*8+112], m3

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0
%else
    mulps     m0, m7, [coeffq+cnt1q*8   ]
    mulps     m1, m6, [coeffq+cnt1q*8+16]
    mulps     m2, m7, [coeffq+cnt1q*8+32]
    mulps     m3, m6, [coeffq+cnt1q*8+48]
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m0, m2
    unpcklps  m0, m2
    addps     m3, m0
    movhlps   m2, m3
    addps     m2, m3
    movlps [samplesq+cnt1q], m2
%endif
%endif; ARCH

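; Second half of the block: the same coefficient groups are paired with the
; forward-order samples (m5/m4), i.e. each coefficient group is applied in
; reverse, and the results are stored back-to-front through cnt2.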
%if ARCH_X86_64
%if cpuflag(fma3)
    mulps     m8, m5
    mulps    m10, m5
    mulps    m12, m5
    mulps    m14, m5
    fmaddps   m8, m4, m9, m8
    fmaddps  m10, m4, m11, m10
    fmaddps  m12, m4, m13, m12
    fmaddps  m14, m4, m15, m14

    haddps   m10, m8
    haddps   m14, m12
    haddps   m14, m10
    movaps [samplesq+cnt2q], m14
%else
    mulps     m8, m5
    mulps     m9, m4
    mulps    m10, m5
    mulps    m11, m4
    addps     m8, m9
    addps    m10, m11

    unpckhps m11, m10, m8
    unpcklps m10, m8
    addps    m11, m10
    movhlps   m8, m11
    addps     m8, m11
    movlps [samplesq+cnt2q], m8
%endif
%else ; ARCH_X86_32
%if cpuflag(fma3)
    mulps     m0, m5, [coeffq+cnt1q*8    ]
    mulps     m1, m5, [coeffq+cnt1q*8+32 ]
    mulps     m2, m5, [coeffq+cnt1q*8+64 ]
    mulps     m3, m5, [coeffq+cnt1q*8+96 ]
    fmaddps   m0, m4, [coeffq+cnt1q*8+16 ], m0
    fmaddps   m1, m4, [coeffq+cnt1q*8+48 ], m1
    fmaddps   m2, m4, [coeffq+cnt1q*8+80 ], m2
    fmaddps   m3, m4, [coeffq+cnt1q*8+112], m3

    haddps    m1, m0
    haddps    m3, m2
    haddps    m3, m1
    movaps [samplesq+cnt2q], m3
%else
    mulps     m0, m5, [coeffq+cnt1q*8   ]
    mulps     m1, m4, [coeffq+cnt1q*8+16]
    mulps     m2, m5, [coeffq+cnt1q*8+32]
    mulps     m3, m4, [coeffq+cnt1q*8+48]
    addps     m0, m1
    addps     m2, m3

    unpckhps  m3, m2, m0
    unpcklps  m2, m0
    addps     m3, m2
    movhlps   m0, m3
    addps     m0, m3
    movlps [samplesq+cnt2q], m0
%endif
%endif; ARCH

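; cnt1 ascends through the first half of the block while cnt2 descends through
; the second; the inner loop ends when cnt1 reaches zero.  The outer loop then
; steps to the next LFE sample and the next 64 output samples.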
    sub    cnt2d, 8 + FMA3_OFFSET
    add    cnt1q, 8 + FMA3_OFFSET
    jl .inner_loop

    add     lfeq, 4
    add samplesq,  64*sizeof_float
    mov    cnt1q, -32*sizeof_float
    mov    cnt2d,  32*sizeof_float-8-FMA3_OFFSET
    sub nblocksd, 1
    jg .loop
    RET
%endmacro

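; x86_64 implies SSE2, so the plain SSE version is only built for x86_32.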
%if ARCH_X86_32
INIT_XMM sse
LFE_FIR0_FLOAT
%endif
INIT_XMM sse2
LFE_FIR0_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR0_FLOAT
%endif
%if HAVE_FMA3_EXTERNAL
INIT_XMM fma3
LFE_FIR0_FLOAT
%endif

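; lfe_fir1_float(samples, lfe, coeff, nblocks)
;
; Same scheme with 128 PCM samples per decimated LFE sample and 4-tap dot
; products against lfe[0..-3]: the first 64 outputs of a block are indexed by
; cnt1, the last 64 are written backwards through cnt2.  haddps is used
; throughout, so the baseline version requires SSE3.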
%macro LFE_FIR1_FLOAT 0
cglobal lfe_fir1_float, 4, 6, 10, samples, lfe, coeff, nblocks, cnt1, cnt2
    shr nblocksd, 2
    sub     lfeq, 3*sizeof_float
    mov    cnt1d, 64*sizeof_float
    mov    cnt2d, 64*sizeof_float-16
    lea   coeffq, [coeffq+cnt1q*4]
    add samplesq, cnt1q
    neg    cnt1q

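; Convert the current LFE sample and the 3 preceding ones (int32) to float:
; m4 keeps the memory order lfe[-3..0], m5 is the element-reversed copy.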
.loop:
%if cpuflag(avx)
    cvtdq2ps  m4, [lfeq]
    shufps    m5, m4, m4, q0123
%elif cpuflag(sse2)
    movu      m4, [lfeq]
    cvtdq2ps  m4, m4
    pshufd    m5, m4, q0123
%endif

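; Each 16-byte coefficient group is one 4-tap output: four groups per
; iteration are reduced with haddps, first against the reversed samples (m5)
; for the cnt1-indexed half, then against the forward samples (m4) for the
; cnt2-indexed half.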
.inner_loop:
    movaps    m6, [coeffq+cnt1q*4   ]
    movaps    m7, [coeffq+cnt1q*4+16]
    mulps     m0, m5, m6
    mulps     m1, m5, m7
%if ARCH_X86_64
    movaps    m8, [coeffq+cnt1q*4+32]
    movaps    m9, [coeffq+cnt1q*4+48]
    mulps     m2, m5, m8
    mulps     m3, m5, m9
%else
    mulps     m2, m5, [coeffq+cnt1q*4+32]
    mulps     m3, m5, [coeffq+cnt1q*4+48]
%endif

    haddps    m0, m1
    haddps    m2, m3
    haddps    m0, m2
    movaps [samplesq+cnt1q], m0

    mulps     m6, m4
    mulps     m7, m4
%if ARCH_X86_64
    mulps     m8, m4
    mulps     m9, m4

    haddps    m6, m7
    haddps    m8, m9
    haddps    m6, m8
%else
    mulps     m2, m4, [coeffq+cnt1q*4+32]
    mulps     m3, m4, [coeffq+cnt1q*4+48]

    haddps    m6, m7
    haddps    m2, m3
    haddps    m6, m2
%endif
    movaps [samplesq+cnt2q], m6

    sub    cnt2d, 16
    add    cnt1q, 16
    jl .inner_loop

    add     lfeq, sizeof_float
    add samplesq, 128*sizeof_float
    mov    cnt1q, -64*sizeof_float
    mov    cnt2d,  64*sizeof_float-16
    sub nblocksd, 1
    jg .loop
    RET
%endmacro

INIT_XMM sse3
LFE_FIR1_FLOAT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
LFE_FIR1_FLOAT
%endif