* commit '12655c48049f9a52e5504bde90fe738862b0ff08':
  libavresample: NEON optimized FIR audio resampling

Merged-by: Michael Niedermayer <michaelni@gmx.at>

359 lines | 9.0 KiB | ArmAsm

/*
 * Copyright (c) 2014 Peter Meerwald <pmeerw@pmeerw.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

#include "asm-offsets.h"

.macro resample_one     fmt, es=2
function ff_resample_one_\fmt\()_neon, export=1
        push            {r4, r5}
        add             r1, r1, r2, lsl #\es

        ldr             r2, [r0, #PHASE_SHIFT+4] /* phase_mask */
        ldr             ip, [sp, #8] /* index */
        ldr             r5, [r0, #FILTER_LENGTH]
        and             r2, ip, r2 /* (index & phase_mask) */
        ldr             r4, [r0, #PHASE_SHIFT]
        lsr             r4, ip, r4 /* compute sample_index */
        mul             r2, r2, r5

        ldr             ip, [r0, #FILTER_BANK]
        add             r3, r3, r4, lsl #\es /* &src[sample_index] */

        cmp             r5, #8
        add             r0, ip, r2, lsl #\es /* filter = &filter_bank[...] */

        blt             5f
8:
        subs            r5, r5, #8
        LOAD4
        MUL4
7:
        LOAD4
        beq             6f
        cmp             r5, #8
        MLA4
        blt             4f
        subs            r5, r5, #8
        LOAD4
        MLA4
        b               7b
6:
        MLA4
        STORE
        pop             {r4, r5}
        bx              lr
5:
        INIT4
4:      /* remaining filter_length 1 to 7 */
        cmp             r5, #4
        blt             2f
        subs            r5, r5, #4
        LOAD4
        MLA4
        beq             0f
2:      /* remaining filter_length 1 to 3 */
        cmp             r5, #2
        blt             1f
        subs            r5, r5, #2
        LOAD2
        MLA2
        beq             0f
1:      /* remaining filter_length 1 */
        LOAD1
        MLA1
0:
        STORE
        pop             {r4, r5}
        bx              lr
endfunc

.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem MLA1
.purgem MLA2
.purgem MLA4
.purgem MUL4
.purgem INIT4
.purgem STORE
.endm
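
/*
 * Each sample format supplies the same macro set before instantiating
 * resample_one: LOAD1/LOAD2/LOAD4 fetch 1, 2 or 4 filter taps and matching
 * source samples (filter into d0/d1 via r0, source into d4/d5 via r3, both
 * post-incremented), MUL4 starts the accumulation, MLA1/MLA2/MLA4
 * multiply-accumulate, INIT4 zeroes the accumulator for the short-filter
 * path, and STORE reduces the partial sums to one output sample at [r1].
 * resample_one purges the macros at the end of its expansion so the next
 * format can redefine them.
 */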

/* float32 */
.macro  LOAD1
        veor.32         d0, d0
        vld1.32         {d0[0]}, [r0]! /* load filter */
        vld1.32         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD2
        vld1.32         {d0}, [r0]! /* load filter */
        vld1.32         {d4}, [r3]! /* load src */
.endm
.macro  LOAD4
        vld1.32         {d0,d1}, [r0]! /* load filter */
        vld1.32         {d4,d5}, [r3]! /* load src */
.endm
.macro  MLA1
        vmla.f32        d16, d0, d4[0]
.endm
.macro  MLA2
        vmla.f32        d16, d0, d4
.endm
.macro  MLA4
        vmla.f32        d16, d0, d4
        vmla.f32        d17, d1, d5
.endm
.macro  MUL4
        vmul.f32        d16, d0, d4
        vmul.f32        d17, d1, d5
.endm
.macro  INIT4
        veor.f32        q8, q8
.endm
.macro  STORE
        vpadd.f32       d16, d16, d17
        vpadd.f32       d16, d16, d16
        vst1.32         d16[0], [r1]
.endm

resample_one flt, 2
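
/* The flt instantiation passes es=2, i.e. addresses scale by 4-byte
 * elements (single-precision float). Its STORE folds the q8 accumulator
 * down to a scalar with two pairwise adds (vpadd) before storing. */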

/* s32 */
.macro  LOAD1
        veor.32         d0, d0
        vld1.32         {d0[0]}, [r0]! /* load filter */
        vld1.32         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD2
        vld1.32         {d0}, [r0]! /* load filter */
        vld1.32         {d4}, [r3]! /* load src */
.endm
.macro  LOAD4
        vld1.32         {d0,d1}, [r0]! /* load filter */
        vld1.32         {d4,d5}, [r3]! /* load src */
.endm
.macro  MLA1
        vmlal.s32       q8, d0, d4[0]
.endm
.macro  MLA2
        vmlal.s32       q8, d0, d4
.endm
.macro  MLA4
        vmlal.s32       q8, d0, d4
        vmlal.s32       q9, d1, d5
.endm
.macro  MUL4
        vmull.s32       q8, d0, d4
        vmull.s32       q9, d1, d5
.endm
.macro  INIT4
        veor.s64        q8, q8
        veor.s64        q9, q9
.endm
.macro  STORE
        vadd.s64        q8, q8, q9
        vadd.s64        d16, d16, d17
        vqrshrn.s64     d16, q8, #30
        vst1.32         d16[0], [r1]
.endm

resample_one s32, 2
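
/* The s32 variant widens into 64-bit accumulators (vmull/vmlal.s32 into
 * q8/q9) and narrows back with vqrshrn.s64 #30, a rounding, saturating
 * right shift, consistent with Q30 fixed-point filter coefficients (an
 * inference from the shift amount, not stated in this file). */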

/* s16 */
.macro  LOAD1
        veor.16         d0, d0
        vld1.16         {d0[0]}, [r0]! /* load filter */
        vld1.16         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD2
        veor.16         d0, d0
        vld1.32         {d0[0]}, [r0]! /* load filter */
        veor.16         d4, d4
        vld1.32         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD4
        vld1.16         {d0}, [r0]! /* load filter */
        vld1.16         {d4}, [r3]! /* load src */
.endm
.macro  MLA1
        vmlal.s16       q8, d0, d4[0]
.endm
.macro  MLA2
        vmlal.s16       q8, d0, d4
.endm
.macro  MLA4
        vmlal.s16       q8, d0, d4
.endm
.macro  MUL4
        vmull.s16       q8, d0, d4
.endm
.macro  INIT4
        veor.s32        q8, q8
.endm
.macro  STORE
        vpadd.s32       d16, d16, d17
        vpadd.s32       d16, d16, d16
        vqrshrn.s32     d16, q8, #15
        vst1.16         d16[0], [r1]
.endm

resample_one s16, 1
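
/* The s16 variant passes es=1 (2-byte elements). Products are widened to
 * 32 bits (vmull/vmlal.s16 into q8) and the result is narrowed with
 * vqrshrn.s32 #15, consistent with Q15 filter coefficients (again
 * inferred from the shift amount). */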

.macro resample_linear  fmt, es=2
function ff_resample_linear_\fmt\()_neon, export=1
        push            {r4, r5}
        add             r1, r1, r2, lsl #\es

        ldr             r2, [r0, #PHASE_SHIFT+4] /* phase_mask */
        ldr             ip, [sp, #8] /* index */
        ldr             r5, [r0, #FILTER_LENGTH]
        and             r2, ip, r2 /* (index & phase_mask) */
        ldr             r4, [r0, #PHASE_SHIFT]
        lsr             r4, ip, r4 /* compute sample_index */
        mul             r2, r2, r5

        ldr             ip, [r0, #FILTER_BANK]
        add             r3, r3, r4, lsl #\es /* &src[sample_index] */

        cmp             r5, #8
        ldr             r4, [r0, #SRC_INCR]
        add             r0, ip, r2, lsl #\es /* filter = &filter_bank[...] */
        add             r2, r0, r5, lsl #\es /* filter[... + c->filter_length] */

        blt             5f
8:
        subs            r5, r5, #8
        LOAD4
        MUL4
7:
        LOAD4
        beq             6f
        cmp             r5, #8
        MLA4
        blt             4f
        subs            r5, r5, #8
        LOAD4
        MLA4
        b               7b
6:
        MLA4
        STORE
        pop             {r4, r5}
        bx              lr
5:
        INIT4
4:      /* remaining filter_length 1 to 7 */
        cmp             r5, #4
        blt             2f
        subs            r5, r5, #4
        LOAD4
        MLA4
        beq             0f
2:      /* remaining filter_length 1 to 3 */
        cmp             r5, #2
        blt             1f
        subs            r5, r5, #2
        LOAD2
        MLA2
        beq             0f
1:      /* remaining filter_length 1 */
        LOAD1
        MLA1
0:
        STORE
        pop             {r4, r5}
        bx              lr
endfunc

.purgem LOAD1
.purgem LOAD2
.purgem LOAD4
.purgem MLA1
.purgem MLA2
.purgem MLA4
.purgem MUL4
.purgem INIT4
.purgem STORE
.endm
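
/*
 * resample_linear additionally evaluates the filter of the next phase
 * (filter2 = filter + c->filter_length, kept in r2) and linearly
 * interpolates between the two results. A sketch of the scalar logic,
 * under the same prototype assumptions as resample_one above:
 *
 *     val = 0; v2 = 0;
 *     for (i = 0; i < c->filter_length; i++) {
 *         val += src[sample_index + i] * filter[i];
 *         v2  += src[sample_index + i] * filter2[i];
 *     }
 *     val += (v2 - val) * frac / c->src_incr;
 *     dst[dst_index] = val;
 *
 * In the float implementation below, q9 accumulates val, q8 accumulates
 * v2, and STORE performs the interpolation in scalar VFP registers (frac
 * and src_incr are converted to float, and vdiv supplies the division).
 */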

/* float32 linear */
.macro  LOAD1
        veor.32         d0, d0
        veor.32         d2, d2
        vld1.32         {d0[0]}, [r0]! /* load filter */
        vld1.32         {d2[0]}, [r2]! /* load filter */
        vld1.32         {d4[0]}, [r3]! /* load src */
.endm
.macro  LOAD2
        vld1.32         {d0}, [r0]! /* load filter */
        vld1.32         {d2}, [r2]! /* load filter */
        vld1.32         {d4}, [r3]! /* load src */
.endm
.macro  LOAD4
        vld1.32         {d0,d1}, [r0]! /* load filter */
        vld1.32         {d2,d3}, [r2]! /* load filter */
        vld1.32         {d4,d5}, [r3]! /* load src */
.endm
.macro  MLA1
        vmla.f32        d18, d0, d4[0]
        vmla.f32        d16, d2, d4[0]
.endm
.macro  MLA2
        vmla.f32        d18, d0, d4
        vmla.f32        d16, d2, d4
.endm
.macro  MLA4
        vmla.f32        q9, q0, q2
        vmla.f32        q8, q1, q2
.endm
.macro  MUL4
        vmul.f32        q9, q0, q2
        vmul.f32        q8, q1, q2
.endm
.macro  INIT4
        veor.f32        q9, q9
        veor.f32        q8, q8
.endm
.macro  STORE
        vldr            s0, [sp, #12] /* frac */
        vmov            s1, r4
        vcvt.f32.s32    d0, d0

        vsub.f32        q8, q8, q9 /* v2 - val */
        vpadd.f32       d18, d18, d19
        vpadd.f32       d16, d16, d17
        vpadd.f32       d2, d18, d18
        vpadd.f32       d1, d16, d16

        vmul.f32        s2, s2, s0 /* (v2 - val) * frac */
        vdiv.f32        s2, s2, s1 /* / c->src_incr */
        vadd.f32        s4, s4, s2

        vstr            s4, [r1]
.endm

resample_linear flt, 2
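
/* Note: only the float variant of the linear-interpolation resampler is
 * instantiated above; the s16 and s32 formats get resample_one only. */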