/*
 * commit '12655c48049f9a52e5504bde90fe738862b0ff08':
 *   libavresample: NEON optimized FIR audio resampling
 * Merged-by: Michael Niedermayer <michaelni@gmx.at>
 */
/*
 * Copyright (c) 2014 Peter Meerwald <pmeerw@pmeerw.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

#include "asm-offsets.h"
 | |
| .macro resample_one     fmt, es=2
 | |
| function ff_resample_one_\fmt\()_neon, export=1
 | |
|         push            {r4, r5}
 | |
|         add             r1, r1, r2, lsl #\es
 | |
| 
 | |
|         ldr             r2, [r0, #PHASE_SHIFT+4] /* phase_mask */
 | |
|         ldr             ip, [sp, #8] /* index */
 | |
|         ldr             r5, [r0, #FILTER_LENGTH]
 | |
|         and             r2, ip, r2 /* (index & phase_mask) */
 | |
|         ldr             r4, [r0, #PHASE_SHIFT]
 | |
|         lsr             r4, ip, r4 /* compute sample_index */
 | |
|         mul             r2, r2, r5
 | |
| 
 | |
|         ldr             ip, [r0, #FILTER_BANK]
 | |
|         add             r3, r3, r4, lsl #\es /* &src[sample_index] */
 | |
| 
 | |
|         cmp             r5, #8
 | |
|         add             r0, ip, r2, lsl #\es /* filter = &filter_bank[...] */
 | |
| 
 | |
|         blt             5f
 | |
| 8:
 | |
|         subs            r5, r5, #8
 | |
|         LOAD4
 | |
|         MUL4
 | |
| 7:
 | |
|         LOAD4
 | |
|         beq             6f
 | |
|         cmp             r5, #8
 | |
|         MLA4
 | |
|         blt             4f
 | |
|         subs            r5, r5, #8
 | |
|         LOAD4
 | |
|         MLA4
 | |
|         b               7b
 | |
| 6:
 | |
|         MLA4
 | |
|         STORE
 | |
|         pop             {r4, r5}
 | |
|         bx              lr
 | |
| 5:
 | |
|         INIT4
 | |
| 4:      /* remaining filter_length 1 to 7 */
 | |
|         cmp             r5, #4
 | |
|         blt             2f
 | |
|         subs            r5, r5, #4
 | |
|         LOAD4
 | |
|         MLA4
 | |
|         beq             0f
 | |
| 2:      /* remaining filter_length 1 to 3 */
 | |
|         cmp             r5, #2
 | |
|         blt             1f
 | |
|         subs            r5, r5, #2
 | |
|         LOAD2
 | |
|         MLA2
 | |
|         beq             0f
 | |
| 1:      /* remaining filter_length 1 */
 | |
|         LOAD1
 | |
|         MLA1
 | |
| 0:
 | |
|         STORE
 | |
|         pop             {r4, r5}
 | |
|         bx              lr
 | |
| endfunc
 | |
| 
 | |
| .purgem LOAD1
 | |
| .purgem LOAD2
 | |
| .purgem LOAD4
 | |
| .purgem MLA1
 | |
| .purgem MLA2
 | |
| .purgem MLA4
 | |
| .purgem MUL4
 | |
| .purgem INIT4
 | |
| .purgem STORE
 | |
| .endm
 | |
| 
 | |
| 
 | |
| /* float32 */
 | |
| .macro  LOAD1
 | |
|         veor.32         d0, d0
 | |
|         vld1.32         {d0[0]}, [r0]! /* load filter */
 | |
|         vld1.32         {d4[0]}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  LOAD2
 | |
|         vld1.32         {d0}, [r0]! /* load filter */
 | |
|         vld1.32         {d4}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  LOAD4
 | |
|         vld1.32         {d0,d1}, [r0]! /* load filter */
 | |
|         vld1.32         {d4,d5}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  MLA1
 | |
|         vmla.f32        d16, d0, d4[0]
 | |
| .endm
 | |
| .macro  MLA2
 | |
|         vmla.f32        d16, d0, d4
 | |
| .endm
 | |
| .macro  MLA4
 | |
|         vmla.f32        d16, d0, d4
 | |
|         vmla.f32        d17, d1, d5
 | |
| .endm
 | |
| .macro  MUL4
 | |
|         vmul.f32        d16, d0, d4
 | |
|         vmul.f32        d17, d1, d5
 | |
| .endm
 | |
| .macro  INIT4
 | |
|         veor.f32        q8, q8
 | |
| .endm
 | |
| .macro  STORE
 | |
|         vpadd.f32       d16, d16, d17
 | |
|         vpadd.f32       d16, d16, d16
 | |
|         vst1.32         d16[0], [r1]
 | |
| .endm
 | |
| 
 | |
| resample_one flt, 2
 | |
| 
 | |
| 
 | |
| /* s32 */
 | |
| .macro  LOAD1
 | |
|         veor.32         d0, d0
 | |
|         vld1.32         {d0[0]}, [r0]! /* load filter */
 | |
|         vld1.32         {d4[0]}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  LOAD2
 | |
|         vld1.32         {d0}, [r0]! /* load filter */
 | |
|         vld1.32         {d4}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  LOAD4
 | |
|         vld1.32         {d0,d1}, [r0]! /* load filter */
 | |
|         vld1.32         {d4,d5}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  MLA1
 | |
|         vmlal.s32       q8, d0, d4[0]
 | |
| .endm
 | |
| .macro  MLA2
 | |
|         vmlal.s32       q8, d0, d4
 | |
| .endm
 | |
| .macro  MLA4
 | |
|         vmlal.s32       q8, d0, d4
 | |
|         vmlal.s32       q9, d1, d5
 | |
| .endm
 | |
| .macro  MUL4
 | |
|         vmull.s32       q8, d0, d4
 | |
|         vmull.s32       q9, d1, d5
 | |
| .endm
 | |
| .macro  INIT4
 | |
|         veor.s64        q8, q8
 | |
|         veor.s64        q9, q9
 | |
| .endm
 | |
| .macro  STORE
 | |
|         vadd.s64        q8, q8, q9
 | |
|         vadd.s64        d16, d16, d17
 | |
|         vqrshrn.s64     d16, q8, #30
 | |
|         vst1.32         d16[0], [r1]
 | |
| .endm
 | |
| 
 | |
| resample_one s32, 2
 | |
| 
 | |
| 
 | |
| /* s16 */
 | |
| .macro  LOAD1
 | |
|         veor.16         d0, d0
 | |
|         vld1.16         {d0[0]}, [r0]! /* load filter */
 | |
|         vld1.16         {d4[0]}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  LOAD2
 | |
|         veor.16         d0, d0
 | |
|         vld1.32         {d0[0]}, [r0]! /* load filter */
 | |
|         veor.16         d4, d4
 | |
|         vld1.32         {d4[0]}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  LOAD4
 | |
|         vld1.16         {d0}, [r0]! /* load filter */
 | |
|         vld1.16         {d4}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  MLA1
 | |
|         vmlal.s16       q8, d0, d4[0]
 | |
| .endm
 | |
| .macro  MLA2
 | |
|         vmlal.s16       q8, d0, d4
 | |
| .endm
 | |
| .macro  MLA4
 | |
|         vmlal.s16       q8, d0, d4
 | |
| .endm
 | |
| .macro  MUL4
 | |
|         vmull.s16       q8, d0, d4
 | |
| .endm
 | |
| .macro  INIT4
 | |
|         veor.s32        q8, q8
 | |
| .endm
 | |
| .macro  STORE
 | |
|         vpadd.s32       d16, d16, d17
 | |
|         vpadd.s32       d16, d16, d16
 | |
|         vqrshrn.s32     d16, q8, #15
 | |
|         vst1.16         d16[0], [r1]
 | |
| .endm
 | |
| 
 | |
| resample_one s16, 1
 | |
| 
 | |
| 
 | |
| .macro resample_linear  fmt, es=2
 | |
| function ff_resample_linear_\fmt\()_neon, export=1
 | |
|         push            {r4, r5}
 | |
|         add             r1, r1, r2, lsl #\es
 | |
| 
 | |
|         ldr             r2, [r0, #PHASE_SHIFT+4] /* phase_mask */
 | |
|         ldr             ip, [sp, #8] /* index */
 | |
|         ldr             r5, [r0, #FILTER_LENGTH]
 | |
|         and             r2, ip, r2 /* (index & phase_mask) */
 | |
|         ldr             r4, [r0, #PHASE_SHIFT]
 | |
|         lsr             r4, ip, r4 /* compute sample_index */
 | |
|         mul             r2, r2, r5
 | |
| 
 | |
|         ldr             ip, [r0, #FILTER_BANK]
 | |
|         add             r3, r3, r4, lsl #\es /* &src[sample_index] */
 | |
| 
 | |
|         cmp             r5, #8
 | |
|         ldr             r4, [r0, #SRC_INCR]
 | |
|         add             r0, ip, r2, lsl #\es /* filter = &filter_bank[...] */
 | |
|         add             r2, r0, r5, lsl #\es /* filter[... + c->filter_length] */
 | |
| 
 | |
|         blt             5f
 | |
| 8:
 | |
|         subs            r5, r5, #8
 | |
|         LOAD4
 | |
|         MUL4
 | |
| 7:
 | |
|         LOAD4
 | |
|         beq             6f
 | |
|         cmp             r5, #8
 | |
|         MLA4
 | |
|         blt             4f
 | |
|         subs            r5, r5, #8
 | |
|         LOAD4
 | |
|         MLA4
 | |
|         b               7b
 | |
| 6:
 | |
|         MLA4
 | |
|         STORE
 | |
|         pop             {r4, r5}
 | |
|         bx              lr
 | |
| 5:
 | |
|         INIT4
 | |
| 4:      /* remaining filter_length 1 to 7 */
 | |
|         cmp             r5, #4
 | |
|         blt             2f
 | |
|         subs            r5, r5, #4
 | |
|         LOAD4
 | |
|         MLA4
 | |
|         beq             0f
 | |
| 2:      /* remaining filter_length 1 to 3 */
 | |
|         cmp             r5, #2
 | |
|         blt             1f
 | |
|         subs            r5, r5, #2
 | |
|         LOAD2
 | |
|         MLA2
 | |
|         beq             0f
 | |
| 1:      /* remaining filter_length 1 */
 | |
|         LOAD1
 | |
|         MLA1
 | |
| 0:
 | |
|         STORE
 | |
|         pop             {r4, r5}
 | |
|         bx              lr
 | |
| endfunc
 | |
| 
 | |
| .purgem LOAD1
 | |
| .purgem LOAD2
 | |
| .purgem LOAD4
 | |
| .purgem MLA1
 | |
| .purgem MLA2
 | |
| .purgem MLA4
 | |
| .purgem MUL4
 | |
| .purgem INIT4
 | |
| .purgem STORE
 | |
| .endm
 | |
| 
 | |
| 
 | |
| /* float32 linear */
 | |
| .macro  LOAD1
 | |
|         veor.32         d0, d0
 | |
|         veor.32         d2, d2
 | |
|         vld1.32         {d0[0]}, [r0]! /* load filter */
 | |
|         vld1.32         {d2[0]}, [r2]! /* load filter */
 | |
|         vld1.32         {d4[0]}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  LOAD2
 | |
|         vld1.32         {d0}, [r0]! /* load filter */
 | |
|         vld1.32         {d2}, [r2]! /* load filter */
 | |
|         vld1.32         {d4}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  LOAD4
 | |
|         vld1.32         {d0,d1}, [r0]! /* load filter */
 | |
|         vld1.32         {d2,d3}, [r2]! /* load filter */
 | |
|         vld1.32         {d4,d5}, [r3]! /* load src */
 | |
| .endm
 | |
| .macro  MLA1
 | |
|         vmla.f32        d18, d0, d4[0]
 | |
|         vmla.f32        d16, d2, d4[0]
 | |
| .endm
 | |
| .macro  MLA2
 | |
|         vmla.f32        d18, d0, d4
 | |
|         vmla.f32        d16, d2, d4
 | |
| .endm
 | |
| .macro  MLA4
 | |
|         vmla.f32        q9, q0, q2
 | |
|         vmla.f32        q8, q1, q2
 | |
| .endm
 | |
| .macro  MUL4
 | |
|         vmul.f32        q9, q0, q2
 | |
|         vmul.f32        q8, q1, q2
 | |
| .endm
 | |
| .macro  INIT4
 | |
|         veor.f32        q9, q9
 | |
|         veor.f32        q8, q8
 | |
| .endm
 | |
| .macro  STORE
 | |
|         vldr            s0, [sp, #12] /* frac */
 | |
|         vmov            s1, r4
 | |
|         vcvt.f32.s32    d0, d0
 | |
| 
 | |
|         vsub.f32        q8, q8, q9 /* v2 - val */
 | |
|         vpadd.f32       d18, d18, d19
 | |
|         vpadd.f32       d16, d16, d17
 | |
|         vpadd.f32       d2, d18, d18
 | |
|         vpadd.f32       d1, d16, d16
 | |
| 
 | |
|         vmul.f32        s2, s2, s0 /* (v2 - val) * frac */
 | |
|         vdiv.f32        s2, s2, s1 /* / c->src_incr */
 | |
|         vadd.f32        s4, s4, s2
 | |
| 
 | |
|         vstr            s4, [r1]
 | |
| .endm
 | |
| 
 | |
| resample_linear flt, 2
 |