Merge remote-tracking branch 'qatar/master'
* qatar/master:
  arm: Add VFP-accelerated version of qmf_32_subbands

Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
		
						commit
						23f250d2bc
					
				| @ -26,6 +26,12 @@ | ||||
| 
 | ||||
| void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs, | ||||
|                         int decifactor, float scale); | ||||
| void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act, | ||||
|                                 SynthFilterContext *synth, FFTContext *imdct, | ||||
|                                 float synth_buf_ptr[512], | ||||
|                                 int *synth_buf_offset, float synth_buf2[32], | ||||
|                                 const float window[512], float *samples_out, | ||||
|                                 float raXin[32], float scale); | ||||
| void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs, | ||||
|                          int decifactor, float scale); | ||||
| 
 | ||||
| @ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s) | ||||
| { | ||||
|     int cpu_flags = av_get_cpu_flags(); | ||||
| 
 | ||||
|     if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) | ||||
|     if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) { | ||||
|         s->lfe_fir = ff_dca_lfe_fir_vfp; | ||||
|         s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp; | ||||
|     } | ||||
|     if (have_neon(cpu_flags)) | ||||
|         s->lfe_fir = ff_dca_lfe_fir_neon; | ||||
| } | ||||
|  | ||||
| @ -218,3 +218,276 @@ endfunc | ||||
|         .unreq  POST1
 | ||||
|         .unreq  POST2
 | ||||
|         .unreq  POST3
 | ||||
| 
 | ||||
| 
 | ||||
| IN      .req    a1 | ||||
| SBACT   .req    a2 | ||||
| OLDFPSCR .req   a3 | ||||
| IMDCT   .req    a4 | ||||
| WINDOW  .req    v1 | ||||
| OUT     .req    v2 | ||||
| BUF     .req    v3 | ||||
| SCALEINT .req   v4 @ only used in softfp case
 | ||||
| COUNT   .req    v5 | ||||
| 
 | ||||
| SCALE   .req    s0 | ||||
| 
 | ||||
| /* Stack layout (listed from highest address to lowest) differs in softfp and hardfp cases: | ||||
|  * | ||||
|  * hardfp | ||||
|  *      fp -> 6 arg words saved by caller | ||||
|  *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes) | ||||
|  *            s16-s23 on entry | ||||
|  *            align 16 | ||||
|  *     buf -> 8*32*4 bytes buffer | ||||
|  *            s0 on entry | ||||
|  *      sp -> 3 arg words for callee | ||||
|  * | ||||
|  * softfp | ||||
|  *      fp -> 7 arg words saved by caller | ||||
|  *            a4,v1-v5,fp,lr on entry | ||||
|  *            s16-s23 on entry | ||||
|  *            align 16 | ||||
|  *     buf -> 8*32*4 bytes buffer | ||||
|  *      sp -> 4 arg words for callee | ||||
|  */ | ||||
| 
 | ||||
| /* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act, | ||||
|  *                                 SynthFilterContext *synth, FFTContext *imdct, | ||||
|  *                                 float synth_buf_ptr[512], | ||||
|  *                                 int *synth_buf_offset, float synth_buf2[32], | ||||
|  *                                 const float window[512], float *samples_out, | ||||
|  *                                 float raXin[32], float scale); | ||||
|  * | ||||
|  * Builds an 8x32 float matrix in a 16-byte-aligned stack buffer and then | ||||
|  * calls ff_synth_filter_float_vfp once per 32-float row (8 calls in total), | ||||
|  * advancing the output pointer by 32 floats after each call. | ||||
|  * | ||||
|  * Buffer contents: buf[j][i] = +/- samples_in[i][j] for i < sb_act, and zero | ||||
|  * for the remaining columns up to 32. Within each group of four input rows | ||||
|  * the first and fourth are negated, the second and third are copied as is. | ||||
|  * Each vneg.f names a single register but is issued while FPSCR selects | ||||
|  * short vectors of length 4, stride 2 (see the FPSCR load below), so it is | ||||
|  * expected to negate four registers at once, e.g. s8, s10, s12 and s14. | ||||
|  * | ||||
|  * Phases, by local label: | ||||
|  *   1:    copy four input rows per iteration (two word pairs per output row) | ||||
|  *   2-3:  copy the last sb_act % 4 rows, pairing an odd last row with zero | ||||
|  *   4-5:  zero-fill the word pairs still left in each output row | ||||
|  *   6:    restore FPSCR, load window and samples_out from the caller stack | ||||
|  *         args, rewind BUF to the first row, reserve outgoing arg words | ||||
|  *   7:    per row: a1 = imdct (reloaded from its saved copy on the stack), | ||||
|  *         a2-a4 = synth_buf_ptr, synth_buf_offset, synth_buf2, stack args = | ||||
|  *         window, out, row pointer, with scale passed in s0 (hardfp) or as | ||||
|  *         a fourth stack word (softfp) | ||||
|  * | ||||
|  * The synth argument (a3) is never read: a3 is reused as OLDFPSCR. raXin is | ||||
|  * likewise never read, for the reason given in the comment at the stack | ||||
|  * buffer allocation. Before returning, s16-s23 and the pushed core registers | ||||
|  * are restored. FPSCR is restored at label 6, before the calls. | ||||
|  */ | ||||
| function ff_dca_qmf_32_subbands_vfp, export=1 | ||||
| VFP     push    {a3-a4,v1-v3,v5,fp,lr} | ||||
| NOVFP   push    {a4,v1-v5,fp,lr} | ||||
|         add     fp, sp, #8*4 | ||||
|         vpush   {s16-s23} | ||||
|         @ The buffer pointed at by raXin isn't big enough for us to do a
 | ||||
|         @ complete matrix transposition as we want to, so allocate an
 | ||||
|         @ alternative buffer from the stack. Align to 4 words for speed.
 | ||||
|         sub     BUF, sp, #8*32*4 | ||||
|         bic     BUF, BUF, #15 | ||||
|         mov     sp, BUF | ||||
|         ldr     lr, =0x03330000     @ RunFast mode, short vectors of length 4, stride 2
 | ||||
|         fmrx    OLDFPSCR, FPSCR | ||||
|         fmxr    FPSCR, lr | ||||
|         @ COUNT is used to count down 2 things at once:
 | ||||
|         @ bits 0-4 are the number of word pairs remaining in the output row
 | ||||
|         @ bits 5-31 are the number of words to copy (with possible negation)
 | ||||
|         @   from the source matrix before we start zeroing the remainder
 | ||||
|         mov     COUNT, #(-4 << 5) + 16 | ||||
|         adds    COUNT, COUNT, SBACT, lsl #5 | ||||
|         bmi     2f | ||||
| 1: | ||||
|         vldr    s8,  [IN, #(0*8+0)*4] | ||||
|         vldr    s10, [IN, #(0*8+1)*4] | ||||
|         vldr    s12, [IN, #(0*8+2)*4] | ||||
|         vldr    s14, [IN, #(0*8+3)*4] | ||||
|         vldr    s16, [IN, #(0*8+4)*4] | ||||
|         vldr    s18, [IN, #(0*8+5)*4] | ||||
|         vldr    s20, [IN, #(0*8+6)*4] | ||||
|         vldr    s22, [IN, #(0*8+7)*4] | ||||
|         vneg.f  s8, s8 | ||||
|         vldr    s9,  [IN, #(1*8+0)*4] | ||||
|         vldr    s11, [IN, #(1*8+1)*4] | ||||
|         vldr    s13, [IN, #(1*8+2)*4] | ||||
|         vldr    s15, [IN, #(1*8+3)*4] | ||||
|         vneg.f  s16, s16 | ||||
|         vldr    s17, [IN, #(1*8+4)*4] | ||||
|         vldr    s19, [IN, #(1*8+5)*4] | ||||
|         vldr    s21, [IN, #(1*8+6)*4] | ||||
|         vldr    s23, [IN, #(1*8+7)*4] | ||||
|         vstr    d4,  [BUF, #(0*32+0)*4] | ||||
|         vstr    d5,  [BUF, #(1*32+0)*4] | ||||
|         vstr    d6,  [BUF, #(2*32+0)*4] | ||||
|         vstr    d7,  [BUF, #(3*32+0)*4] | ||||
|         vstr    d8,  [BUF, #(4*32+0)*4] | ||||
|         vstr    d9,  [BUF, #(5*32+0)*4] | ||||
|         vstr    d10, [BUF, #(6*32+0)*4] | ||||
|         vstr    d11, [BUF, #(7*32+0)*4] | ||||
|         vldr    s9,  [IN, #(3*8+0)*4] | ||||
|         vldr    s11, [IN, #(3*8+1)*4] | ||||
|         vldr    s13, [IN, #(3*8+2)*4] | ||||
|         vldr    s15, [IN, #(3*8+3)*4] | ||||
|         vldr    s17, [IN, #(3*8+4)*4] | ||||
|         vldr    s19, [IN, #(3*8+5)*4] | ||||
|         vldr    s21, [IN, #(3*8+6)*4] | ||||
|         vldr    s23, [IN, #(3*8+7)*4] | ||||
|         vneg.f  s9, s9 | ||||
|         vldr    s8,  [IN, #(2*8+0)*4] | ||||
|         vldr    s10, [IN, #(2*8+1)*4] | ||||
|         vldr    s12, [IN, #(2*8+2)*4] | ||||
|         vldr    s14, [IN, #(2*8+3)*4] | ||||
|         vneg.f  s17, s17 | ||||
|         vldr    s16, [IN, #(2*8+4)*4] | ||||
|         vldr    s18, [IN, #(2*8+5)*4] | ||||
|         vldr    s20, [IN, #(2*8+6)*4] | ||||
|         vldr    s22, [IN, #(2*8+7)*4] | ||||
|         vstr    d4,  [BUF, #(0*32+2)*4] | ||||
|         vstr    d5,  [BUF, #(1*32+2)*4] | ||||
|         vstr    d6,  [BUF, #(2*32+2)*4] | ||||
|         vstr    d7,  [BUF, #(3*32+2)*4] | ||||
|         vstr    d8,  [BUF, #(4*32+2)*4] | ||||
|         vstr    d9,  [BUF, #(5*32+2)*4] | ||||
|         vstr    d10, [BUF, #(6*32+2)*4] | ||||
|         vstr    d11, [BUF, #(7*32+2)*4] | ||||
|         add     IN, IN, #4*8*4 | ||||
|         add     BUF, BUF, #4*4 | ||||
|         subs    COUNT, COUNT, #(4 << 5) + 2 | ||||
|         bpl     1b | ||||
| 2:      @ Now deal with trailing < 4 samples
 | ||||
|         adds    COUNT, COUNT, #3 << 5 | ||||
|         bmi     4f  @ sb_act was a multiple of 4
 | ||||
|         bics    lr, COUNT, #0x1F | ||||
|         bne     3f | ||||
|         @ sb_act was n*4+1
 | ||||
|         vldr    s8,  [IN, #(0*8+0)*4] | ||||
|         vldr    s10, [IN, #(0*8+1)*4] | ||||
|         vldr    s12, [IN, #(0*8+2)*4] | ||||
|         vldr    s14, [IN, #(0*8+3)*4] | ||||
|         vldr    s16, [IN, #(0*8+4)*4] | ||||
|         vldr    s18, [IN, #(0*8+5)*4] | ||||
|         vldr    s20, [IN, #(0*8+6)*4] | ||||
|         vldr    s22, [IN, #(0*8+7)*4] | ||||
|         vneg.f  s8, s8 | ||||
|         vldr    s9,  zero | ||||
|         vldr    s11, zero | ||||
|         vldr    s13, zero | ||||
|         vldr    s15, zero | ||||
|         vneg.f  s16, s16 | ||||
|         vldr    s17, zero | ||||
|         vldr    s19, zero | ||||
|         vldr    s21, zero | ||||
|         vldr    s23, zero | ||||
|         vstr    d4,  [BUF, #(0*32+0)*4] | ||||
|         vstr    d5,  [BUF, #(1*32+0)*4] | ||||
|         vstr    d6,  [BUF, #(2*32+0)*4] | ||||
|         vstr    d7,  [BUF, #(3*32+0)*4] | ||||
|         vstr    d8,  [BUF, #(4*32+0)*4] | ||||
|         vstr    d9,  [BUF, #(5*32+0)*4] | ||||
|         vstr    d10, [BUF, #(6*32+0)*4] | ||||
|         vstr    d11, [BUF, #(7*32+0)*4] | ||||
|         add     BUF, BUF, #2*4 | ||||
|         sub     COUNT, COUNT, #1 | ||||
|         b       4f | ||||
| 3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
 | ||||
|         vldr    s8,  [IN, #(0*8+0)*4] | ||||
|         vldr    s10, [IN, #(0*8+1)*4] | ||||
|         vldr    s12, [IN, #(0*8+2)*4] | ||||
|         vldr    s14, [IN, #(0*8+3)*4] | ||||
|         vldr    s16, [IN, #(0*8+4)*4] | ||||
|         vldr    s18, [IN, #(0*8+5)*4] | ||||
|         vldr    s20, [IN, #(0*8+6)*4] | ||||
|         vldr    s22, [IN, #(0*8+7)*4] | ||||
|         vneg.f  s8, s8 | ||||
|         vldr    s9,  [IN, #(1*8+0)*4] | ||||
|         vldr    s11, [IN, #(1*8+1)*4] | ||||
|         vldr    s13, [IN, #(1*8+2)*4] | ||||
|         vldr    s15, [IN, #(1*8+3)*4] | ||||
|         vneg.f  s16, s16 | ||||
|         vldr    s17, [IN, #(1*8+4)*4] | ||||
|         vldr    s19, [IN, #(1*8+5)*4] | ||||
|         vldr    s21, [IN, #(1*8+6)*4] | ||||
|         vldr    s23, [IN, #(1*8+7)*4] | ||||
|         vstr    d4,  [BUF, #(0*32+0)*4] | ||||
|         vstr    d5,  [BUF, #(1*32+0)*4] | ||||
|         vstr    d6,  [BUF, #(2*32+0)*4] | ||||
|         vstr    d7,  [BUF, #(3*32+0)*4] | ||||
|         vstr    d8,  [BUF, #(4*32+0)*4] | ||||
|         vstr    d9,  [BUF, #(5*32+0)*4] | ||||
|         vstr    d10, [BUF, #(6*32+0)*4] | ||||
|         vstr    d11, [BUF, #(7*32+0)*4] | ||||
|         add     BUF, BUF, #2*4 | ||||
|         sub     COUNT, COUNT, #(2 << 5) + 1 | ||||
|         bics    lr, COUNT, #0x1F | ||||
|         bne     4f | ||||
|         @ sb_act was n*4+3
 | ||||
|         vldr    s8,  [IN, #(2*8+0)*4] | ||||
|         vldr    s10, [IN, #(2*8+1)*4] | ||||
|         vldr    s12, [IN, #(2*8+2)*4] | ||||
|         vldr    s14, [IN, #(2*8+3)*4] | ||||
|         vldr    s16, [IN, #(2*8+4)*4] | ||||
|         vldr    s18, [IN, #(2*8+5)*4] | ||||
|         vldr    s20, [IN, #(2*8+6)*4] | ||||
|         vldr    s22, [IN, #(2*8+7)*4] | ||||
|         vldr    s9,  zero | ||||
|         vldr    s11, zero | ||||
|         vldr    s13, zero | ||||
|         vldr    s15, zero | ||||
|         vldr    s17, zero | ||||
|         vldr    s19, zero | ||||
|         vldr    s21, zero | ||||
|         vldr    s23, zero | ||||
|         vstr    d4,  [BUF, #(0*32+0)*4] | ||||
|         vstr    d5,  [BUF, #(1*32+0)*4] | ||||
|         vstr    d6,  [BUF, #(2*32+0)*4] | ||||
|         vstr    d7,  [BUF, #(3*32+0)*4] | ||||
|         vstr    d8,  [BUF, #(4*32+0)*4] | ||||
|         vstr    d9,  [BUF, #(5*32+0)*4] | ||||
|         vstr    d10, [BUF, #(6*32+0)*4] | ||||
|         vstr    d11, [BUF, #(7*32+0)*4] | ||||
|         add     BUF, BUF, #2*4 | ||||
|         sub     COUNT, COUNT, #1 | ||||
| 4:      @ Now fill the remainder with 0
 | ||||
|         vldr    s8, zero | ||||
|         vldr    s9, zero | ||||
|         ands    COUNT, COUNT, #0x1F | ||||
|         beq     6f | ||||
| 5:      vstr    d4, [BUF, #(0*32+0)*4] | ||||
|         vstr    d4, [BUF, #(1*32+0)*4] | ||||
|         vstr    d4, [BUF, #(2*32+0)*4] | ||||
|         vstr    d4, [BUF, #(3*32+0)*4] | ||||
|         vstr    d4, [BUF, #(4*32+0)*4] | ||||
|         vstr    d4, [BUF, #(5*32+0)*4] | ||||
|         vstr    d4, [BUF, #(6*32+0)*4] | ||||
|         vstr    d4, [BUF, #(7*32+0)*4] | ||||
|         add     BUF, BUF, #2*4 | ||||
|         subs    COUNT, COUNT, #1 | ||||
|         bne     5b | ||||
| 6: | ||||
|         fmxr    FPSCR, OLDFPSCR | ||||
|         ldr     WINDOW, [fp, #3*4] | ||||
|         ldr     OUT, [fp, #4*4] | ||||
|         sub     BUF, BUF, #32*4 | ||||
| NOVFP   ldr     SCALEINT, [fp, #6*4] | ||||
|         mov     COUNT, #8 | ||||
| VFP     vpush   {SCALE} | ||||
| VFP     sub     sp, sp, #3*4 | ||||
| NOVFP   sub     sp, sp, #4*4 | ||||
| 7: | ||||
| VFP     ldr     a1, [fp, #-7*4]     @ imdct
 | ||||
| NOVFP   ldr     a1, [fp, #-8*4] | ||||
|         ldmia   fp, {a2-a4} | ||||
| VFP     stmia   sp, {WINDOW, OUT, BUF} | ||||
| NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT} | ||||
| VFP     vldr    SCALE, [sp, #3*4] | ||||
|         bl      ff_synth_filter_float_vfp | ||||
|         add     OUT, OUT, #32*4 | ||||
|         add     BUF, BUF, #32*4 | ||||
|         subs    COUNT, COUNT, #1 | ||||
|         bne     7b | ||||
| 
 | ||||
| A       sub     sp, fp, #(8+8)*4 | ||||
| T       sub     fp, fp, #(8+8)*4 | ||||
| T       mov     sp, fp | ||||
|         vpop    {s16-s23} | ||||
| VFP     pop     {a3-a4,v1-v3,v5,fp,pc} | ||||
| NOVFP   pop     {a4,v1-v5,fp,pc} | ||||
| endfunc | ||||
| 
 | ||||
|         .unreq  IN
 | ||||
|         .unreq  SBACT
 | ||||
|         .unreq  OLDFPSCR
 | ||||
|         .unreq  IMDCT
 | ||||
|         .unreq  WINDOW
 | ||||
|         .unreq  OUT
 | ||||
|         .unreq  BUF
 | ||||
|         .unreq  SCALEINT
 | ||||
|         .unreq  COUNT
 | ||||
| 
 | ||||
|         .unreq  SCALE
 | ||||
| 
 | ||||
|         .align 2
 | ||||
| zero:   .word   0 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user