aarch64: Reindent all assembly to 8/24 column indentation
libavcodec/aarch64/vc1dsp_neon.S is skipped here, as it intentionally uses a layered indentation style to visually show how different unrolled/interleaved phases fit together.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
		
							parent
							
								
									cada4597ca
								
							
						
					
					
						commit
						a76b409dd0
					
				| @ -19,130 +19,130 @@ | |||||||
| #include "libavutil/aarch64/asm.S" | #include "libavutil/aarch64/asm.S" | ||||||
| 
 | 
 | ||||||
| function ff_ps_add_squares_neon, export=1 | function ff_ps_add_squares_neon, export=1 | ||||||
| 1:      ld1         {v0.4s,v1.4s}, [x1], #32 | 1:      ld1             {v0.4s,v1.4s}, [x1], #32 | ||||||
|         fmul        v0.4s, v0.4s, v0.4s |         fmul            v0.4s, v0.4s, v0.4s | ||||||
|         fmul        v1.4s, v1.4s, v1.4s |         fmul            v1.4s, v1.4s, v1.4s | ||||||
|         faddp       v2.4s, v0.4s, v1.4s |         faddp           v2.4s, v0.4s, v1.4s | ||||||
|         ld1         {v3.4s}, [x0] |         ld1             {v3.4s}, [x0] | ||||||
|         fadd        v3.4s, v3.4s, v2.4s |         fadd            v3.4s, v3.4s, v2.4s | ||||||
|         st1         {v3.4s}, [x0], #16 |         st1             {v3.4s}, [x0], #16 | ||||||
|         subs        w2, w2, #4 |         subs            w2, w2, #4 | ||||||
|         b.gt        1b |         b.gt            1b | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
| 
 | 
 | ||||||
| function ff_ps_mul_pair_single_neon, export=1 | function ff_ps_mul_pair_single_neon, export=1 | ||||||
| 1:      ld1         {v0.4s,v1.4s}, [x1], #32 | 1:      ld1             {v0.4s,v1.4s}, [x1], #32 | ||||||
|         ld1         {v2.4s},       [x2], #16 |         ld1             {v2.4s},       [x2], #16 | ||||||
|         zip1        v3.4s, v2.4s, v2.4s |         zip1            v3.4s, v2.4s, v2.4s | ||||||
|         zip2        v4.4s, v2.4s, v2.4s |         zip2            v4.4s, v2.4s, v2.4s | ||||||
|         fmul        v0.4s, v0.4s, v3.4s |         fmul            v0.4s, v0.4s, v3.4s | ||||||
|         fmul        v1.4s, v1.4s, v4.4s |         fmul            v1.4s, v1.4s, v4.4s | ||||||
|         st1         {v0.4s,v1.4s}, [x0], #32 |         st1             {v0.4s,v1.4s}, [x0], #32 | ||||||
|         subs        w3, w3, #4 |         subs            w3, w3, #4 | ||||||
|         b.gt        1b |         b.gt            1b | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
| 
 | 
 | ||||||
| function ff_ps_stereo_interpolate_neon, export=1 | function ff_ps_stereo_interpolate_neon, export=1 | ||||||
|         ld1         {v0.4s}, [x2] |         ld1             {v0.4s}, [x2] | ||||||
|         ld1         {v1.4s}, [x3] |         ld1             {v1.4s}, [x3] | ||||||
|         zip1        v4.4s, v0.4s, v0.4s |         zip1            v4.4s, v0.4s, v0.4s | ||||||
|         zip2        v5.4s, v0.4s, v0.4s |         zip2            v5.4s, v0.4s, v0.4s | ||||||
|         zip1        v6.4s, v1.4s, v1.4s |         zip1            v6.4s, v1.4s, v1.4s | ||||||
|         zip2        v7.4s, v1.4s, v1.4s |         zip2            v7.4s, v1.4s, v1.4s | ||||||
| 1:      ld1         {v2.2s}, [x0] | 1:      ld1             {v2.2s}, [x0] | ||||||
|         ld1         {v3.2s}, [x1] |         ld1             {v3.2s}, [x1] | ||||||
|         fadd        v4.4s, v4.4s, v6.4s |         fadd            v4.4s, v4.4s, v6.4s | ||||||
|         fadd        v5.4s, v5.4s, v7.4s |         fadd            v5.4s, v5.4s, v7.4s | ||||||
|         mov         v2.d[1], v2.d[0] |         mov             v2.d[1], v2.d[0] | ||||||
|         mov         v3.d[1], v3.d[0] |         mov             v3.d[1], v3.d[0] | ||||||
|         fmul        v2.4s, v2.4s, v4.4s |         fmul            v2.4s, v2.4s, v4.4s | ||||||
|         fmla        v2.4s, v3.4s, v5.4s |         fmla            v2.4s, v3.4s, v5.4s | ||||||
|         st1         {v2.d}[0], [x0], #8 |         st1             {v2.d}[0], [x0], #8 | ||||||
|         st1         {v2.d}[1], [x1], #8 |         st1             {v2.d}[1], [x1], #8 | ||||||
|         subs        w4, w4, #1 |         subs            w4, w4, #1 | ||||||
|         b.gt        1b |         b.gt            1b | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
| 
 | 
 | ||||||
| function ff_ps_stereo_interpolate_ipdopd_neon, export=1 | function ff_ps_stereo_interpolate_ipdopd_neon, export=1 | ||||||
|         ld1         {v0.4s,v1.4s}, [x2] |         ld1             {v0.4s,v1.4s}, [x2] | ||||||
|         ld1         {v6.4s,v7.4s}, [x3] |         ld1             {v6.4s,v7.4s}, [x3] | ||||||
|         fneg        v2.4s, v1.4s |         fneg            v2.4s, v1.4s | ||||||
|         fneg        v3.4s, v7.4s |         fneg            v3.4s, v7.4s | ||||||
|         zip1        v16.4s, v0.4s, v0.4s |         zip1            v16.4s, v0.4s, v0.4s | ||||||
|         zip2        v17.4s, v0.4s, v0.4s |         zip2            v17.4s, v0.4s, v0.4s | ||||||
|         zip1        v18.4s, v2.4s, v1.4s |         zip1            v18.4s, v2.4s, v1.4s | ||||||
|         zip2        v19.4s, v2.4s, v1.4s |         zip2            v19.4s, v2.4s, v1.4s | ||||||
|         zip1        v20.4s, v6.4s, v6.4s |         zip1            v20.4s, v6.4s, v6.4s | ||||||
|         zip2        v21.4s, v6.4s, v6.4s |         zip2            v21.4s, v6.4s, v6.4s | ||||||
|         zip1        v22.4s, v3.4s, v7.4s |         zip1            v22.4s, v3.4s, v7.4s | ||||||
|         zip2        v23.4s, v3.4s, v7.4s |         zip2            v23.4s, v3.4s, v7.4s | ||||||
| 1:      ld1         {v2.2s}, [x0] | 1:      ld1             {v2.2s}, [x0] | ||||||
|         ld1         {v3.2s}, [x1] |         ld1             {v3.2s}, [x1] | ||||||
|         fadd        v16.4s, v16.4s, v20.4s |         fadd            v16.4s, v16.4s, v20.4s | ||||||
|         fadd        v17.4s, v17.4s, v21.4s |         fadd            v17.4s, v17.4s, v21.4s | ||||||
|         mov         v2.d[1], v2.d[0] |         mov             v2.d[1], v2.d[0] | ||||||
|         mov         v3.d[1], v3.d[0] |         mov             v3.d[1], v3.d[0] | ||||||
|         fmul        v4.4s, v2.4s, v16.4s |         fmul            v4.4s, v2.4s, v16.4s | ||||||
|         fmla        v4.4s, v3.4s, v17.4s |         fmla            v4.4s, v3.4s, v17.4s | ||||||
|         fadd        v18.4s, v18.4s, v22.4s |         fadd            v18.4s, v18.4s, v22.4s | ||||||
|         fadd        v19.4s, v19.4s, v23.4s |         fadd            v19.4s, v19.4s, v23.4s | ||||||
|         ext         v2.16b, v2.16b, v2.16b, #4 |         ext             v2.16b, v2.16b, v2.16b, #4 | ||||||
|         ext         v3.16b, v3.16b, v3.16b, #4 |         ext             v3.16b, v3.16b, v3.16b, #4 | ||||||
|         fmla        v4.4s, v2.4s, v18.4s |         fmla            v4.4s, v2.4s, v18.4s | ||||||
|         fmla        v4.4s, v3.4s, v19.4s |         fmla            v4.4s, v3.4s, v19.4s | ||||||
|         st1         {v4.d}[0], [x0], #8 |         st1             {v4.d}[0], [x0], #8 | ||||||
|         st1         {v4.d}[1], [x1], #8 |         st1             {v4.d}[1], [x1], #8 | ||||||
|         subs        w4, w4, #1 |         subs            w4, w4, #1 | ||||||
|         b.gt        1b |         b.gt            1b | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
| 
 | 
 | ||||||
| function ff_ps_hybrid_analysis_neon, export=1 | function ff_ps_hybrid_analysis_neon, export=1 | ||||||
|         lsl         x3, x3, #3 |         lsl             x3, x3, #3 | ||||||
|         ld2         {v0.4s,v1.4s}, [x1], #32 |         ld2             {v0.4s,v1.4s}, [x1], #32 | ||||||
|         ld2         {v2.2s,v3.2s}, [x1], #16 |         ld2             {v2.2s,v3.2s}, [x1], #16 | ||||||
|         ld1         {v24.2s},      [x1], #8 |         ld1             {v24.2s},      [x1], #8 | ||||||
|         ld2         {v4.2s,v5.2s}, [x1], #16 |         ld2             {v4.2s,v5.2s}, [x1], #16 | ||||||
|         ld2         {v6.4s,v7.4s}, [x1] |         ld2             {v6.4s,v7.4s}, [x1] | ||||||
|         rev64       v6.4s, v6.4s |         rev64           v6.4s, v6.4s | ||||||
|         rev64       v7.4s, v7.4s |         rev64           v7.4s, v7.4s | ||||||
|         ext         v6.16b, v6.16b, v6.16b, #8 |         ext             v6.16b, v6.16b, v6.16b, #8 | ||||||
|         ext         v7.16b, v7.16b, v7.16b, #8 |         ext             v7.16b, v7.16b, v7.16b, #8 | ||||||
|         rev64       v4.2s, v4.2s |         rev64           v4.2s, v4.2s | ||||||
|         rev64       v5.2s, v5.2s |         rev64           v5.2s, v5.2s | ||||||
|         mov         v2.d[1], v3.d[0] |         mov             v2.d[1], v3.d[0] | ||||||
|         mov         v4.d[1], v5.d[0] |         mov             v4.d[1], v5.d[0] | ||||||
|         mov         v5.d[1], v2.d[0] |         mov             v5.d[1], v2.d[0] | ||||||
|         mov         v3.d[1], v4.d[0] |         mov             v3.d[1], v4.d[0] | ||||||
|         fadd        v16.4s, v0.4s, v6.4s |         fadd            v16.4s, v0.4s, v6.4s | ||||||
|         fadd        v17.4s, v1.4s, v7.4s |         fadd            v17.4s, v1.4s, v7.4s | ||||||
|         fsub        v18.4s, v1.4s, v7.4s |         fsub            v18.4s, v1.4s, v7.4s | ||||||
|         fsub        v19.4s, v0.4s, v6.4s |         fsub            v19.4s, v0.4s, v6.4s | ||||||
|         fadd        v22.4s, v2.4s, v4.4s |         fadd            v22.4s, v2.4s, v4.4s | ||||||
|         fsub        v23.4s, v5.4s, v3.4s |         fsub            v23.4s, v5.4s, v3.4s | ||||||
|         trn1        v20.2d, v22.2d, v23.2d      // {re4+re8, re5+re7, im8-im4, im7-im5} |         trn1            v20.2d, v22.2d, v23.2d      // {re4+re8, re5+re7, im8-im4, im7-im5} | ||||||
|         trn2        v21.2d, v22.2d, v23.2d      // {im4+im8, im5+im7, re4-re8, re5-re7} |         trn2            v21.2d, v22.2d, v23.2d      // {im4+im8, im5+im7, re4-re8, re5-re7} | ||||||
| 1:      ld2         {v2.4s,v3.4s}, [x2], #32 | 1:      ld2             {v2.4s,v3.4s}, [x2], #32 | ||||||
|         ld2         {v4.2s,v5.2s}, [x2], #16 |         ld2             {v4.2s,v5.2s}, [x2], #16 | ||||||
|         ld1         {v6.2s},       [x2], #8 |         ld1             {v6.2s},       [x2], #8 | ||||||
|         add         x2, x2, #8 |         add             x2, x2, #8 | ||||||
|         mov         v4.d[1], v5.d[0] |         mov             v4.d[1], v5.d[0] | ||||||
|         mov         v6.s[1], v6.s[0] |         mov             v6.s[1], v6.s[0] | ||||||
|         fmul        v6.2s, v6.2s, v24.2s |         fmul            v6.2s, v6.2s, v24.2s | ||||||
|         fmul        v0.4s, v2.4s, v16.4s |         fmul            v0.4s, v2.4s, v16.4s | ||||||
|         fmul        v1.4s, v2.4s, v17.4s |         fmul            v1.4s, v2.4s, v17.4s | ||||||
|         fmls        v0.4s, v3.4s, v18.4s |         fmls            v0.4s, v3.4s, v18.4s | ||||||
|         fmla        v1.4s, v3.4s, v19.4s |         fmla            v1.4s, v3.4s, v19.4s | ||||||
|         fmla        v0.4s, v4.4s, v20.4s |         fmla            v0.4s, v4.4s, v20.4s | ||||||
|         fmla        v1.4s, v4.4s, v21.4s |         fmla            v1.4s, v4.4s, v21.4s | ||||||
|         faddp       v0.4s, v0.4s, v1.4s |         faddp           v0.4s, v0.4s, v1.4s | ||||||
|         faddp       v0.4s, v0.4s, v0.4s |         faddp           v0.4s, v0.4s, v0.4s | ||||||
|         fadd        v0.2s, v0.2s, v6.2s |         fadd            v0.2s, v0.2s, v6.2s | ||||||
|         st1         {v0.2s}, [x0], x3 |         st1             {v0.2s}, [x0], x3 | ||||||
|         subs        w4, w4, #1 |         subs            w4, w4, #1 | ||||||
|         b.gt        1b |         b.gt            1b | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
|  | |||||||
| @ -33,81 +33,81 @@ const tab_x2, align=4 | |||||||
| endconst | endconst | ||||||
| 
 | 
 | ||||||
| function ff_opus_deemphasis_neon, export=1 | function ff_opus_deemphasis_neon, export=1 | ||||||
|         movrel  x4, tab_st |         movrel          x4, tab_st | ||||||
|         ld1     {v4.4s}, [x4] |         ld1             {v4.4s}, [x4] | ||||||
|         movrel  x4, tab_x0 |         movrel          x4, tab_x0 | ||||||
|         ld1     {v5.4s}, [x4] |         ld1             {v5.4s}, [x4] | ||||||
|         movrel  x4, tab_x1 |         movrel          x4, tab_x1 | ||||||
|         ld1     {v6.4s}, [x4] |         ld1             {v6.4s}, [x4] | ||||||
|         movrel  x4, tab_x2 |         movrel          x4, tab_x2 | ||||||
|         ld1     {v7.4s}, [x4] |         ld1             {v7.4s}, [x4] | ||||||
| 
 | 
 | ||||||
|         fmul v0.4s, v4.4s, v0.s[0] |         fmul            v0.4s, v4.4s, v0.s[0] | ||||||
| 
 | 
 | ||||||
| 1:      ld1  {v1.4s, v2.4s}, [x1], #32 | 1:      ld1             {v1.4s, v2.4s}, [x1], #32 | ||||||
| 
 | 
 | ||||||
|         fmla v0.4s, v5.4s, v1.s[0] |         fmla            v0.4s, v5.4s, v1.s[0] | ||||||
|         fmul v3.4s, v7.4s, v2.s[2] |         fmul            v3.4s, v7.4s, v2.s[2] | ||||||
| 
 | 
 | ||||||
|         fmla v0.4s, v6.4s, v1.s[1] |         fmla            v0.4s, v6.4s, v1.s[1] | ||||||
|         fmla v3.4s, v6.4s, v2.s[1] |         fmla            v3.4s, v6.4s, v2.s[1] | ||||||
| 
 | 
 | ||||||
|         fmla v0.4s, v7.4s, v1.s[2] |         fmla            v0.4s, v7.4s, v1.s[2] | ||||||
|         fmla v3.4s, v5.4s, v2.s[0] |         fmla            v3.4s, v5.4s, v2.s[0] | ||||||
| 
 | 
 | ||||||
|         fadd v1.4s, v1.4s, v0.4s |         fadd            v1.4s, v1.4s, v0.4s | ||||||
|         fadd v2.4s, v2.4s, v3.4s |         fadd            v2.4s, v2.4s, v3.4s | ||||||
| 
 | 
 | ||||||
|         fmla v2.4s, v4.4s, v1.s[3] |         fmla            v2.4s, v4.4s, v1.s[3] | ||||||
| 
 | 
 | ||||||
|         st1  {v1.4s, v2.4s}, [x0], #32 |         st1             {v1.4s, v2.4s}, [x0], #32 | ||||||
|         fmul v0.4s, v4.4s, v2.s[3] |         fmul            v0.4s, v4.4s, v2.s[3] | ||||||
| 
 | 
 | ||||||
|         subs w2, w2, #8 |         subs            w2, w2, #8 | ||||||
|         b.gt 1b |         b.gt            1b | ||||||
| 
 | 
 | ||||||
|         mov s0, v2.s[3] |         mov             s0, v2.s[3] | ||||||
| 
 | 
 | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
| 
 | 
 | ||||||
| function ff_opus_postfilter_neon, export=1 | function ff_opus_postfilter_neon, export=1 | ||||||
|         ld1 {v0.4s}, [x2] |         ld1             {v0.4s}, [x2] | ||||||
|         dup v1.4s, v0.s[1] |         dup             v1.4s, v0.s[1] | ||||||
|         dup v2.4s, v0.s[2] |         dup             v2.4s, v0.s[2] | ||||||
|         dup v0.4s, v0.s[0] |         dup             v0.4s, v0.s[0] | ||||||
| 
 | 
 | ||||||
|         add w1, w1, #2 |         add             w1, w1, #2 | ||||||
|         sub x1, x0, x1, lsl #2 |         sub             x1, x0, x1, lsl #2 | ||||||
| 
 | 
 | ||||||
|         ld1 {v3.4s}, [x1] |         ld1             {v3.4s}, [x1] | ||||||
|         fmul v3.4s, v3.4s, v2.4s |         fmul            v3.4s, v3.4s, v2.4s | ||||||
| 
 | 
 | ||||||
| 1:      add x1, x1, #4 | 1:      add             x1, x1, #4 | ||||||
|         ld1 {v4.4s}, [x1] |         ld1             {v4.4s}, [x1] | ||||||
|         add x1, x1, #4 |         add             x1, x1, #4 | ||||||
|         ld1 {v5.4s}, [x1] |         ld1             {v5.4s}, [x1] | ||||||
|         add x1, x1, #4 |         add             x1, x1, #4 | ||||||
|         ld1 {v6.4s}, [x1] |         ld1             {v6.4s}, [x1] | ||||||
|         add x1, x1, #4 |         add             x1, x1, #4 | ||||||
|         ld1 {v7.4s}, [x1] |         ld1             {v7.4s}, [x1] | ||||||
| 
 | 
 | ||||||
|         fmla v3.4s, v7.4s, v2.4s |         fmla            v3.4s, v7.4s, v2.4s | ||||||
|         fadd v6.4s, v6.4s, v4.4s |         fadd            v6.4s, v6.4s, v4.4s | ||||||
| 
 | 
 | ||||||
|         ld1 {v4.4s}, [x0] |         ld1             {v4.4s}, [x0] | ||||||
|         fmla v4.4s, v5.4s, v0.4s |         fmla            v4.4s, v5.4s, v0.4s | ||||||
| 
 | 
 | ||||||
|         fmul v6.4s, v6.4s, v1.4s |         fmul            v6.4s, v6.4s, v1.4s | ||||||
|         fadd v6.4s, v6.4s, v3.4s |         fadd            v6.4s, v6.4s, v3.4s | ||||||
| 
 | 
 | ||||||
|         fadd v4.4s, v4.4s, v6.4s |         fadd            v4.4s, v4.4s, v6.4s | ||||||
|         fmul v3.4s, v7.4s, v2.4s |         fmul            v3.4s, v7.4s, v2.4s | ||||||
| 
 | 
 | ||||||
|         st1  {v4.4s}, [x0], #16 |         st1             {v4.4s}, [x0], #16 | ||||||
| 
 | 
 | ||||||
|         subs w3, w3, #4 |         subs            w3, w3, #4 | ||||||
|         b.gt 1b |         b.gt            1b | ||||||
| 
 | 
 | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
|  | |||||||
| @ -21,57 +21,57 @@ | |||||||
| #include "libavutil/aarch64/asm.S" | #include "libavutil/aarch64/asm.S" | ||||||
| 
 | 
 | ||||||
| function ff_resample_common_apply_filter_x4_float_neon, export=1 | function ff_resample_common_apply_filter_x4_float_neon, export=1 | ||||||
|     movi                v0.4s, #0                                      // accumulator |         movi            v0.4s, #0                                      // accumulator | ||||||
| 1:  ld1                 {v1.4s}, [x1], #16                             // src[0..3] | 1:      ld1             {v1.4s}, [x1], #16                             // src[0..3] | ||||||
|     ld1                 {v2.4s}, [x2], #16                             // filter[0..3] |         ld1             {v2.4s}, [x2], #16                             // filter[0..3] | ||||||
|     fmla                v0.4s, v1.4s, v2.4s                            // accumulator += src[0..3] * filter[0..3] |         fmla            v0.4s, v1.4s, v2.4s                            // accumulator += src[0..3] * filter[0..3] | ||||||
|     subs                w3, w3, #4                                     // filter_length -= 4 |         subs            w3, w3, #4                                     // filter_length -= 4 | ||||||
|     b.gt                1b                                             // loop until filter_length |         b.gt            1b                                             // loop until filter_length | ||||||
|     faddp               v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values |         faddp           v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values | ||||||
|     faddp               v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values |         faddp           v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values | ||||||
|     st1                 {v0.s}[0], [x0], #4                            // write accumulator |         st1             {v0.s}[0], [x0], #4                            // write accumulator | ||||||
|     ret |     ret | ||||||
| endfunc | endfunc | ||||||
| 
 | 
 | ||||||
| function ff_resample_common_apply_filter_x8_float_neon, export=1 | function ff_resample_common_apply_filter_x8_float_neon, export=1 | ||||||
|     movi                v0.4s, #0                                      // accumulator |         movi            v0.4s, #0                                      // accumulator | ||||||
| 1:  ld1                 {v1.4s}, [x1], #16                             // src[0..3] | 1:      ld1             {v1.4s}, [x1], #16                             // src[0..3] | ||||||
|     ld1                 {v2.4s}, [x2], #16                             // filter[0..3] |         ld1             {v2.4s}, [x2], #16                             // filter[0..3] | ||||||
|     ld1                 {v3.4s}, [x1], #16                             // src[4..7] |         ld1             {v3.4s}, [x1], #16                             // src[4..7] | ||||||
|     ld1                 {v4.4s}, [x2], #16                             // filter[4..7] |         ld1             {v4.4s}, [x2], #16                             // filter[4..7] | ||||||
|     fmla                v0.4s, v1.4s, v2.4s                            // accumulator += src[0..3] * filter[0..3] |         fmla            v0.4s, v1.4s, v2.4s                            // accumulator += src[0..3] * filter[0..3] | ||||||
|     fmla                v0.4s, v3.4s, v4.4s                            // accumulator += src[4..7] * filter[4..7] |         fmla            v0.4s, v3.4s, v4.4s                            // accumulator += src[4..7] * filter[4..7] | ||||||
|     subs                w3, w3, #8                                     // filter_length -= 8 |         subs            w3, w3, #8                                     // filter_length -= 8 | ||||||
|     b.gt                1b                                             // loop until filter_length |         b.gt            1b                                             // loop until filter_length | ||||||
|     faddp               v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values |         faddp           v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values | ||||||
|     faddp               v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values |         faddp           v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values | ||||||
|     st1                 {v0.s}[0], [x0], #4                            // write accumulator |         st1             {v0.s}[0], [x0], #4                            // write accumulator | ||||||
|     ret |     ret | ||||||
| endfunc | endfunc | ||||||
| 
 | 
 | ||||||
| function ff_resample_common_apply_filter_x4_s16_neon, export=1 | function ff_resample_common_apply_filter_x4_s16_neon, export=1 | ||||||
|     movi                v0.4s, #0                                      // accumulator |         movi            v0.4s, #0                                      // accumulator | ||||||
| 1:  ld1                 {v1.4h}, [x1], #8                              // src[0..3] | 1:      ld1             {v1.4h}, [x1], #8                              // src[0..3] | ||||||
|     ld1                 {v2.4h}, [x2], #8                              // filter[0..3] |         ld1             {v2.4h}, [x2], #8                              // filter[0..3] | ||||||
|     smlal               v0.4s, v1.4h, v2.4h                            // accumulator += src[0..3] * filter[0..3] |         smlal           v0.4s, v1.4h, v2.4h                            // accumulator += src[0..3] * filter[0..3] | ||||||
|     subs                w3, w3, #4                                     // filter_length -= 4 |         subs            w3, w3, #4                                     // filter_length -= 4 | ||||||
|     b.gt                1b                                             // loop until filter_length |         b.gt            1b                                             // loop until filter_length | ||||||
|     addp                v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values |         addp            v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values | ||||||
|     addp                v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values |         addp            v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values | ||||||
|     st1                 {v0.s}[0], [x0], #4                            // write accumulator |         st1             {v0.s}[0], [x0], #4                            // write accumulator | ||||||
|     ret |     ret | ||||||
| endfunc | endfunc | ||||||
| 
 | 
 | ||||||
| function ff_resample_common_apply_filter_x8_s16_neon, export=1 | function ff_resample_common_apply_filter_x8_s16_neon, export=1 | ||||||
|     movi                v0.4s, #0                                      // accumulator |         movi            v0.4s, #0                                      // accumulator | ||||||
| 1:  ld1                 {v1.8h}, [x1], #16                             // src[0..7] | 1:      ld1             {v1.8h}, [x1], #16                             // src[0..7] | ||||||
|     ld1                 {v2.8h}, [x2], #16                             // filter[0..7] |         ld1             {v2.8h}, [x2], #16                             // filter[0..7] | ||||||
|     smlal               v0.4s, v1.4h, v2.4h                            // accumulator += src[0..3] * filter[0..3] |         smlal           v0.4s, v1.4h, v2.4h                            // accumulator += src[0..3] * filter[0..3] | ||||||
|     smlal2              v0.4s, v1.8h, v2.8h                            // accumulator += src[4..7] * filter[4..7] |         smlal2          v0.4s, v1.8h, v2.8h                            // accumulator += src[4..7] * filter[4..7] | ||||||
|     subs                w3, w3, #8                                     // filter_length -= 8 |         subs            w3, w3, #8                                     // filter_length -= 8 | ||||||
|     b.gt                1b                                             // loop until filter_length |         b.gt            1b                                             // loop until filter_length | ||||||
|     addp                v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values |         addp            v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values | ||||||
|     addp                v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values |         addp            v0.4s, v0.4s, v0.4s                            // pair adding of the 4x32-bit accumulated values | ||||||
|     st1                 {v0.s}[0], [x0], #4                            // write accumulator |         st1             {v0.s}[0], [x0], #4                            // write accumulator | ||||||
|     ret |     ret | ||||||
| endfunc | endfunc | ||||||
|  | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -29,178 +29,178 @@ function ff_yuv2planeX_8_neon, export=1 | |||||||
| // x5 - const uint8_t *dither, | // x5 - const uint8_t *dither, | ||||||
| // w6 - int offset | // w6 - int offset | ||||||
| 
 | 
 | ||||||
|         ld1                 {v0.8b}, [x5]                   // load 8x8-bit dither |         ld1             {v0.8b}, [x5]                   // load 8x8-bit dither | ||||||
|         and                 w6, w6, #7 |         and             w6, w6, #7 | ||||||
|         cbz                 w6, 1f                          // check if offsetting present |         cbz             w6, 1f                          // check if offsetting present | ||||||
|         ext                 v0.8b, v0.8b, v0.8b, #3         // honor offsetting which can be 0 or 3 only |         ext             v0.8b, v0.8b, v0.8b, #3         // honor offsetting which can be 0 or 3 only | ||||||
| 1:      uxtl                v0.8h, v0.8b                    // extend dither to 16-bit | 1:      uxtl            v0.8h, v0.8b                    // extend dither to 16-bit | ||||||
|         ushll               v1.4s, v0.4h, #12               // extend dither to 32-bit with left shift by 12 (part 1) |         ushll           v1.4s, v0.4h, #12               // extend dither to 32-bit with left shift by 12 (part 1) | ||||||
|         ushll2              v2.4s, v0.8h, #12               // extend dither to 32-bit with left shift by 12 (part 2) |         ushll2          v2.4s, v0.8h, #12               // extend dither to 32-bit with left shift by 12 (part 2) | ||||||
|         cmp                 w1, #8                          // if filterSize == 8, branch to specialized version |         cmp             w1, #8                          // if filterSize == 8, branch to specialized version | ||||||
|         b.eq                6f |         b.eq            6f | ||||||
|         cmp                 w1, #4                          // if filterSize == 4, branch to specialized version |         cmp             w1, #4                          // if filterSize == 4, branch to specialized version | ||||||
|         b.eq                8f |         b.eq            8f | ||||||
|         cmp                 w1, #2                          // if filterSize == 2, branch to specialized version |         cmp             w1, #2                          // if filterSize == 2, branch to specialized version | ||||||
|         b.eq                10f |         b.eq            10f | ||||||
| 
 | 
 | ||||||
| // The filter size does not match any of the specialized implementations. It is either even or odd. If it is even | // The filter size does not match any of the specialized implementations. It is either even or odd. If it is even | ||||||
| // then use the first section below. | // then use the first section below. | ||||||
|         mov                 x7, #0                          // i = 0 |         mov             x7, #0                          // i = 0 | ||||||
|         tbnz                w1, #0, 4f                      // if filterSize % 2 != 0 branch to specialized version |         tbnz            w1, #0, 4f                      // if filterSize % 2 != 0 branch to specialized version | ||||||
| // fs % 2 == 0 | // fs % 2 == 0 | ||||||
| 2:      mov                 v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value | 2:      mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value | ||||||
|         mov                 v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value |         mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value | ||||||
|         mov                 w8, w1                          // tmpfilterSize = filterSize |         mov             w8, w1                          // tmpfilterSize = filterSize | ||||||
|         mov                 x9, x2                          // srcp    = src |         mov             x9, x2                          // srcp    = src | ||||||
|         mov                 x10, x0                         // filterp = filter |         mov             x10, x0                         // filterp = filter | ||||||
| 3:      ldp                 x11, x12, [x9], #16             // get 2 pointers: src[j] and src[j+1] | 3:      ldp             x11, x12, [x9], #16             // get 2 pointers: src[j] and src[j+1] | ||||||
|         ldr                 s7, [x10], #4                   // read 2x16-bit coeff X and Y at filter[j] and filter[j+1] |         ldr             s7, [x10], #4                   // read 2x16-bit coeff X and Y at filter[j] and filter[j+1] | ||||||
|         add                 x11, x11, x7, lsl #1            // &src[j  ][i] |         add             x11, x11, x7, lsl #1            // &src[j  ][i] | ||||||
|         add                 x12, x12, x7, lsl #1            // &src[j+1][i] |         add             x12, x12, x7, lsl #1            // &src[j+1][i] | ||||||
|         ld1                 {v5.8h}, [x11]                  // read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
 |         ld1             {v5.8h}, [x11]                  // read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
 | ||||||
|         ld1                 {v6.8h}, [x12]                  // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
 |         ld1             {v6.8h}, [x12]                  // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
 | ||||||
|         smlal               v3.4s, v5.4h, v7.h[0]           // val0 += {A,B,C,D} * X |         smlal           v3.4s, v5.4h, v7.h[0]           // val0 += {A,B,C,D} * X | ||||||
|         smlal2              v4.4s, v5.8h, v7.h[0]           // val1 += {E,F,G,H} * X |         smlal2          v4.4s, v5.8h, v7.h[0]           // val1 += {E,F,G,H} * X | ||||||
|         smlal               v3.4s, v6.4h, v7.h[1]           // val0 += {I,J,K,L} * Y |         smlal           v3.4s, v6.4h, v7.h[1]           // val0 += {I,J,K,L} * Y | ||||||
|         smlal2              v4.4s, v6.8h, v7.h[1]           // val1 += {M,N,O,P} * Y |         smlal2          v4.4s, v6.8h, v7.h[1]           // val1 += {M,N,O,P} * Y | ||||||
|         subs                w8, w8, #2                      // tmpfilterSize -= 2 |         subs            w8, w8, #2                      // tmpfilterSize -= 2 | ||||||
|         b.gt                3b                              // loop until filterSize consumed |         b.gt            3b                              // loop until filterSize consumed | ||||||
| 
 | 
 | ||||||
|         sqshrun             v3.4h, v3.4s, #16               // clip16(val0>>16) |         sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16) | ||||||
|         sqshrun2            v3.8h, v4.4s, #16               // clip16(val1>>16) |         sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16) | ||||||
|         uqshrn              v3.8b, v3.8h, #3                // clip8(val>>19) |         uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19) | ||||||
|         st1                 {v3.8b}, [x3], #8               // write to destination |         st1             {v3.8b}, [x3], #8               // write to destination | ||||||
|         subs                w4, w4, #8                      // dstW -= 8 |         subs            w4, w4, #8                      // dstW -= 8 | ||||||
|         add                 x7, x7, #8                      // i += 8 |         add             x7, x7, #8                      // i += 8 | ||||||
|         b.gt                2b                              // loop until width consumed |         b.gt            2b                              // loop until width consumed | ||||||
|         ret |         ret | ||||||
| 
 | 
 | ||||||
| // If filter size is odd (most likely == 1), then use this section. | // If filter size is odd (most likely == 1), then use this section. | ||||||
| // fs % 2 != 0 | // fs % 2 != 0 | ||||||
| 4:      mov                 v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value | 4:      mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value | ||||||
|         mov                 v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value |         mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value | ||||||
|         mov                 w8, w1                          // tmpfilterSize = filterSize |         mov             w8, w1                          // tmpfilterSize = filterSize | ||||||
|         mov                 x9, x2                          // srcp    = src |         mov             x9, x2                          // srcp    = src | ||||||
|         mov                 x10, x0                         // filterp = filter |         mov             x10, x0                         // filterp = filter | ||||||
| 5:      ldr                 x11, [x9], #8                   // get 1 pointer: src[j] | 5:      ldr             x11, [x9], #8                   // get 1 pointer: src[j] | ||||||
|         ldr                 h6, [x10], #2                   // read 1 16 bit coeff X at filter[j] |         ldr             h6, [x10], #2                   // read 1 16 bit coeff X at filter[j] | ||||||
|         add                 x11, x11, x7, lsl #1            // &src[j  ][i] |         add             x11, x11, x7, lsl #1            // &src[j  ][i] | ||||||
|         ld1                 {v5.8h}, [x11]                  // read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
 |         ld1             {v5.8h}, [x11]                  // read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
 | ||||||
|         smlal               v3.4s, v5.4h, v6.h[0]           // val0 += {A,B,C,D} * X |         smlal           v3.4s, v5.4h, v6.h[0]           // val0 += {A,B,C,D} * X | ||||||
|         smlal2              v4.4s, v5.8h, v6.h[0]           // val1 += {E,F,G,H} * X |         smlal2          v4.4s, v5.8h, v6.h[0]           // val1 += {E,F,G,H} * X | ||||||
|         subs                w8, w8, #1                      // tmpfilterSize -= 1 |         subs            w8, w8, #1                      // tmpfilterSize -= 1 | ||||||
|         b.gt                5b                              // loop until filterSize consumed |         b.gt            5b                              // loop until filterSize consumed | ||||||
| 
 | 
 | ||||||
|         sqshrun             v3.4h, v3.4s, #16               // clip16(val0>>16) |         sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16) | ||||||
|         sqshrun2            v3.8h, v4.4s, #16               // clip16(val1>>16) |         sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16) | ||||||
|         uqshrn              v3.8b, v3.8h, #3                // clip8(val>>19) |         uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19) | ||||||
|         st1                 {v3.8b}, [x3], #8               // write to destination |         st1             {v3.8b}, [x3], #8               // write to destination | ||||||
|         subs                w4, w4, #8                      // dstW -= 8 |         subs            w4, w4, #8                      // dstW -= 8 | ||||||
|         add                 x7, x7, #8                      // i += 8 |         add             x7, x7, #8                      // i += 8 | ||||||
|         b.gt                4b                              // loop until width consumed |         b.gt            4b                              // loop until width consumed | ||||||
|         ret |         ret | ||||||
| 
 | 
 | ||||||
| 6:      // fs=8 | 6:      // fs=8 | ||||||
|         ldp                 x5, x6, [x2]                    // load 2 pointers: src[j  ] and src[j+1] |         ldp             x5, x6, [x2]                    // load 2 pointers: src[j  ] and src[j+1] | ||||||
|         ldp                 x7, x9, [x2, #16]               // load 2 pointers: src[j+2] and src[j+3] |         ldp             x7, x9, [x2, #16]               // load 2 pointers: src[j+2] and src[j+3] | ||||||
|         ldp                 x10, x11, [x2, #32]             // load 2 pointers: src[j+4] and src[j+5] |         ldp             x10, x11, [x2, #32]             // load 2 pointers: src[j+4] and src[j+5] | ||||||
|         ldp                 x12, x13, [x2, #48]             // load 2 pointers: src[j+6] and src[j+7] |         ldp             x12, x13, [x2, #48]             // load 2 pointers: src[j+6] and src[j+7] | ||||||
| 
 | 
 | ||||||
|         // load 8x16-bit values for filter[j], where j=0..7 |         // load 8x16-bit values for filter[j], where j=0..7 | ||||||
|         ld1                 {v6.8h}, [x0] |         ld1             {v6.8h}, [x0] | ||||||
| 7: | 7: | ||||||
|         mov                 v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value |         mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value | ||||||
|         mov                 v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value |         mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value | ||||||
| 
 | 
 | ||||||
|         ld1                 {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}] |         ld1             {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}] | ||||||
|         ld1                 {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}] |         ld1             {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}] | ||||||
|         ld1                 {v26.8h}, [x7], #16             // load 8x16-bit values for src[j + 2][i + {0..7}] |         ld1             {v26.8h}, [x7], #16             // load 8x16-bit values for src[j + 2][i + {0..7}] | ||||||
|         ld1                 {v27.8h}, [x9], #16             // load 8x16-bit values for src[j + 3][i + {0..7}] |         ld1             {v27.8h}, [x9], #16             // load 8x16-bit values for src[j + 3][i + {0..7}] | ||||||
|         ld1                 {v28.8h}, [x10], #16            // load 8x16-bit values for src[j + 4][i + {0..7}] |         ld1             {v28.8h}, [x10], #16            // load 8x16-bit values for src[j + 4][i + {0..7}] | ||||||
|         ld1                 {v29.8h}, [x11], #16            // load 8x16-bit values for src[j + 5][i + {0..7}] |         ld1             {v29.8h}, [x11], #16            // load 8x16-bit values for src[j + 5][i + {0..7}] | ||||||
|         ld1                 {v30.8h}, [x12], #16            // load 8x16-bit values for src[j + 6][i + {0..7}] |         ld1             {v30.8h}, [x12], #16            // load 8x16-bit values for src[j + 6][i + {0..7}] | ||||||
|         ld1                 {v31.8h}, [x13], #16            // load 8x16-bit values for src[j + 7][i + {0..7}] |         ld1             {v31.8h}, [x13], #16            // load 8x16-bit values for src[j + 7][i + {0..7}] | ||||||
| 
 | 
 | ||||||
|         smlal               v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0] |         smlal           v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0] | ||||||
|         smlal2              v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0] |         smlal2          v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0] | ||||||
|         smlal               v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1] |         smlal           v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1] | ||||||
|         smlal2              v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1] |         smlal2          v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1] | ||||||
|         smlal               v3.4s, v26.4h, v6.h[2]          // val0 += src[2][i + {0..3}] * filter[2] |         smlal           v3.4s, v26.4h, v6.h[2]          // val0 += src[2][i + {0..3}] * filter[2] | ||||||
|         smlal2              v4.4s, v26.8h, v6.h[2]          // val1 += src[2][i + {4..7}] * filter[2] |         smlal2          v4.4s, v26.8h, v6.h[2]          // val1 += src[2][i + {4..7}] * filter[2] | ||||||
|         smlal               v3.4s, v27.4h, v6.h[3]          // val0 += src[3][i + {0..3}] * filter[3] |         smlal           v3.4s, v27.4h, v6.h[3]          // val0 += src[3][i + {0..3}] * filter[3] | ||||||
|         smlal2              v4.4s, v27.8h, v6.h[3]          // val1 += src[3][i + {4..7}] * filter[3] |         smlal2          v4.4s, v27.8h, v6.h[3]          // val1 += src[3][i + {4..7}] * filter[3] | ||||||
|         smlal               v3.4s, v28.4h, v6.h[4]          // val0 += src[4][i + {0..3}] * filter[4] |         smlal           v3.4s, v28.4h, v6.h[4]          // val0 += src[4][i + {0..3}] * filter[4] | ||||||
|         smlal2              v4.4s, v28.8h, v6.h[4]          // val1 += src[4][i + {4..7}] * filter[4] |         smlal2          v4.4s, v28.8h, v6.h[4]          // val1 += src[4][i + {4..7}] * filter[4] | ||||||
|         smlal               v3.4s, v29.4h, v6.h[5]          // val0 += src[5][i + {0..3}] * filter[5] |         smlal           v3.4s, v29.4h, v6.h[5]          // val0 += src[5][i + {0..3}] * filter[5] | ||||||
|         smlal2              v4.4s, v29.8h, v6.h[5]          // val1 += src[5][i + {4..7}] * filter[5] |         smlal2          v4.4s, v29.8h, v6.h[5]          // val1 += src[5][i + {4..7}] * filter[5] | ||||||
|         smlal               v3.4s, v30.4h, v6.h[6]          // val0 += src[6][i + {0..3}] * filter[6] |         smlal           v3.4s, v30.4h, v6.h[6]          // val0 += src[6][i + {0..3}] * filter[6] | ||||||
|         smlal2              v4.4s, v30.8h, v6.h[6]          // val1 += src[6][i + {4..7}] * filter[6] |         smlal2          v4.4s, v30.8h, v6.h[6]          // val1 += src[6][i + {4..7}] * filter[6] | ||||||
|         smlal               v3.4s, v31.4h, v6.h[7]          // val0 += src[7][i + {0..3}] * filter[7] |         smlal           v3.4s, v31.4h, v6.h[7]          // val0 += src[7][i + {0..3}] * filter[7] | ||||||
|         smlal2              v4.4s, v31.8h, v6.h[7]          // val1 += src[7][i + {4..7}] * filter[7] |         smlal2          v4.4s, v31.8h, v6.h[7]          // val1 += src[7][i + {4..7}] * filter[7] | ||||||
| 
 | 
 | ||||||
|         sqshrun             v3.4h, v3.4s, #16               // clip16(val0>>16) |         sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16) | ||||||
|         sqshrun2            v3.8h, v4.4s, #16               // clip16(val1>>16) |         sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16) | ||||||
|         uqshrn              v3.8b, v3.8h, #3                // clip8(val>>19) |         uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19) | ||||||
|         subs                w4, w4, #8                      // dstW -= 8 |         subs            w4, w4, #8                      // dstW -= 8 | ||||||
|         st1                 {v3.8b}, [x3], #8               // write to destination |         st1             {v3.8b}, [x3], #8               // write to destination | ||||||
|         b.gt                7b                              // loop until width consumed |         b.gt            7b                              // loop until width consumed | ||||||
|         ret |         ret | ||||||
| 
 | 
 | ||||||
| 8:      // fs=4 | 8:      // fs=4 | ||||||
|         ldp                 x5, x6, [x2]                    // load 2 pointers: src[j  ] and src[j+1] |         ldp             x5, x6, [x2]                    // load 2 pointers: src[j  ] and src[j+1] | ||||||
|         ldp                 x7, x9, [x2, #16]               // load 2 pointers: src[j+2] and src[j+3] |         ldp             x7, x9, [x2, #16]               // load 2 pointers: src[j+2] and src[j+3] | ||||||
| 
 | 
 | ||||||
|         // load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes |         // load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes | ||||||
|         ld1                 {v6.4h}, [x0] |         ld1             {v6.4h}, [x0] | ||||||
| 9: | 9: | ||||||
|         mov                 v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value |         mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value | ||||||
|         mov                 v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value |         mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value | ||||||
| 
 | 
 | ||||||
|         ld1                 {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}] |         ld1             {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}] | ||||||
|         ld1                 {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}] |         ld1             {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}] | ||||||
|         ld1                 {v26.8h}, [x7], #16             // load 8x16-bit values for src[j + 2][i + {0..7}] |         ld1             {v26.8h}, [x7], #16             // load 8x16-bit values for src[j + 2][i + {0..7}] | ||||||
|         ld1                 {v27.8h}, [x9], #16             // load 8x16-bit values for src[j + 3][i + {0..7}] |         ld1             {v27.8h}, [x9], #16             // load 8x16-bit values for src[j + 3][i + {0..7}] | ||||||
| 
 | 
 | ||||||
|         smlal               v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0] |         smlal           v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0] | ||||||
|         smlal2              v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0] |         smlal2          v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0] | ||||||
|         smlal               v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1] |         smlal           v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1] | ||||||
|         smlal2              v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1] |         smlal2          v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1] | ||||||
|         smlal               v3.4s, v26.4h, v6.h[2]          // val0 += src[2][i + {0..3}] * filter[2] |         smlal           v3.4s, v26.4h, v6.h[2]          // val0 += src[2][i + {0..3}] * filter[2] | ||||||
|         smlal2              v4.4s, v26.8h, v6.h[2]          // val1 += src[2][i + {4..7}] * filter[2] |         smlal2          v4.4s, v26.8h, v6.h[2]          // val1 += src[2][i + {4..7}] * filter[2] | ||||||
|         smlal               v3.4s, v27.4h, v6.h[3]          // val0 += src[3][i + {0..3}] * filter[3] |         smlal           v3.4s, v27.4h, v6.h[3]          // val0 += src[3][i + {0..3}] * filter[3] | ||||||
|         smlal2              v4.4s, v27.8h, v6.h[3]          // val1 += src[3][i + {4..7}] * filter[3] |         smlal2          v4.4s, v27.8h, v6.h[3]          // val1 += src[3][i + {4..7}] * filter[3] | ||||||
| 
 | 
 | ||||||
|         sqshrun             v3.4h, v3.4s, #16               // clip16(val0>>16) |         sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16) | ||||||
|         sqshrun2            v3.8h, v4.4s, #16               // clip16(val1>>16) |         sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16) | ||||||
|         uqshrn              v3.8b, v3.8h, #3                // clip8(val>>19) |         uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19) | ||||||
|         st1                 {v3.8b}, [x3], #8               // write to destination |         st1             {v3.8b}, [x3], #8               // write to destination | ||||||
|         subs                w4, w4, #8                      // dstW -= 8 |         subs            w4, w4, #8                      // dstW -= 8 | ||||||
|         b.gt                9b                              // loop until width consumed |         b.gt            9b                              // loop until width consumed | ||||||
|         ret |         ret | ||||||
| 
 | 
 | ||||||
| 10:     // fs=2 | 10:     // fs=2 | ||||||
|         ldp                 x5, x6, [x2]                    // load 2 pointers: src[j  ] and src[j+1] |         ldp             x5, x6, [x2]                    // load 2 pointers: src[j  ] and src[j+1] | ||||||
| 
 | 
 | ||||||
|         // load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes |         // load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes | ||||||
|         ldr                 s6, [x0] |         ldr             s6, [x0] | ||||||
| 11: | 11: | ||||||
|         mov                 v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value |         mov             v3.16b, v1.16b                  // initialize accumulator part 1 with dithering value | ||||||
|         mov                 v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value |         mov             v4.16b, v2.16b                  // initialize accumulator part 2 with dithering value | ||||||
| 
 | 
 | ||||||
|         ld1                 {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}] |         ld1             {v24.8h}, [x5], #16             // load 8x16-bit values for src[j + 0][i + {0..7}] | ||||||
|         ld1                 {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}] |         ld1             {v25.8h}, [x6], #16             // load 8x16-bit values for src[j + 1][i + {0..7}] | ||||||
| 
 | 
 | ||||||
|         smlal               v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0] |         smlal           v3.4s, v24.4h, v6.h[0]          // val0 += src[0][i + {0..3}] * filter[0] | ||||||
|         smlal2              v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0] |         smlal2          v4.4s, v24.8h, v6.h[0]          // val1 += src[0][i + {4..7}] * filter[0] | ||||||
|         smlal               v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1] |         smlal           v3.4s, v25.4h, v6.h[1]          // val0 += src[1][i + {0..3}] * filter[1] | ||||||
|         smlal2              v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1] |         smlal2          v4.4s, v25.8h, v6.h[1]          // val1 += src[1][i + {4..7}] * filter[1] | ||||||
| 
 | 
 | ||||||
|         sqshrun             v3.4h, v3.4s, #16               // clip16(val0>>16) |         sqshrun         v3.4h, v3.4s, #16               // clip16(val0>>16) | ||||||
|         sqshrun2            v3.8h, v4.4s, #16               // clip16(val1>>16) |         sqshrun2        v3.8h, v4.4s, #16               // clip16(val1>>16) | ||||||
|         uqshrn              v3.8b, v3.8h, #3                // clip8(val>>19) |         uqshrn          v3.8b, v3.8h, #3                // clip8(val>>19) | ||||||
|         st1                 {v3.8b}, [x3], #8               // write to destination |         st1             {v3.8b}, [x3], #8               // write to destination | ||||||
|         subs                w4, w4, #8                      // dstW -= 8 |         subs            w4, w4, #8                      // dstW -= 8 | ||||||
|         b.gt                11b                             // loop until width consumed |         b.gt            11b                             // loop until width consumed | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
| 
 | 
 | ||||||
| @ -210,25 +210,25 @@ function ff_yuv2plane1_8_neon, export=1 | |||||||
| // w2 - int dstW, | // w2 - int dstW, | ||||||
| // x3 - const uint8_t *dither, | // x3 - const uint8_t *dither, | ||||||
| // w4 - int offset | // w4 - int offset | ||||||
|         ld1                 {v0.8b}, [x3]                   // load 8x8-bit dither |         ld1             {v0.8b}, [x3]                   // load 8x8-bit dither | ||||||
|         and                 w4, w4, #7 |         and             w4, w4, #7 | ||||||
|         cbz                 w4, 1f                          // check if offsetting present |         cbz             w4, 1f                          // check if offsetting present | ||||||
|         ext                 v0.8b, v0.8b, v0.8b, #3         // honor offsetting which can be 0 or 3 only |         ext             v0.8b, v0.8b, v0.8b, #3         // honor offsetting which can be 0 or 3 only | ||||||
| 1:      uxtl                v0.8h, v0.8b                    // extend dither to 16-bit (widened to 32-bit below) | 1:      uxtl            v0.8h, v0.8b                    // extend dither to 16-bit (widened to 32-bit below) | ||||||
|         uxtl                v1.4s, v0.4h |         uxtl            v1.4s, v0.4h | ||||||
|         uxtl2               v2.4s, v0.8h |         uxtl2           v2.4s, v0.8h | ||||||
| 2: | 2: | ||||||
|         ld1                 {v3.8h}, [x0], #16              // read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
 |         ld1             {v3.8h}, [x0], #16              // read 8x16-bit @ src[j  ][i + {0..7}]: A,B,C,D,E,F,G,H
 | ||||||
|         sxtl                v4.4s, v3.4h |         sxtl            v4.4s, v3.4h | ||||||
|         sxtl2               v5.4s, v3.8h |         sxtl2           v5.4s, v3.8h | ||||||
|         add                 v4.4s, v4.4s, v1.4s |         add             v4.4s, v4.4s, v1.4s | ||||||
|         add                 v5.4s, v5.4s, v2.4s |         add             v5.4s, v5.4s, v2.4s | ||||||
|         sqshrun             v4.4h, v4.4s, #6 |         sqshrun         v4.4h, v4.4s, #6 | ||||||
|         sqshrun2            v4.8h, v5.4s, #6 |         sqshrun2        v4.8h, v5.4s, #6 | ||||||
| 
 | 
 | ||||||
|         uqshrn              v3.8b, v4.8h, #1                // clip8(val>>7) |         uqshrn          v3.8b, v4.8h, #1                // clip8(val>>7) | ||||||
|         subs                w2, w2, #8                      // dstW -= 8 |         subs            w2, w2, #8                      // dstW -= 8 | ||||||
|         st1                 {v3.8b}, [x1], #8               // write to destination |         st1             {v3.8b}, [x1], #8               // write to destination | ||||||
|         b.gt                2b                              // loop until width consumed |         b.gt            2b                              // loop until width consumed | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
|  | |||||||
| @ -23,23 +23,23 @@ | |||||||
| 
 | 
 | ||||||
| .macro load_yoff_ycoeff yoff ycoeff | .macro load_yoff_ycoeff yoff ycoeff | ||||||
| #if defined(__APPLE__) | #if defined(__APPLE__) | ||||||
|     ldp                 w9, w10, [sp, #\yoff] |         ldp             w9, w10, [sp, #\yoff] | ||||||
| #else | #else | ||||||
|     ldr                 w9,  [sp, #\yoff] |         ldr             w9,  [sp, #\yoff] | ||||||
|     ldr                 w10, [sp, #\ycoeff] |         ldr             w10, [sp, #\ycoeff] | ||||||
| #endif | #endif | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro load_args_nv12
 | .macro load_args_nv12
 | ||||||
|     ldr                 x8,  [sp]                                       // table |         ldr             x8,  [sp]                                       // table | ||||||
|     load_yoff_ycoeff    8, 16                                           // y_offset, y_coeff |         load_yoff_ycoeff 8, 16                                           // y_offset, y_coeff | ||||||
|     ld1                 {v1.1d}, [x8] |         ld1             {v1.1d}, [x8] | ||||||
|     dup                 v0.8h, w10 |         dup             v0.8h, w10 | ||||||
|     dup                 v3.8h, w9 |         dup             v3.8h, w9 | ||||||
|     sub                 w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding) |         sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding) | ||||||
|     sub                 w5, w5, w0                                      // w5 = linesizeY - width     (paddingY) |         sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY) | ||||||
|     sub                 w7, w7, w0                                      // w7 = linesizeC - width     (paddingC) |         sub             w7, w7, w0                                      // w7 = linesizeC - width     (paddingC) | ||||||
|     neg                 w11, w0 |         neg             w11, w0 | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro load_args_nv21
 | .macro load_args_nv21
 | ||||||
| @ -47,52 +47,52 @@ | |||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro load_args_yuv420p
 | .macro load_args_yuv420p
 | ||||||
|     ldr                 x13, [sp]                                       // srcV |         ldr             x13, [sp]                                       // srcV | ||||||
|     ldr                 w14, [sp, #8]                                   // linesizeV |         ldr             w14, [sp, #8]                                   // linesizeV | ||||||
|     ldr                 x8,  [sp, #16]                                  // table |         ldr             x8,  [sp, #16]                                  // table | ||||||
|     load_yoff_ycoeff    24, 32                                          // y_offset, y_coeff |         load_yoff_ycoeff 24, 32                                          // y_offset, y_coeff | ||||||
|     ld1                 {v1.1d}, [x8] |         ld1             {v1.1d}, [x8] | ||||||
|     dup                 v0.8h, w10 |         dup             v0.8h, w10 | ||||||
|     dup                 v3.8h, w9 |         dup             v3.8h, w9 | ||||||
|     sub                 w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding) |         sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding) | ||||||
|     sub                 w5, w5, w0                                      // w5 = linesizeY - width     (paddingY) |         sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY) | ||||||
|     sub                 w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU) |         sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU) | ||||||
|     sub                 w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV) |         sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV) | ||||||
|     lsr                 w11, w0, #1 |         lsr             w11, w0, #1 | ||||||
|     neg                 w11, w11 |         neg             w11, w11 | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro load_args_yuv422p
 | .macro load_args_yuv422p
 | ||||||
|     ldr                 x13, [sp]                                       // srcV |         ldr             x13, [sp]                                       // srcV | ||||||
|     ldr                 w14, [sp, #8]                                   // linesizeV |         ldr             w14, [sp, #8]                                   // linesizeV | ||||||
|     ldr                 x8,  [sp, #16]                                  // table |         ldr             x8,  [sp, #16]                                  // table | ||||||
|     load_yoff_ycoeff    24, 32                                          // y_offset, y_coeff |         load_yoff_ycoeff 24, 32                                          // y_offset, y_coeff | ||||||
|     ld1                 {v1.1d}, [x8] |         ld1             {v1.1d}, [x8] | ||||||
|     dup                 v0.8h, w10 |         dup             v0.8h, w10 | ||||||
|     dup                 v3.8h, w9 |         dup             v3.8h, w9 | ||||||
|     sub                 w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding) |         sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding) | ||||||
|     sub                 w5, w5, w0                                      // w5 = linesizeY - width     (paddingY) |         sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY) | ||||||
|     sub                 w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU) |         sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU) | ||||||
|     sub                 w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV) |         sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV) | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro load_chroma_nv12
 | .macro load_chroma_nv12
 | ||||||
|     ld2                 {v16.8b, v17.8b}, [x6], #16 |         ld2             {v16.8b, v17.8b}, [x6], #16 | ||||||
|     ushll               v18.8h, v16.8b, #3 |         ushll           v18.8h, v16.8b, #3 | ||||||
|     ushll               v19.8h, v17.8b, #3 |         ushll           v19.8h, v17.8b, #3 | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro load_chroma_nv21
 | .macro load_chroma_nv21
 | ||||||
|     ld2                 {v16.8b, v17.8b}, [x6], #16 |         ld2             {v16.8b, v17.8b}, [x6], #16 | ||||||
|     ushll               v19.8h, v16.8b, #3 |         ushll           v19.8h, v16.8b, #3 | ||||||
|     ushll               v18.8h, v17.8b, #3 |         ushll           v18.8h, v17.8b, #3 | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro load_chroma_yuv420p
 | .macro load_chroma_yuv420p
 | ||||||
|     ld1                 {v16.8b}, [ x6], #8 |         ld1             {v16.8b}, [ x6], #8 | ||||||
|     ld1                 {v17.8b}, [x13], #8 |         ld1             {v17.8b}, [x13], #8 | ||||||
|     ushll               v18.8h, v16.8b, #3 |         ushll           v18.8h, v16.8b, #3 | ||||||
|     ushll               v19.8h, v17.8b, #3 |         ushll           v19.8h, v17.8b, #3 | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro load_chroma_yuv422p
 | .macro load_chroma_yuv422p
 | ||||||
| @ -100,9 +100,9 @@ | |||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro increment_nv12
 | .macro increment_nv12
 | ||||||
|     ands                w15, w1, #1 |         ands            w15, w1, #1 | ||||||
|     csel                w16, w7, w11, ne                                // incC = (h & 1) ? paddingC : -width |         csel            w16, w7, w11, ne                                // incC = (h & 1) ? paddingC : -width | ||||||
|     add                 x6,  x6, w16, sxtw                              // srcC += incC |         add             x6,  x6, w16, sxtw                              // srcC += incC | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro increment_nv21
 | .macro increment_nv21
 | ||||||
| @ -110,100 +110,100 @@ | |||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro increment_yuv420p
 | .macro increment_yuv420p
 | ||||||
|     ands                w15, w1, #1 |         ands            w15, w1, #1 | ||||||
|     csel                w16,  w7, w11, ne                               // incU = (h & 1) ? paddingU : -width/2 |         csel            w16,  w7, w11, ne                               // incU = (h & 1) ? paddingU : -width/2 | ||||||
|     csel                w17, w14, w11, ne                               // incV = (h & 1) ? paddingV : -width/2 |         csel            w17, w14, w11, ne                               // incV = (h & 1) ? paddingV : -width/2 | ||||||
|     add                 x6,  x6,  w16, sxtw                             // srcU += incU |         add             x6,  x6,  w16, sxtw                             // srcU += incU | ||||||
|     add                 x13, x13, w17, sxtw                             // srcV += incV |         add             x13, x13, w17, sxtw                             // srcV += incV | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro increment_yuv422p
 | .macro increment_yuv422p
 | ||||||
|     add                 x6,  x6,  w7, sxtw                              // srcU += incU |         add             x6,  x6,  w7, sxtw                              // srcU += incU | ||||||
|     add                 x13, x13, w14, sxtw                             // srcV += incV |         add             x13, x13, w14, sxtw                             // srcV += incV | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2 | .macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2 | ||||||
|     add                 v20.8h, v26.8h, v20.8h                          // Y1 + R1 |         add             v20.8h, v26.8h, v20.8h                          // Y1 + R1 | ||||||
|     add                 v21.8h, v27.8h, v21.8h                          // Y2 + R2 |         add             v21.8h, v27.8h, v21.8h                          // Y2 + R2 | ||||||
|     add                 v22.8h, v26.8h, v22.8h                          // Y1 + G1 |         add             v22.8h, v26.8h, v22.8h                          // Y1 + G1 | ||||||
|     add                 v23.8h, v27.8h, v23.8h                          // Y2 + G2 |         add             v23.8h, v27.8h, v23.8h                          // Y2 + G2 | ||||||
|     add                 v24.8h, v26.8h, v24.8h                          // Y1 + B1 |         add             v24.8h, v26.8h, v24.8h                          // Y1 + B1 | ||||||
|     add                 v25.8h, v27.8h, v25.8h                          // Y2 + B2 |         add             v25.8h, v27.8h, v25.8h                          // Y2 + B2 | ||||||
|     sqrshrun            \r1, v20.8h, #1                                 // clip_u8((Y1 + R1) >> 1) |         sqrshrun        \r1, v20.8h, #1                                 // clip_u8((Y1 + R1) >> 1) | ||||||
|     sqrshrun            \r2, v21.8h, #1                                 // clip_u8((Y2 + R2) >> 1) |         sqrshrun        \r2, v21.8h, #1                                 // clip_u8((Y2 + R2) >> 1) | ||||||
|     sqrshrun            \g1, v22.8h, #1                                 // clip_u8((Y1 + G1) >> 1) |         sqrshrun        \g1, v22.8h, #1                                 // clip_u8((Y1 + G1) >> 1) | ||||||
|     sqrshrun            \g2, v23.8h, #1                                 // clip_u8((Y2 + G2) >> 1) |         sqrshrun        \g2, v23.8h, #1                                 // clip_u8((Y2 + G2) >> 1) | ||||||
|     sqrshrun            \b1, v24.8h, #1                                 // clip_u8((Y1 + B1) >> 1) |         sqrshrun        \b1, v24.8h, #1                                 // clip_u8((Y1 + B1) >> 1) | ||||||
|     sqrshrun            \b2, v25.8h, #1                                 // clip_u8((Y2 + B2) >> 1) |         sqrshrun        \b2, v25.8h, #1                                 // clip_u8((Y2 + B2) >> 1) | ||||||
|     movi                \a1, #255 |         movi            \a1, #255 | ||||||
|     movi                \a2, #255 |         movi            \a2, #255 | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro declare_func ifmt ofmt | .macro declare_func ifmt ofmt | ||||||
| function ff_\ifmt\()_to_\ofmt\()_neon, export=1 | function ff_\ifmt\()_to_\ofmt\()_neon, export=1 | ||||||
|     load_args_\ifmt |     load_args_\ifmt | ||||||
|     mov                 w9, w1 |         mov             w9, w1 | ||||||
| 1: | 1: | ||||||
|     mov                 w8, w0                                          // w8 = width |         mov             w8, w0                                          // w8 = width | ||||||
| 2: | 2: | ||||||
|     movi                v5.8h, #4, lsl #8                               // 128 * (1<<3) |         movi            v5.8h, #4, lsl #8                               // 128 * (1<<3) | ||||||
|     load_chroma_\ifmt |     load_chroma_\ifmt | ||||||
|     sub                 v18.8h, v18.8h, v5.8h                           // U*(1<<3) - 128*(1<<3) |         sub             v18.8h, v18.8h, v5.8h                           // U*(1<<3) - 128*(1<<3) | ||||||
|     sub                 v19.8h, v19.8h, v5.8h                           // V*(1<<3) - 128*(1<<3) |         sub             v19.8h, v19.8h, v5.8h                           // V*(1<<3) - 128*(1<<3) | ||||||
|     sqdmulh             v20.8h, v19.8h, v1.h[0]                         // V * v2r            (R) |         sqdmulh         v20.8h, v19.8h, v1.h[0]                         // V * v2r            (R) | ||||||
|     sqdmulh             v22.8h, v18.8h, v1.h[1]                         // U * u2g |         sqdmulh         v22.8h, v18.8h, v1.h[1]                         // U * u2g | ||||||
|     sqdmulh             v19.8h, v19.8h, v1.h[2]                         //           V * v2g |         sqdmulh         v19.8h, v19.8h, v1.h[2]                         //           V * v2g | ||||||
|     add                 v22.8h, v22.8h, v19.8h                          // U * u2g + V * v2g  (G) |         add             v22.8h, v22.8h, v19.8h                          // U * u2g + V * v2g  (G) | ||||||
|     sqdmulh             v24.8h, v18.8h, v1.h[3]                         // U * u2b            (B) |         sqdmulh         v24.8h, v18.8h, v1.h[3]                         // U * u2b            (B) | ||||||
|     zip2                v21.8h, v20.8h, v20.8h                          // R2 |         zip2            v21.8h, v20.8h, v20.8h                          // R2 | ||||||
|     zip1                v20.8h, v20.8h, v20.8h                          // R1 |         zip1            v20.8h, v20.8h, v20.8h                          // R1 | ||||||
|     zip2                v23.8h, v22.8h, v22.8h                          // G2 |         zip2            v23.8h, v22.8h, v22.8h                          // G2 | ||||||
|     zip1                v22.8h, v22.8h, v22.8h                          // G1 |         zip1            v22.8h, v22.8h, v22.8h                          // G1 | ||||||
|     zip2                v25.8h, v24.8h, v24.8h                          // B2 |         zip2            v25.8h, v24.8h, v24.8h                          // B2 | ||||||
|     zip1                v24.8h, v24.8h, v24.8h                          // B1 |         zip1            v24.8h, v24.8h, v24.8h                          // B1 | ||||||
|     ld1                 {v2.16b}, [x4], #16                             // load luma |         ld1             {v2.16b}, [x4], #16                             // load luma | ||||||
|     ushll               v26.8h, v2.8b,  #3                              // Y1*(1<<3) |         ushll           v26.8h, v2.8b,  #3                              // Y1*(1<<3) | ||||||
|     ushll2              v27.8h, v2.16b, #3                              // Y2*(1<<3) |         ushll2          v27.8h, v2.16b, #3                              // Y2*(1<<3) | ||||||
|     sub                 v26.8h, v26.8h, v3.8h                           // Y1*(1<<3) - y_offset |         sub             v26.8h, v26.8h, v3.8h                           // Y1*(1<<3) - y_offset | ||||||
|     sub                 v27.8h, v27.8h, v3.8h                           // Y2*(1<<3) - y_offset |         sub             v27.8h, v27.8h, v3.8h                           // Y2*(1<<3) - y_offset | ||||||
|     sqdmulh             v26.8h, v26.8h, v0.8h                           // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15 |         sqdmulh         v26.8h, v26.8h, v0.8h                           // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15 | ||||||
|     sqdmulh             v27.8h, v27.8h, v0.8h                           // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15 |         sqdmulh         v27.8h, v27.8h, v0.8h                           // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15 | ||||||
| 
 | 
 | ||||||
| .ifc \ofmt,argb // 1 2 3 0 | .ifc \ofmt,argb // 1 2 3 0 | ||||||
|     compute_rgba        v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b |         compute_rgba    v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b | ||||||
| .endif | .endif | ||||||
| 
 | 
 | ||||||
| .ifc \ofmt,rgba // 0 1 2 3 | .ifc \ofmt,rgba // 0 1 2 3 | ||||||
|     compute_rgba        v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b |         compute_rgba    v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b | ||||||
| .endif | .endif | ||||||
| 
 | 
 | ||||||
| .ifc \ofmt,abgr // 3 2 1 0 | .ifc \ofmt,abgr // 3 2 1 0 | ||||||
|     compute_rgba        v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b |         compute_rgba    v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b | ||||||
| .endif | .endif | ||||||
| 
 | 
 | ||||||
| .ifc \ofmt,bgra // 2 1 0 3 | .ifc \ofmt,bgra // 2 1 0 3 | ||||||
|     compute_rgba        v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b |         compute_rgba    v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b | ||||||
| .endif | .endif | ||||||
| 
 | 
 | ||||||
|     st4                 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32 |         st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32 | ||||||
|     st4                 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32 |         st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32 | ||||||
|     subs                w8, w8, #16                                     // width -= 16 |         subs            w8, w8, #16                                     // width -= 16 | ||||||
|     b.gt                2b |         b.gt            2b | ||||||
|     add                 x2, x2, w3, sxtw                                // dst  += padding |         add             x2, x2, w3, sxtw                                // dst  += padding | ||||||
|     add                 x4, x4, w5, sxtw                                // srcY += paddingY |         add             x4, x4, w5, sxtw                                // srcY += paddingY | ||||||
|     increment_\ifmt |     increment_\ifmt | ||||||
|     subs                w1, w1, #1                                      // height -= 1 |         subs            w1, w1, #1                                      // height -= 1 | ||||||
|     b.gt                1b |         b.gt            1b | ||||||
|     mov                 w0, w9 |         mov             w0, w9 | ||||||
|     ret |     ret | ||||||
| endfunc | endfunc | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| .macro declare_rgb_funcs ifmt | .macro declare_rgb_funcs ifmt | ||||||
|     declare_func \ifmt, argb |         declare_func    \ifmt, argb | ||||||
|     declare_func \ifmt, rgba |         declare_func    \ifmt, rgba | ||||||
|     declare_func \ifmt, abgr |         declare_func    \ifmt, abgr | ||||||
|     declare_func \ifmt, bgra |         declare_func    \ifmt, bgra | ||||||
| .endm | .endm | ||||||
| 
 | 
 | ||||||
| declare_rgb_funcs nv12 | declare_rgb_funcs nv12 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user