aarch64: Consistently use lowercase for vector element specifiers

Signed-off-by: Martin Storsjö <martin@martin.st>
Author: Martin Storsjö <martin@martin.st>
Date:   2023-10-17 14:16:24 +03:00
Parent: 393d1ee541
Commit: 184103b310

16 files changed, 2199 insertions(+), 2199 deletions(-)

@@ -19,82 +19,82 @@
#include "libavutil/aarch64/asm.S" #include "libavutil/aarch64/asm.S"
function ff_ps_add_squares_neon, export=1 function ff_ps_add_squares_neon, export=1
1: ld1 {v0.4S,v1.4S}, [x1], #32 1: ld1 {v0.4s,v1.4s}, [x1], #32
fmul v0.4S, v0.4S, v0.4S fmul v0.4s, v0.4s, v0.4s
fmul v1.4S, v1.4S, v1.4S fmul v1.4s, v1.4s, v1.4s
faddp v2.4S, v0.4S, v1.4S faddp v2.4s, v0.4s, v1.4s
ld1 {v3.4S}, [x0] ld1 {v3.4s}, [x0]
fadd v3.4S, v3.4S, v2.4S fadd v3.4s, v3.4s, v2.4s
st1 {v3.4S}, [x0], #16 st1 {v3.4s}, [x0], #16
subs w2, w2, #4 subs w2, w2, #4
b.gt 1b b.gt 1b
ret ret
endfunc endfunc
function ff_ps_mul_pair_single_neon, export=1 function ff_ps_mul_pair_single_neon, export=1
1: ld1 {v0.4S,v1.4S}, [x1], #32 1: ld1 {v0.4s,v1.4s}, [x1], #32
ld1 {v2.4S}, [x2], #16 ld1 {v2.4s}, [x2], #16
zip1 v3.4S, v2.4S, v2.4S zip1 v3.4s, v2.4s, v2.4s
zip2 v4.4S, v2.4S, v2.4S zip2 v4.4s, v2.4s, v2.4s
fmul v0.4S, v0.4S, v3.4S fmul v0.4s, v0.4s, v3.4s
fmul v1.4S, v1.4S, v4.4S fmul v1.4s, v1.4s, v4.4s
st1 {v0.4S,v1.4S}, [x0], #32 st1 {v0.4s,v1.4s}, [x0], #32
subs w3, w3, #4 subs w3, w3, #4
b.gt 1b b.gt 1b
ret ret
endfunc endfunc
function ff_ps_stereo_interpolate_neon, export=1 function ff_ps_stereo_interpolate_neon, export=1
ld1 {v0.4S}, [x2] ld1 {v0.4s}, [x2]
ld1 {v1.4S}, [x3] ld1 {v1.4s}, [x3]
zip1 v4.4S, v0.4S, v0.4S zip1 v4.4s, v0.4s, v0.4s
zip2 v5.4S, v0.4S, v0.4S zip2 v5.4s, v0.4s, v0.4s
zip1 v6.4S, v1.4S, v1.4S zip1 v6.4s, v1.4s, v1.4s
zip2 v7.4S, v1.4S, v1.4S zip2 v7.4s, v1.4s, v1.4s
1: ld1 {v2.2S}, [x0] 1: ld1 {v2.2s}, [x0]
ld1 {v3.2S}, [x1] ld1 {v3.2s}, [x1]
fadd v4.4S, v4.4S, v6.4S fadd v4.4s, v4.4s, v6.4s
fadd v5.4S, v5.4S, v7.4S fadd v5.4s, v5.4s, v7.4s
mov v2.D[1], v2.D[0] mov v2.d[1], v2.d[0]
mov v3.D[1], v3.D[0] mov v3.d[1], v3.d[0]
fmul v2.4S, v2.4S, v4.4S fmul v2.4s, v2.4s, v4.4s
fmla v2.4S, v3.4S, v5.4S fmla v2.4s, v3.4s, v5.4s
st1 {v2.D}[0], [x0], #8 st1 {v2.d}[0], [x0], #8
st1 {v2.D}[1], [x1], #8 st1 {v2.d}[1], [x1], #8
subs w4, w4, #1 subs w4, w4, #1
b.gt 1b b.gt 1b
ret ret
endfunc endfunc
function ff_ps_stereo_interpolate_ipdopd_neon, export=1 function ff_ps_stereo_interpolate_ipdopd_neon, export=1
ld1 {v0.4S,v1.4S}, [x2] ld1 {v0.4s,v1.4s}, [x2]
ld1 {v6.4S,v7.4S}, [x3] ld1 {v6.4s,v7.4s}, [x3]
fneg v2.4S, v1.4S fneg v2.4s, v1.4s
fneg v3.4S, v7.4S fneg v3.4s, v7.4s
zip1 v16.4S, v0.4S, v0.4S zip1 v16.4s, v0.4s, v0.4s
zip2 v17.4S, v0.4S, v0.4S zip2 v17.4s, v0.4s, v0.4s
zip1 v18.4S, v2.4S, v1.4S zip1 v18.4s, v2.4s, v1.4s
zip2 v19.4S, v2.4S, v1.4S zip2 v19.4s, v2.4s, v1.4s
zip1 v20.4S, v6.4S, v6.4S zip1 v20.4s, v6.4s, v6.4s
zip2 v21.4S, v6.4S, v6.4S zip2 v21.4s, v6.4s, v6.4s
zip1 v22.4S, v3.4S, v7.4S zip1 v22.4s, v3.4s, v7.4s
zip2 v23.4S, v3.4S, v7.4S zip2 v23.4s, v3.4s, v7.4s
1: ld1 {v2.2S}, [x0] 1: ld1 {v2.2s}, [x0]
ld1 {v3.2S}, [x1] ld1 {v3.2s}, [x1]
fadd v16.4S, v16.4S, v20.4S fadd v16.4s, v16.4s, v20.4s
fadd v17.4S, v17.4S, v21.4S fadd v17.4s, v17.4s, v21.4s
mov v2.D[1], v2.D[0] mov v2.d[1], v2.d[0]
mov v3.D[1], v3.D[0] mov v3.d[1], v3.d[0]
fmul v4.4S, v2.4S, v16.4S fmul v4.4s, v2.4s, v16.4s
fmla v4.4S, v3.4S, v17.4S fmla v4.4s, v3.4s, v17.4s
fadd v18.4S, v18.4S, v22.4S fadd v18.4s, v18.4s, v22.4s
fadd v19.4S, v19.4S, v23.4S fadd v19.4s, v19.4s, v23.4s
ext v2.16B, v2.16B, v2.16B, #4 ext v2.16b, v2.16b, v2.16b, #4
ext v3.16B, v3.16B, v3.16B, #4 ext v3.16b, v3.16b, v3.16b, #4
fmla v4.4S, v2.4S, v18.4S fmla v4.4s, v2.4s, v18.4s
fmla v4.4S, v3.4S, v19.4S fmla v4.4s, v3.4s, v19.4s
st1 {v4.D}[0], [x0], #8 st1 {v4.d}[0], [x0], #8
st1 {v4.D}[1], [x1], #8 st1 {v4.d}[1], [x1], #8
subs w4, w4, #1 subs w4, w4, #1
b.gt 1b b.gt 1b
ret ret
@ -102,46 +102,46 @@ endfunc
function ff_ps_hybrid_analysis_neon, export=1 function ff_ps_hybrid_analysis_neon, export=1
lsl x3, x3, #3 lsl x3, x3, #3
ld2 {v0.4S,v1.4S}, [x1], #32 ld2 {v0.4s,v1.4s}, [x1], #32
ld2 {v2.2S,v3.2S}, [x1], #16 ld2 {v2.2s,v3.2s}, [x1], #16
ld1 {v24.2S}, [x1], #8 ld1 {v24.2s}, [x1], #8
ld2 {v4.2S,v5.2S}, [x1], #16 ld2 {v4.2s,v5.2s}, [x1], #16
ld2 {v6.4S,v7.4S}, [x1] ld2 {v6.4s,v7.4s}, [x1]
rev64 v6.4S, v6.4S rev64 v6.4s, v6.4s
rev64 v7.4S, v7.4S rev64 v7.4s, v7.4s
ext v6.16B, v6.16B, v6.16B, #8 ext v6.16b, v6.16b, v6.16b, #8
ext v7.16B, v7.16B, v7.16B, #8 ext v7.16b, v7.16b, v7.16b, #8
rev64 v4.2S, v4.2S rev64 v4.2s, v4.2s
rev64 v5.2S, v5.2S rev64 v5.2s, v5.2s
mov v2.D[1], v3.D[0] mov v2.d[1], v3.d[0]
mov v4.D[1], v5.D[0] mov v4.d[1], v5.d[0]
mov v5.D[1], v2.D[0] mov v5.d[1], v2.d[0]
mov v3.D[1], v4.D[0] mov v3.d[1], v4.d[0]
fadd v16.4S, v0.4S, v6.4S fadd v16.4s, v0.4s, v6.4s
fadd v17.4S, v1.4S, v7.4S fadd v17.4s, v1.4s, v7.4s
fsub v18.4S, v1.4S, v7.4S fsub v18.4s, v1.4s, v7.4s
fsub v19.4S, v0.4S, v6.4S fsub v19.4s, v0.4s, v6.4s
fadd v22.4S, v2.4S, v4.4S fadd v22.4s, v2.4s, v4.4s
fsub v23.4S, v5.4S, v3.4S fsub v23.4s, v5.4s, v3.4s
trn1 v20.2D, v22.2D, v23.2D // {re4+re8, re5+re7, im8-im4, im7-im5} trn1 v20.2d, v22.2d, v23.2d // {re4+re8, re5+re7, im8-im4, im7-im5}
trn2 v21.2D, v22.2D, v23.2D // {im4+im8, im5+im7, re4-re8, re5-re7} trn2 v21.2d, v22.2d, v23.2d // {im4+im8, im5+im7, re4-re8, re5-re7}
1: ld2 {v2.4S,v3.4S}, [x2], #32 1: ld2 {v2.4s,v3.4s}, [x2], #32
ld2 {v4.2S,v5.2S}, [x2], #16 ld2 {v4.2s,v5.2s}, [x2], #16
ld1 {v6.2S}, [x2], #8 ld1 {v6.2s}, [x2], #8
add x2, x2, #8 add x2, x2, #8
mov v4.D[1], v5.D[0] mov v4.d[1], v5.d[0]
mov v6.S[1], v6.S[0] mov v6.s[1], v6.s[0]
fmul v6.2S, v6.2S, v24.2S fmul v6.2s, v6.2s, v24.2s
fmul v0.4S, v2.4S, v16.4S fmul v0.4s, v2.4s, v16.4s
fmul v1.4S, v2.4S, v17.4S fmul v1.4s, v2.4s, v17.4s
fmls v0.4S, v3.4S, v18.4S fmls v0.4s, v3.4s, v18.4s
fmla v1.4S, v3.4S, v19.4S fmla v1.4s, v3.4s, v19.4s
fmla v0.4S, v4.4S, v20.4S fmla v0.4s, v4.4s, v20.4s
fmla v1.4S, v4.4S, v21.4S fmla v1.4s, v4.4s, v21.4s
faddp v0.4S, v0.4S, v1.4S faddp v0.4s, v0.4s, v1.4s
faddp v0.4S, v0.4S, v0.4S faddp v0.4s, v0.4s, v0.4s
fadd v0.2S, v0.2S, v6.2S fadd v0.2s, v0.2s, v6.2s
st1 {v0.2S}, [x0], x3 st1 {v0.2s}, [x0], x3
subs w4, w4, #1 subs w4, w4, #1
b.gt 1b b.gt 1b
ret ret

@@ -39,10 +39,10 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
lsl w10, w10, #1 lsl w10, w10, #1
add w9, w9, w10 add w9, w9, w10
add x6, x6, w9, UXTW add x6, x6, w9, UXTW
ld1r {v22.8H}, [x6] ld1r {v22.8h}, [x6]
.endif .endif
.ifc \codec,vc1 .ifc \codec,vc1
movi v22.8H, #28 movi v22.8h, #28
.endif .endif
mul w7, w4, w5 mul w7, w4, w5
lsl w14, w5, #3 lsl w14, w5, #3
@@ -55,139 +55,139 @@ function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
add w4, w4, #64 add w4, w4, #64
b.eq 2f b.eq 2f
dup v0.8B, w4 dup v0.8b, w4
dup v1.8B, w12 dup v1.8b, w12
ld1 {v4.8B, v5.8B}, [x1], x2 ld1 {v4.8b, v5.8b}, [x1], x2
dup v2.8B, w6 dup v2.8b, w6
dup v3.8B, w7 dup v3.8b, w7
ext v5.8B, v4.8B, v5.8B, #1 ext v5.8b, v4.8b, v5.8b, #1
1: ld1 {v6.8B, v7.8B}, [x1], x2 1: ld1 {v6.8b, v7.8b}, [x1], x2
umull v16.8H, v4.8B, v0.8B umull v16.8h, v4.8b, v0.8b
umlal v16.8H, v5.8B, v1.8B umlal v16.8h, v5.8b, v1.8b
ext v7.8B, v6.8B, v7.8B, #1 ext v7.8b, v6.8b, v7.8b, #1
ld1 {v4.8B, v5.8B}, [x1], x2 ld1 {v4.8b, v5.8b}, [x1], x2
umlal v16.8H, v6.8B, v2.8B umlal v16.8h, v6.8b, v2.8b
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
ext v5.8B, v4.8B, v5.8B, #1 ext v5.8b, v4.8b, v5.8b, #1
umlal v16.8H, v7.8B, v3.8B umlal v16.8h, v7.8b, v3.8b
umull v17.8H, v6.8B, v0.8B umull v17.8h, v6.8b, v0.8b
subs w3, w3, #2 subs w3, w3, #2
umlal v17.8H, v7.8B, v1.8B umlal v17.8h, v7.8b, v1.8b
umlal v17.8H, v4.8B, v2.8B umlal v17.8h, v4.8b, v2.8b
umlal v17.8H, v5.8B, v3.8B umlal v17.8h, v5.8b, v3.8b
prfm pldl1strm, [x1, x2] prfm pldl1strm, [x1, x2]
.ifc \codec,h264 .ifc \codec,h264
rshrn v16.8B, v16.8H, #6 rshrn v16.8b, v16.8h, #6
rshrn v17.8B, v17.8H, #6 rshrn v17.8b, v17.8h, #6
.else .else
add v16.8H, v16.8H, v22.8H add v16.8h, v16.8h, v22.8h
add v17.8H, v17.8H, v22.8H add v17.8h, v17.8h, v22.8h
shrn v16.8B, v16.8H, #6 shrn v16.8b, v16.8h, #6
shrn v17.8B, v17.8H, #6 shrn v17.8b, v17.8h, #6
.endif .endif
.ifc \type,avg .ifc \type,avg
ld1 {v20.8B}, [x8], x2 ld1 {v20.8b}, [x8], x2
ld1 {v21.8B}, [x8], x2 ld1 {v21.8b}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B urhadd v16.8b, v16.8b, v20.8b
urhadd v17.8B, v17.8B, v21.8B urhadd v17.8b, v17.8b, v21.8b
.endif .endif
st1 {v16.8B}, [x0], x2 st1 {v16.8b}, [x0], x2
st1 {v17.8B}, [x0], x2 st1 {v17.8b}, [x0], x2
b.gt 1b b.gt 1b
ret ret
2: adds w12, w12, w6 2: adds w12, w12, w6
dup v0.8B, w4 dup v0.8b, w4
b.eq 5f b.eq 5f
tst w6, w6 tst w6, w6
dup v1.8B, w12 dup v1.8b, w12
b.eq 4f b.eq 4f
ld1 {v4.8B}, [x1], x2 ld1 {v4.8b}, [x1], x2
3: ld1 {v6.8B}, [x1], x2 3: ld1 {v6.8b}, [x1], x2
umull v16.8H, v4.8B, v0.8B umull v16.8h, v4.8b, v0.8b
umlal v16.8H, v6.8B, v1.8B umlal v16.8h, v6.8b, v1.8b
ld1 {v4.8B}, [x1], x2 ld1 {v4.8b}, [x1], x2
umull v17.8H, v6.8B, v0.8B umull v17.8h, v6.8b, v0.8b
umlal v17.8H, v4.8B, v1.8B umlal v17.8h, v4.8b, v1.8b
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
.ifc \codec,h264 .ifc \codec,h264
rshrn v16.8B, v16.8H, #6 rshrn v16.8b, v16.8h, #6
rshrn v17.8B, v17.8H, #6 rshrn v17.8b, v17.8h, #6
.else .else
add v16.8H, v16.8H, v22.8H add v16.8h, v16.8h, v22.8h
add v17.8H, v17.8H, v22.8H add v17.8h, v17.8h, v22.8h
shrn v16.8B, v16.8H, #6 shrn v16.8b, v16.8h, #6
shrn v17.8B, v17.8H, #6 shrn v17.8b, v17.8h, #6
.endif .endif
prfm pldl1strm, [x1, x2] prfm pldl1strm, [x1, x2]
.ifc \type,avg .ifc \type,avg
ld1 {v20.8B}, [x8], x2 ld1 {v20.8b}, [x8], x2
ld1 {v21.8B}, [x8], x2 ld1 {v21.8b}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B urhadd v16.8b, v16.8b, v20.8b
urhadd v17.8B, v17.8B, v21.8B urhadd v17.8b, v17.8b, v21.8b
.endif .endif
subs w3, w3, #2 subs w3, w3, #2
st1 {v16.8B}, [x0], x2 st1 {v16.8b}, [x0], x2
st1 {v17.8B}, [x0], x2 st1 {v17.8b}, [x0], x2
b.gt 3b b.gt 3b
ret ret
4: ld1 {v4.8B, v5.8B}, [x1], x2 4: ld1 {v4.8b, v5.8b}, [x1], x2
ld1 {v6.8B, v7.8B}, [x1], x2 ld1 {v6.8b, v7.8b}, [x1], x2
ext v5.8B, v4.8B, v5.8B, #1 ext v5.8b, v4.8b, v5.8b, #1
ext v7.8B, v6.8B, v7.8B, #1 ext v7.8b, v6.8b, v7.8b, #1
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
subs w3, w3, #2 subs w3, w3, #2
umull v16.8H, v4.8B, v0.8B umull v16.8h, v4.8b, v0.8b
umlal v16.8H, v5.8B, v1.8B umlal v16.8h, v5.8b, v1.8b
umull v17.8H, v6.8B, v0.8B umull v17.8h, v6.8b, v0.8b
umlal v17.8H, v7.8B, v1.8B umlal v17.8h, v7.8b, v1.8b
prfm pldl1strm, [x1, x2] prfm pldl1strm, [x1, x2]
.ifc \codec,h264 .ifc \codec,h264
rshrn v16.8B, v16.8H, #6 rshrn v16.8b, v16.8h, #6
rshrn v17.8B, v17.8H, #6 rshrn v17.8b, v17.8h, #6
.else .else
add v16.8H, v16.8H, v22.8H add v16.8h, v16.8h, v22.8h
add v17.8H, v17.8H, v22.8H add v17.8h, v17.8h, v22.8h
shrn v16.8B, v16.8H, #6 shrn v16.8b, v16.8h, #6
shrn v17.8B, v17.8H, #6 shrn v17.8b, v17.8h, #6
.endif .endif
.ifc \type,avg .ifc \type,avg
ld1 {v20.8B}, [x8], x2 ld1 {v20.8b}, [x8], x2
ld1 {v21.8B}, [x8], x2 ld1 {v21.8b}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B urhadd v16.8b, v16.8b, v20.8b
urhadd v17.8B, v17.8B, v21.8B urhadd v17.8b, v17.8b, v21.8b
.endif .endif
st1 {v16.8B}, [x0], x2 st1 {v16.8b}, [x0], x2
st1 {v17.8B}, [x0], x2 st1 {v17.8b}, [x0], x2
b.gt 4b b.gt 4b
ret ret
5: ld1 {v4.8B}, [x1], x2 5: ld1 {v4.8b}, [x1], x2
ld1 {v5.8B}, [x1], x2 ld1 {v5.8b}, [x1], x2
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
subs w3, w3, #2 subs w3, w3, #2
umull v16.8H, v4.8B, v0.8B umull v16.8h, v4.8b, v0.8b
umull v17.8H, v5.8B, v0.8B umull v17.8h, v5.8b, v0.8b
prfm pldl1strm, [x1, x2] prfm pldl1strm, [x1, x2]
.ifc \codec,h264 .ifc \codec,h264
rshrn v16.8B, v16.8H, #6 rshrn v16.8b, v16.8h, #6
rshrn v17.8B, v17.8H, #6 rshrn v17.8b, v17.8h, #6
.else .else
add v16.8H, v16.8H, v22.8H add v16.8h, v16.8h, v22.8h
add v17.8H, v17.8H, v22.8H add v17.8h, v17.8h, v22.8h
shrn v16.8B, v16.8H, #6 shrn v16.8b, v16.8h, #6
shrn v17.8B, v17.8H, #6 shrn v17.8b, v17.8h, #6
.endif .endif
.ifc \type,avg .ifc \type,avg
ld1 {v20.8B}, [x8], x2 ld1 {v20.8b}, [x8], x2
ld1 {v21.8B}, [x8], x2 ld1 {v21.8b}, [x8], x2
urhadd v16.8B, v16.8B, v20.8B urhadd v16.8b, v16.8b, v20.8b
urhadd v17.8B, v17.8B, v21.8B urhadd v17.8b, v17.8b, v21.8b
.endif .endif
st1 {v16.8B}, [x0], x2 st1 {v16.8b}, [x0], x2
st1 {v17.8B}, [x0], x2 st1 {v17.8b}, [x0], x2
b.gt 5b b.gt 5b
ret ret
endfunc endfunc
@@ -209,10 +209,10 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
lsl w10, w10, #1 lsl w10, w10, #1
add w9, w9, w10 add w9, w9, w10
add x6, x6, w9, UXTW add x6, x6, w9, UXTW
ld1r {v22.8H}, [x6] ld1r {v22.8h}, [x6]
.endif .endif
.ifc \codec,vc1 .ifc \codec,vc1
movi v22.8H, #28 movi v22.8h, #28
.endif .endif
mul w7, w4, w5 mul w7, w4, w5
lsl w14, w5, #3 lsl w14, w5, #3
@@ -225,133 +225,133 @@ function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
add w4, w4, #64 add w4, w4, #64
b.eq 2f b.eq 2f
dup v24.8B, w4 dup v24.8b, w4
dup v25.8B, w12 dup v25.8b, w12
ld1 {v4.8B}, [x1], x2 ld1 {v4.8b}, [x1], x2
dup v26.8B, w6 dup v26.8b, w6
dup v27.8B, w7 dup v27.8b, w7
ext v5.8B, v4.8B, v5.8B, #1 ext v5.8b, v4.8b, v5.8b, #1
trn1 v0.2S, v24.2S, v25.2S trn1 v0.2s, v24.2s, v25.2s
trn1 v2.2S, v26.2S, v27.2S trn1 v2.2s, v26.2s, v27.2s
trn1 v4.2S, v4.2S, v5.2S trn1 v4.2s, v4.2s, v5.2s
1: ld1 {v6.8B}, [x1], x2 1: ld1 {v6.8b}, [x1], x2
ext v7.8B, v6.8B, v7.8B, #1 ext v7.8b, v6.8b, v7.8b, #1
trn1 v6.2S, v6.2S, v7.2S trn1 v6.2s, v6.2s, v7.2s
umull v18.8H, v4.8B, v0.8B umull v18.8h, v4.8b, v0.8b
umlal v18.8H, v6.8B, v2.8B umlal v18.8h, v6.8b, v2.8b
ld1 {v4.8B}, [x1], x2 ld1 {v4.8b}, [x1], x2
ext v5.8B, v4.8B, v5.8B, #1 ext v5.8b, v4.8b, v5.8b, #1
trn1 v4.2S, v4.2S, v5.2S trn1 v4.2s, v4.2s, v5.2s
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
umull v19.8H, v6.8B, v0.8B umull v19.8h, v6.8b, v0.8b
umlal v19.8H, v4.8B, v2.8B umlal v19.8h, v4.8b, v2.8b
trn1 v30.2D, v18.2D, v19.2D trn1 v30.2d, v18.2d, v19.2d
trn2 v31.2D, v18.2D, v19.2D trn2 v31.2d, v18.2d, v19.2d
add v18.8H, v30.8H, v31.8H add v18.8h, v30.8h, v31.8h
.ifc \codec,h264 .ifc \codec,h264
rshrn v16.8B, v18.8H, #6 rshrn v16.8b, v18.8h, #6
.else .else
add v18.8H, v18.8H, v22.8H add v18.8h, v18.8h, v22.8h
shrn v16.8B, v18.8H, #6 shrn v16.8b, v18.8h, #6
.endif .endif
subs w3, w3, #2 subs w3, w3, #2
prfm pldl1strm, [x1, x2] prfm pldl1strm, [x1, x2]
.ifc \type,avg .ifc \type,avg
ld1 {v20.S}[0], [x8], x2 ld1 {v20.s}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2 ld1 {v20.s}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B urhadd v16.8b, v16.8b, v20.8b
.endif .endif
st1 {v16.S}[0], [x0], x2 st1 {v16.s}[0], [x0], x2
st1 {v16.S}[1], [x0], x2 st1 {v16.s}[1], [x0], x2
b.gt 1b b.gt 1b
ret ret
2: adds w12, w12, w6 2: adds w12, w12, w6
dup v30.8B, w4 dup v30.8b, w4
b.eq 5f b.eq 5f
tst w6, w6 tst w6, w6
dup v31.8B, w12 dup v31.8b, w12
trn1 v0.2S, v30.2S, v31.2S trn1 v0.2s, v30.2s, v31.2s
trn2 v1.2S, v30.2S, v31.2S trn2 v1.2s, v30.2s, v31.2s
b.eq 4f b.eq 4f
ext v1.8B, v0.8B, v1.8B, #4 ext v1.8b, v0.8b, v1.8b, #4
ld1 {v4.S}[0], [x1], x2 ld1 {v4.s}[0], [x1], x2
3: ld1 {v4.S}[1], [x1], x2 3: ld1 {v4.s}[1], [x1], x2
umull v18.8H, v4.8B, v0.8B umull v18.8h, v4.8b, v0.8b
ld1 {v4.S}[0], [x1], x2 ld1 {v4.s}[0], [x1], x2
umull v19.8H, v4.8B, v1.8B umull v19.8h, v4.8b, v1.8b
trn1 v30.2D, v18.2D, v19.2D trn1 v30.2d, v18.2d, v19.2d
trn2 v31.2D, v18.2D, v19.2D trn2 v31.2d, v18.2d, v19.2d
add v18.8H, v30.8H, v31.8H add v18.8h, v30.8h, v31.8h
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
.ifc \codec,h264 .ifc \codec,h264
rshrn v16.8B, v18.8H, #6 rshrn v16.8b, v18.8h, #6
.else .else
add v18.8H, v18.8H, v22.8H add v18.8h, v18.8h, v22.8h
shrn v16.8B, v18.8H, #6 shrn v16.8b, v18.8h, #6
.endif .endif
.ifc \type,avg .ifc \type,avg
ld1 {v20.S}[0], [x8], x2 ld1 {v20.s}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2 ld1 {v20.s}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B urhadd v16.8b, v16.8b, v20.8b
.endif .endif
subs w3, w3, #2 subs w3, w3, #2
prfm pldl1strm, [x1, x2] prfm pldl1strm, [x1, x2]
st1 {v16.S}[0], [x0], x2 st1 {v16.s}[0], [x0], x2
st1 {v16.S}[1], [x0], x2 st1 {v16.s}[1], [x0], x2
b.gt 3b b.gt 3b
ret ret
4: ld1 {v4.8B}, [x1], x2 4: ld1 {v4.8b}, [x1], x2
ld1 {v6.8B}, [x1], x2 ld1 {v6.8b}, [x1], x2
ext v5.8B, v4.8B, v5.8B, #1 ext v5.8b, v4.8b, v5.8b, #1
ext v7.8B, v6.8B, v7.8B, #1 ext v7.8b, v6.8b, v7.8b, #1
trn1 v4.2S, v4.2S, v5.2S trn1 v4.2s, v4.2s, v5.2s
trn1 v6.2S, v6.2S, v7.2S trn1 v6.2s, v6.2s, v7.2s
umull v18.8H, v4.8B, v0.8B umull v18.8h, v4.8b, v0.8b
umull v19.8H, v6.8B, v0.8B umull v19.8h, v6.8b, v0.8b
subs w3, w3, #2 subs w3, w3, #2
trn1 v30.2D, v18.2D, v19.2D trn1 v30.2d, v18.2d, v19.2d
trn2 v31.2D, v18.2D, v19.2D trn2 v31.2d, v18.2d, v19.2d
add v18.8H, v30.8H, v31.8H add v18.8h, v30.8h, v31.8h
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
.ifc \codec,h264 .ifc \codec,h264
rshrn v16.8B, v18.8H, #6 rshrn v16.8b, v18.8h, #6
.else .else
add v18.8H, v18.8H, v22.8H add v18.8h, v18.8h, v22.8h
shrn v16.8B, v18.8H, #6 shrn v16.8b, v18.8h, #6
.endif .endif
.ifc \type,avg .ifc \type,avg
ld1 {v20.S}[0], [x8], x2 ld1 {v20.s}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2 ld1 {v20.s}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B urhadd v16.8b, v16.8b, v20.8b
.endif .endif
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
st1 {v16.S}[0], [x0], x2 st1 {v16.s}[0], [x0], x2
st1 {v16.S}[1], [x0], x2 st1 {v16.s}[1], [x0], x2
b.gt 4b b.gt 4b
ret ret
5: ld1 {v4.S}[0], [x1], x2 5: ld1 {v4.s}[0], [x1], x2
ld1 {v4.S}[1], [x1], x2 ld1 {v4.s}[1], [x1], x2
umull v18.8H, v4.8B, v30.8B umull v18.8h, v4.8b, v30.8b
subs w3, w3, #2 subs w3, w3, #2
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
.ifc \codec,h264 .ifc \codec,h264
rshrn v16.8B, v18.8H, #6 rshrn v16.8b, v18.8h, #6
.else .else
add v18.8H, v18.8H, v22.8H add v18.8h, v18.8h, v22.8h
shrn v16.8B, v18.8H, #6 shrn v16.8b, v18.8h, #6
.endif .endif
.ifc \type,avg .ifc \type,avg
ld1 {v20.S}[0], [x8], x2 ld1 {v20.s}[0], [x8], x2
ld1 {v20.S}[1], [x8], x2 ld1 {v20.s}[1], [x8], x2
urhadd v16.8B, v16.8B, v20.8B urhadd v16.8b, v16.8b, v20.8b
.endif .endif
prfm pldl1strm, [x1] prfm pldl1strm, [x1]
st1 {v16.S}[0], [x0], x2 st1 {v16.s}[0], [x0], x2
st1 {v16.S}[1], [x0], x2 st1 {v16.s}[1], [x0], x2
b.gt 5b b.gt 5b
ret ret
endfunc endfunc
@@ -372,51 +372,51 @@ function ff_\type\()_h264_chroma_mc2_neon, export=1
sub w4, w7, w13 sub w4, w7, w13
sub w4, w4, w14 sub w4, w4, w14
add w4, w4, #64 add w4, w4, #64
dup v0.8B, w4 dup v0.8b, w4
dup v2.8B, w12 dup v2.8b, w12
dup v1.8B, w6 dup v1.8b, w6
dup v3.8B, w7 dup v3.8b, w7
trn1 v0.4H, v0.4H, v2.4H trn1 v0.4h, v0.4h, v2.4h
trn1 v1.4H, v1.4H, v3.4H trn1 v1.4h, v1.4h, v3.4h
1: 1:
ld1 {v4.S}[0], [x1], x2 ld1 {v4.s}[0], [x1], x2
ld1 {v4.S}[1], [x1], x2 ld1 {v4.s}[1], [x1], x2
rev64 v5.2S, v4.2S rev64 v5.2s, v4.2s
ld1 {v5.S}[1], [x1] ld1 {v5.s}[1], [x1]
ext v6.8B, v4.8B, v5.8B, #1 ext v6.8b, v4.8b, v5.8b, #1
ext v7.8B, v5.8B, v4.8B, #1 ext v7.8b, v5.8b, v4.8b, #1
trn1 v4.4H, v4.4H, v6.4H trn1 v4.4h, v4.4h, v6.4h
trn1 v5.4H, v5.4H, v7.4H trn1 v5.4h, v5.4h, v7.4h
umull v16.8H, v4.8B, v0.8B umull v16.8h, v4.8b, v0.8b
umlal v16.8H, v5.8B, v1.8B umlal v16.8h, v5.8b, v1.8b
.ifc \type,avg .ifc \type,avg
ld1 {v18.H}[0], [x0], x2 ld1 {v18.h}[0], [x0], x2
ld1 {v18.H}[2], [x0] ld1 {v18.h}[2], [x0]
sub x0, x0, x2 sub x0, x0, x2
.endif .endif
rev64 v17.4S, v16.4S rev64 v17.4s, v16.4s
add v16.8H, v16.8H, v17.8H add v16.8h, v16.8h, v17.8h
rshrn v16.8B, v16.8H, #6 rshrn v16.8b, v16.8h, #6
.ifc \type,avg .ifc \type,avg
urhadd v16.8B, v16.8B, v18.8B urhadd v16.8b, v16.8b, v18.8b
.endif .endif
st1 {v16.H}[0], [x0], x2 st1 {v16.h}[0], [x0], x2
st1 {v16.H}[2], [x0], x2 st1 {v16.h}[2], [x0], x2
subs w3, w3, #2 subs w3, w3, #2
b.gt 1b b.gt 1b
ret ret
2: 2:
ld1 {v16.H}[0], [x1], x2 ld1 {v16.h}[0], [x1], x2
ld1 {v16.H}[1], [x1], x2 ld1 {v16.h}[1], [x1], x2
.ifc \type,avg .ifc \type,avg
ld1 {v18.H}[0], [x0], x2 ld1 {v18.h}[0], [x0], x2
ld1 {v18.H}[1], [x0] ld1 {v18.h}[1], [x0]
sub x0, x0, x2 sub x0, x0, x2
urhadd v16.8B, v16.8B, v18.8B urhadd v16.8b, v16.8b, v18.8b
.endif .endif
st1 {v16.H}[0], [x0], x2 st1 {v16.h}[0], [x0], x2
st1 {v16.H}[1], [x0], x2 st1 {v16.h}[1], [x0], x2
subs w3, w3, #2 subs w3, w3, #2
b.gt 2b b.gt 2b
ret ret

@@ -27,7 +27,7 @@
cmp w2, #0 cmp w2, #0
ldr w6, [x4] ldr w6, [x4]
ccmp w3, #0, #0, ne ccmp w3, #0, #0, ne
mov v24.S[0], w6 mov v24.s[0], w6
and w8, w6, w6, lsl #16 and w8, w6, w6, lsl #16
b.eq 1f b.eq 1f
ands w8, w8, w8, lsl #8 ands w8, w8, w8, lsl #8
@@ -38,95 +38,95 @@
.endm .endm
.macro h264_loop_filter_luma .macro h264_loop_filter_luma
dup v22.16B, w2 // alpha dup v22.16b, w2 // alpha
uxtl v24.8H, v24.8B uxtl v24.8h, v24.8b
uabd v21.16B, v16.16B, v0.16B // abs(p0 - q0) uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
uxtl v24.4S, v24.4H uxtl v24.4s, v24.4h
uabd v28.16B, v18.16B, v16.16B // abs(p1 - p0) uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
sli v24.8H, v24.8H, #8 sli v24.8h, v24.8h, #8
uabd v30.16B, v2.16B, v0.16B // abs(q1 - q0) uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
sli v24.4S, v24.4S, #16 sli v24.4s, v24.4s, #16
cmhi v21.16B, v22.16B, v21.16B // < alpha cmhi v21.16b, v22.16b, v21.16b // < alpha
dup v22.16B, w3 // beta dup v22.16b, w3 // beta
cmlt v23.16B, v24.16B, #0 cmlt v23.16b, v24.16b, #0
cmhi v28.16B, v22.16B, v28.16B // < beta cmhi v28.16b, v22.16b, v28.16b // < beta
cmhi v30.16B, v22.16B, v30.16B // < beta cmhi v30.16b, v22.16b, v30.16b // < beta
bic v21.16B, v21.16B, v23.16B bic v21.16b, v21.16b, v23.16b
uabd v17.16B, v20.16B, v16.16B // abs(p2 - p0) uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
and v21.16B, v21.16B, v28.16B and v21.16b, v21.16b, v28.16b
uabd v19.16B, v4.16B, v0.16B // abs(q2 - q0) uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
and v21.16B, v21.16B, v30.16B // < beta and v21.16b, v21.16b, v30.16b // < beta
shrn v30.8b, v21.8h, #4 shrn v30.8b, v21.8h, #4
mov x7, v30.d[0] mov x7, v30.d[0]
cmhi v17.16B, v22.16B, v17.16B // < beta cmhi v17.16b, v22.16b, v17.16b // < beta
cmhi v19.16B, v22.16B, v19.16B // < beta cmhi v19.16b, v22.16b, v19.16b // < beta
cbz x7, 9f cbz x7, 9f
and v17.16B, v17.16B, v21.16B and v17.16b, v17.16b, v21.16b
and v19.16B, v19.16B, v21.16B and v19.16b, v19.16b, v21.16b
and v24.16B, v24.16B, v21.16B and v24.16b, v24.16b, v21.16b
urhadd v28.16B, v16.16B, v0.16B urhadd v28.16b, v16.16b, v0.16b
sub v21.16B, v24.16B, v17.16B sub v21.16b, v24.16b, v17.16b
uqadd v23.16B, v18.16B, v24.16B uqadd v23.16b, v18.16b, v24.16b
uhadd v20.16B, v20.16B, v28.16B uhadd v20.16b, v20.16b, v28.16b
sub v21.16B, v21.16B, v19.16B sub v21.16b, v21.16b, v19.16b
uhadd v28.16B, v4.16B, v28.16B uhadd v28.16b, v4.16b, v28.16b
umin v23.16B, v23.16B, v20.16B umin v23.16b, v23.16b, v20.16b
uqsub v22.16B, v18.16B, v24.16B uqsub v22.16b, v18.16b, v24.16b
uqadd v4.16B, v2.16B, v24.16B uqadd v4.16b, v2.16b, v24.16b
umax v23.16B, v23.16B, v22.16B umax v23.16b, v23.16b, v22.16b
uqsub v22.16B, v2.16B, v24.16B uqsub v22.16b, v2.16b, v24.16b
umin v28.16B, v4.16B, v28.16B umin v28.16b, v4.16b, v28.16b
uxtl v4.8H, v0.8B uxtl v4.8h, v0.8b
umax v28.16B, v28.16B, v22.16B umax v28.16b, v28.16b, v22.16b
uxtl2 v20.8H, v0.16B uxtl2 v20.8h, v0.16b
usubw v4.8H, v4.8H, v16.8B usubw v4.8h, v4.8h, v16.8b
usubw2 v20.8H, v20.8H, v16.16B usubw2 v20.8h, v20.8h, v16.16b
shl v4.8H, v4.8H, #2 shl v4.8h, v4.8h, #2
shl v20.8H, v20.8H, #2 shl v20.8h, v20.8h, #2
uaddw v4.8H, v4.8H, v18.8B uaddw v4.8h, v4.8h, v18.8b
uaddw2 v20.8H, v20.8H, v18.16B uaddw2 v20.8h, v20.8h, v18.16b
usubw v4.8H, v4.8H, v2.8B usubw v4.8h, v4.8h, v2.8b
usubw2 v20.8H, v20.8H, v2.16B usubw2 v20.8h, v20.8h, v2.16b
rshrn v4.8B, v4.8H, #3 rshrn v4.8b, v4.8h, #3
rshrn2 v4.16B, v20.8H, #3 rshrn2 v4.16b, v20.8h, #3
bsl v17.16B, v23.16B, v18.16B bsl v17.16b, v23.16b, v18.16b
bsl v19.16B, v28.16B, v2.16B bsl v19.16b, v28.16b, v2.16b
neg v23.16B, v21.16B neg v23.16b, v21.16b
uxtl v28.8H, v16.8B uxtl v28.8h, v16.8b
smin v4.16B, v4.16B, v21.16B smin v4.16b, v4.16b, v21.16b
uxtl2 v21.8H, v16.16B uxtl2 v21.8h, v16.16b
smax v4.16B, v4.16B, v23.16B smax v4.16b, v4.16b, v23.16b
uxtl v22.8H, v0.8B uxtl v22.8h, v0.8b
uxtl2 v24.8H, v0.16B uxtl2 v24.8h, v0.16b
saddw v28.8H, v28.8H, v4.8B saddw v28.8h, v28.8h, v4.8b
saddw2 v21.8H, v21.8H, v4.16B saddw2 v21.8h, v21.8h, v4.16b
ssubw v22.8H, v22.8H, v4.8B ssubw v22.8h, v22.8h, v4.8b
ssubw2 v24.8H, v24.8H, v4.16B ssubw2 v24.8h, v24.8h, v4.16b
sqxtun v16.8B, v28.8H sqxtun v16.8b, v28.8h
sqxtun2 v16.16B, v21.8H sqxtun2 v16.16b, v21.8h
sqxtun v0.8B, v22.8H sqxtun v0.8b, v22.8h
sqxtun2 v0.16B, v24.8H sqxtun2 v0.16b, v24.8h
.endm .endm
function ff_h264_v_loop_filter_luma_neon, export=1 function ff_h264_v_loop_filter_luma_neon, export=1
h264_loop_filter_start h264_loop_filter_start
ld1 {v0.16B}, [x0], x1 ld1 {v0.16b}, [x0], x1
ld1 {v2.16B}, [x0], x1 ld1 {v2.16b}, [x0], x1
ld1 {v4.16B}, [x0], x1 ld1 {v4.16b}, [x0], x1
sub x0, x0, x1, lsl #2 sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1 sub x0, x0, x1, lsl #1
ld1 {v20.16B}, [x0], x1 ld1 {v20.16b}, [x0], x1
ld1 {v18.16B}, [x0], x1 ld1 {v18.16b}, [x0], x1
ld1 {v16.16B}, [x0], x1 ld1 {v16.16b}, [x0], x1
h264_loop_filter_luma h264_loop_filter_luma
sub x0, x0, x1, lsl #1 sub x0, x0, x1, lsl #1
st1 {v17.16B}, [x0], x1 st1 {v17.16b}, [x0], x1
st1 {v16.16B}, [x0], x1 st1 {v16.16b}, [x0], x1
st1 {v0.16B}, [x0], x1 st1 {v0.16b}, [x0], x1
st1 {v19.16B}, [x0] st1 {v19.16b}, [x0]
9: 9:
ret ret
endfunc endfunc
@@ -135,22 +135,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1
h264_loop_filter_start h264_loop_filter_start
sub x0, x0, #4 sub x0, x0, #4
ld1 {v6.8B}, [x0], x1 ld1 {v6.8b}, [x0], x1
ld1 {v20.8B}, [x0], x1 ld1 {v20.8b}, [x0], x1
ld1 {v18.8B}, [x0], x1 ld1 {v18.8b}, [x0], x1
ld1 {v16.8B}, [x0], x1 ld1 {v16.8b}, [x0], x1
ld1 {v0.8B}, [x0], x1 ld1 {v0.8b}, [x0], x1
ld1 {v2.8B}, [x0], x1 ld1 {v2.8b}, [x0], x1
ld1 {v4.8B}, [x0], x1 ld1 {v4.8b}, [x0], x1
ld1 {v26.8B}, [x0], x1 ld1 {v26.8b}, [x0], x1
ld1 {v6.D}[1], [x0], x1 ld1 {v6.d}[1], [x0], x1
ld1 {v20.D}[1], [x0], x1 ld1 {v20.d}[1], [x0], x1
ld1 {v18.D}[1], [x0], x1 ld1 {v18.d}[1], [x0], x1
ld1 {v16.D}[1], [x0], x1 ld1 {v16.d}[1], [x0], x1
ld1 {v0.D}[1], [x0], x1 ld1 {v0.d}[1], [x0], x1
ld1 {v2.D}[1], [x0], x1 ld1 {v2.d}[1], [x0], x1
ld1 {v4.D}[1], [x0], x1 ld1 {v4.d}[1], [x0], x1
ld1 {v26.D}[1], [x0], x1 ld1 {v26.d}[1], [x0], x1
transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23 transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
@@ -160,22 +160,22 @@ function ff_h264_h_loop_filter_luma_neon, export=1
sub x0, x0, x1, lsl #4 sub x0, x0, x1, lsl #4
add x0, x0, #2 add x0, x0, #2
st1 {v17.S}[0], [x0], x1 st1 {v17.s}[0], [x0], x1
st1 {v16.S}[0], [x0], x1 st1 {v16.s}[0], [x0], x1
st1 {v0.S}[0], [x0], x1 st1 {v0.s}[0], [x0], x1
st1 {v19.S}[0], [x0], x1 st1 {v19.s}[0], [x0], x1
st1 {v17.S}[1], [x0], x1 st1 {v17.s}[1], [x0], x1
st1 {v16.S}[1], [x0], x1 st1 {v16.s}[1], [x0], x1
st1 {v0.S}[1], [x0], x1 st1 {v0.s}[1], [x0], x1
st1 {v19.S}[1], [x0], x1 st1 {v19.s}[1], [x0], x1
st1 {v17.S}[2], [x0], x1 st1 {v17.s}[2], [x0], x1
st1 {v16.S}[2], [x0], x1 st1 {v16.s}[2], [x0], x1
st1 {v0.S}[2], [x0], x1 st1 {v0.s}[2], [x0], x1
st1 {v19.S}[2], [x0], x1 st1 {v19.s}[2], [x0], x1
st1 {v17.S}[3], [x0], x1 st1 {v17.s}[3], [x0], x1
st1 {v16.S}[3], [x0], x1 st1 {v16.s}[3], [x0], x1
st1 {v0.S}[3], [x0], x1 st1 {v0.s}[3], [x0], x1
st1 {v19.S}[3], [x0], x1 st1 {v19.s}[3], [x0], x1
9: 9:
ret ret
endfunc endfunc
@@ -377,52 +377,52 @@ function ff_h264_h_loop_filter_luma_intra_neon, export=1
endfunc endfunc
.macro h264_loop_filter_chroma .macro h264_loop_filter_chroma
dup v22.8B, w2 // alpha dup v22.8b, w2 // alpha
dup v23.8B, w3 // beta dup v23.8b, w3 // beta
uxtl v24.8H, v24.8B uxtl v24.8h, v24.8b
uabd v26.8B, v16.8B, v0.8B // abs(p0 - q0) uabd v26.8b, v16.8b, v0.8b // abs(p0 - q0)
uabd v28.8B, v18.8B, v16.8B // abs(p1 - p0) uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
uabd v30.8B, v2.8B, v0.8B // abs(q1 - q0) uabd v30.8b, v2.8b, v0.8b // abs(q1 - q0)
cmhi v26.8B, v22.8B, v26.8B // < alpha cmhi v26.8b, v22.8b, v26.8b // < alpha
cmhi v28.8B, v23.8B, v28.8B // < beta cmhi v28.8b, v23.8b, v28.8b // < beta
cmhi v30.8B, v23.8B, v30.8B // < beta cmhi v30.8b, v23.8b, v30.8b // < beta
uxtl v4.8H, v0.8B uxtl v4.8h, v0.8b
and v26.8B, v26.8B, v28.8B and v26.8b, v26.8b, v28.8b
usubw v4.8H, v4.8H, v16.8B usubw v4.8h, v4.8h, v16.8b
and v26.8B, v26.8B, v30.8B and v26.8b, v26.8b, v30.8b
shl v4.8H, v4.8H, #2 shl v4.8h, v4.8h, #2
mov x8, v26.d[0] mov x8, v26.d[0]
sli v24.8H, v24.8H, #8 sli v24.8h, v24.8h, #8
uaddw v4.8H, v4.8H, v18.8B uaddw v4.8h, v4.8h, v18.8b
cbz x8, 9f cbz x8, 9f
usubw v4.8H, v4.8H, v2.8B usubw v4.8h, v4.8h, v2.8b
rshrn v4.8B, v4.8H, #3 rshrn v4.8b, v4.8h, #3
smin v4.8B, v4.8B, v24.8B smin v4.8b, v4.8b, v24.8b
neg v25.8B, v24.8B neg v25.8b, v24.8b
smax v4.8B, v4.8B, v25.8B smax v4.8b, v4.8b, v25.8b
uxtl v22.8H, v0.8B uxtl v22.8h, v0.8b
and v4.8B, v4.8B, v26.8B and v4.8b, v4.8b, v26.8b
uxtl v28.8H, v16.8B uxtl v28.8h, v16.8b
saddw v28.8H, v28.8H, v4.8B saddw v28.8h, v28.8h, v4.8b
ssubw v22.8H, v22.8H, v4.8B ssubw v22.8h, v22.8h, v4.8b
sqxtun v16.8B, v28.8H sqxtun v16.8b, v28.8h
sqxtun v0.8B, v22.8H sqxtun v0.8b, v22.8h
.endm .endm
function ff_h264_v_loop_filter_chroma_neon, export=1 function ff_h264_v_loop_filter_chroma_neon, export=1
h264_loop_filter_start h264_loop_filter_start
sub x0, x0, x1, lsl #1 sub x0, x0, x1, lsl #1
ld1 {v18.8B}, [x0], x1 ld1 {v18.8b}, [x0], x1
ld1 {v16.8B}, [x0], x1 ld1 {v16.8b}, [x0], x1
ld1 {v0.8B}, [x0], x1 ld1 {v0.8b}, [x0], x1
ld1 {v2.8B}, [x0] ld1 {v2.8b}, [x0]
h264_loop_filter_chroma h264_loop_filter_chroma
sub x0, x0, x1, lsl #1 sub x0, x0, x1, lsl #1
st1 {v16.8B}, [x0], x1 st1 {v16.8b}, [x0], x1
st1 {v0.8B}, [x0], x1 st1 {v0.8b}, [x0], x1
9: 9:
ret ret
endfunc endfunc
@@ -432,14 +432,14 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
sub x0, x0, #2 sub x0, x0, #2
h_loop_filter_chroma420: h_loop_filter_chroma420:
ld1 {v18.S}[0], [x0], x1 ld1 {v18.s}[0], [x0], x1
ld1 {v16.S}[0], [x0], x1 ld1 {v16.s}[0], [x0], x1
ld1 {v0.S}[0], [x0], x1 ld1 {v0.s}[0], [x0], x1
ld1 {v2.S}[0], [x0], x1 ld1 {v2.s}[0], [x0], x1
ld1 {v18.S}[1], [x0], x1 ld1 {v18.s}[1], [x0], x1
ld1 {v16.S}[1], [x0], x1 ld1 {v16.s}[1], [x0], x1
ld1 {v0.S}[1], [x0], x1 ld1 {v0.s}[1], [x0], x1
ld1 {v2.S}[1], [x0], x1 ld1 {v2.s}[1], [x0], x1
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
@@ -448,14 +448,14 @@ h_loop_filter_chroma420:
transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31 transpose_4x8B v18, v16, v0, v2, v28, v29, v30, v31
sub x0, x0, x1, lsl #3 sub x0, x0, x1, lsl #3
st1 {v18.S}[0], [x0], x1 st1 {v18.s}[0], [x0], x1
st1 {v16.S}[0], [x0], x1 st1 {v16.s}[0], [x0], x1
st1 {v0.S}[0], [x0], x1 st1 {v0.s}[0], [x0], x1
st1 {v2.S}[0], [x0], x1 st1 {v2.s}[0], [x0], x1
st1 {v18.S}[1], [x0], x1 st1 {v18.s}[1], [x0], x1
st1 {v16.S}[1], [x0], x1 st1 {v16.s}[1], [x0], x1
st1 {v0.S}[1], [x0], x1 st1 {v0.s}[1], [x0], x1
st1 {v2.S}[1], [x0], x1 st1 {v2.s}[1], [x0], x1
9: 9:
ret ret
endfunc endfunc
@@ -584,102 +584,102 @@ function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
endfunc endfunc
.macro biweight_16 macs, macd .macro biweight_16 macs, macd
dup v0.16B, w5 dup v0.16b, w5
dup v1.16B, w6 dup v1.16b, w6
mov v4.16B, v16.16B mov v4.16b, v16.16b
mov v6.16B, v16.16B mov v6.16b, v16.16b
1: subs w3, w3, #2 1: subs w3, w3, #2
ld1 {v20.16B}, [x0], x2 ld1 {v20.16b}, [x0], x2
\macd v4.8H, v0.8B, v20.8B \macd v4.8h, v0.8b, v20.8b
\macd\()2 v6.8H, v0.16B, v20.16B \macd\()2 v6.8H, v0.16B, v20.16B
ld1 {v22.16B}, [x1], x2 ld1 {v22.16b}, [x1], x2
\macs v4.8H, v1.8B, v22.8B \macs v4.8h, v1.8b, v22.8b
\macs\()2 v6.8H, v1.16B, v22.16B \macs\()2 v6.8H, v1.16B, v22.16B
mov v24.16B, v16.16B mov v24.16b, v16.16b
ld1 {v28.16B}, [x0], x2 ld1 {v28.16b}, [x0], x2
mov v26.16B, v16.16B mov v26.16b, v16.16b
\macd v24.8H, v0.8B, v28.8B \macd v24.8h, v0.8b, v28.8b
\macd\()2 v26.8H, v0.16B, v28.16B \macd\()2 v26.8H, v0.16B, v28.16B
ld1 {v30.16B}, [x1], x2 ld1 {v30.16b}, [x1], x2
\macs v24.8H, v1.8B, v30.8B \macs v24.8h, v1.8b, v30.8b
\macs\()2 v26.8H, v1.16B, v30.16B \macs\()2 v26.8H, v1.16B, v30.16B
sshl v4.8H, v4.8H, v18.8H sshl v4.8h, v4.8h, v18.8h
sshl v6.8H, v6.8H, v18.8H sshl v6.8h, v6.8h, v18.8h
sqxtun v4.8B, v4.8H sqxtun v4.8b, v4.8h
sqxtun2 v4.16B, v6.8H sqxtun2 v4.16b, v6.8h
sshl v24.8H, v24.8H, v18.8H sshl v24.8h, v24.8h, v18.8h
sshl v26.8H, v26.8H, v18.8H sshl v26.8h, v26.8h, v18.8h
sqxtun v24.8B, v24.8H sqxtun v24.8b, v24.8h
sqxtun2 v24.16B, v26.8H sqxtun2 v24.16b, v26.8h
mov v6.16B, v16.16B mov v6.16b, v16.16b
st1 {v4.16B}, [x7], x2 st1 {v4.16b}, [x7], x2
mov v4.16B, v16.16B mov v4.16b, v16.16b
st1 {v24.16B}, [x7], x2 st1 {v24.16b}, [x7], x2
b.ne 1b b.ne 1b
ret ret
.endm .endm
.macro biweight_8 macs, macd .macro biweight_8 macs, macd
dup v0.8B, w5 dup v0.8b, w5
dup v1.8B, w6 dup v1.8b, w6
mov v2.16B, v16.16B mov v2.16b, v16.16b
mov v20.16B, v16.16B mov v20.16b, v16.16b
1: subs w3, w3, #2 1: subs w3, w3, #2
ld1 {v4.8B}, [x0], x2 ld1 {v4.8b}, [x0], x2
\macd v2.8H, v0.8B, v4.8B \macd v2.8h, v0.8b, v4.8b
ld1 {v5.8B}, [x1], x2 ld1 {v5.8b}, [x1], x2
\macs v2.8H, v1.8B, v5.8B \macs v2.8h, v1.8b, v5.8b
ld1 {v6.8B}, [x0], x2 ld1 {v6.8b}, [x0], x2
\macd v20.8H, v0.8B, v6.8B \macd v20.8h, v0.8b, v6.8b
ld1 {v7.8B}, [x1], x2 ld1 {v7.8b}, [x1], x2
\macs v20.8H, v1.8B, v7.8B \macs v20.8h, v1.8b, v7.8b
sshl v2.8H, v2.8H, v18.8H sshl v2.8h, v2.8h, v18.8h
sqxtun v2.8B, v2.8H sqxtun v2.8b, v2.8h
sshl v20.8H, v20.8H, v18.8H sshl v20.8h, v20.8h, v18.8h
sqxtun v4.8B, v20.8H sqxtun v4.8b, v20.8h
mov v20.16B, v16.16B mov v20.16b, v16.16b
st1 {v2.8B}, [x7], x2 st1 {v2.8b}, [x7], x2
mov v2.16B, v16.16B mov v2.16b, v16.16b
st1 {v4.8B}, [x7], x2 st1 {v4.8b}, [x7], x2
b.ne 1b b.ne 1b
ret ret
.endm .endm
.macro biweight_4 macs, macd .macro biweight_4 macs, macd
dup v0.8B, w5 dup v0.8b, w5
dup v1.8B, w6 dup v1.8b, w6
mov v2.16B, v16.16B mov v2.16b, v16.16b
mov v20.16B,v16.16B mov v20.16b,v16.16b
1: subs w3, w3, #4 1: subs w3, w3, #4
ld1 {v4.S}[0], [x0], x2 ld1 {v4.s}[0], [x0], x2
ld1 {v4.S}[1], [x0], x2 ld1 {v4.s}[1], [x0], x2
\macd v2.8H, v0.8B, v4.8B \macd v2.8h, v0.8b, v4.8b
ld1 {v5.S}[0], [x1], x2 ld1 {v5.s}[0], [x1], x2
ld1 {v5.S}[1], [x1], x2 ld1 {v5.s}[1], [x1], x2
\macs v2.8H, v1.8B, v5.8B \macs v2.8h, v1.8b, v5.8b
b.lt 2f b.lt 2f
ld1 {v6.S}[0], [x0], x2 ld1 {v6.s}[0], [x0], x2
ld1 {v6.S}[1], [x0], x2 ld1 {v6.s}[1], [x0], x2
\macd v20.8H, v0.8B, v6.8B \macd v20.8h, v0.8b, v6.8b
ld1 {v7.S}[0], [x1], x2 ld1 {v7.s}[0], [x1], x2
ld1 {v7.S}[1], [x1], x2 ld1 {v7.s}[1], [x1], x2
\macs v20.8H, v1.8B, v7.8B \macs v20.8h, v1.8b, v7.8b
sshl v2.8H, v2.8H, v18.8H sshl v2.8h, v2.8h, v18.8h
sqxtun v2.8B, v2.8H sqxtun v2.8b, v2.8h
sshl v20.8H, v20.8H, v18.8H sshl v20.8h, v20.8h, v18.8h
sqxtun v4.8B, v20.8H sqxtun v4.8b, v20.8h
mov v20.16B, v16.16B mov v20.16b, v16.16b
st1 {v2.S}[0], [x7], x2 st1 {v2.s}[0], [x7], x2
st1 {v2.S}[1], [x7], x2 st1 {v2.s}[1], [x7], x2
mov v2.16B, v16.16B mov v2.16b, v16.16b
st1 {v4.S}[0], [x7], x2 st1 {v4.s}[0], [x7], x2
st1 {v4.S}[1], [x7], x2 st1 {v4.s}[1], [x7], x2
b.ne 1b b.ne 1b
ret ret
2: sshl v2.8H, v2.8H, v18.8H 2: sshl v2.8h, v2.8h, v18.8h
sqxtun v2.8B, v2.8H sqxtun v2.8b, v2.8h
st1 {v2.S}[0], [x7], x2 st1 {v2.s}[0], [x7], x2
st1 {v2.S}[1], [x7], x2 st1 {v2.s}[1], [x7], x2
ret ret
.endm .endm
@@ -689,10 +689,10 @@ function ff_biweight_h264_pixels_\w\()_neon, export=1
add w7, w7, #1 add w7, w7, #1
eor w8, w8, w6, lsr #30 eor w8, w8, w6, lsr #30
orr w7, w7, #1 orr w7, w7, #1
dup v18.8H, w4 dup v18.8h, w4
lsl w7, w7, w4 lsl w7, w7, w4
not v18.16B, v18.16B not v18.16b, v18.16b
dup v16.8H, w7 dup v16.8h, w7
mov x7, x0 mov x7, x0
cbz w8, 10f cbz w8, 10f
subs w8, w8, #1 subs w8, w8, #1
@@ -716,78 +716,78 @@ endfunc
biweight_func 4 biweight_func 4
.macro weight_16 add .macro weight_16 add
dup v0.16B, w4 dup v0.16b, w4
1: subs w2, w2, #2 1: subs w2, w2, #2
ld1 {v20.16B}, [x0], x1 ld1 {v20.16b}, [x0], x1
umull v4.8H, v0.8B, v20.8B umull v4.8h, v0.8b, v20.8b
umull2 v6.8H, v0.16B, v20.16B umull2 v6.8h, v0.16b, v20.16b
ld1 {v28.16B}, [x0], x1 ld1 {v28.16b}, [x0], x1
umull v24.8H, v0.8B, v28.8B umull v24.8h, v0.8b, v28.8b
umull2 v26.8H, v0.16B, v28.16B umull2 v26.8h, v0.16b, v28.16b
\add v4.8H, v16.8H, v4.8H \add v4.8h, v16.8h, v4.8h
srshl v4.8H, v4.8H, v18.8H srshl v4.8h, v4.8h, v18.8h
\add v6.8H, v16.8H, v6.8H \add v6.8h, v16.8h, v6.8h
srshl v6.8H, v6.8H, v18.8H srshl v6.8h, v6.8h, v18.8h
sqxtun v4.8B, v4.8H sqxtun v4.8b, v4.8h
sqxtun2 v4.16B, v6.8H sqxtun2 v4.16b, v6.8h
\add v24.8H, v16.8H, v24.8H \add v24.8h, v16.8h, v24.8h
srshl v24.8H, v24.8H, v18.8H srshl v24.8h, v24.8h, v18.8h
\add v26.8H, v16.8H, v26.8H \add v26.8h, v16.8h, v26.8h
srshl v26.8H, v26.8H, v18.8H srshl v26.8h, v26.8h, v18.8h
sqxtun v24.8B, v24.8H sqxtun v24.8b, v24.8h
sqxtun2 v24.16B, v26.8H sqxtun2 v24.16b, v26.8h
st1 {v4.16B}, [x5], x1 st1 {v4.16b}, [x5], x1
st1 {v24.16B}, [x5], x1 st1 {v24.16b}, [x5], x1
b.ne 1b b.ne 1b
ret ret
.endm .endm
.macro weight_8 add .macro weight_8 add
dup v0.8B, w4 dup v0.8b, w4
1: subs w2, w2, #2 1: subs w2, w2, #2
ld1 {v4.8B}, [x0], x1 ld1 {v4.8b}, [x0], x1
umull v2.8H, v0.8B, v4.8B umull v2.8h, v0.8b, v4.8b
ld1 {v6.8B}, [x0], x1 ld1 {v6.8b}, [x0], x1
umull v20.8H, v0.8B, v6.8B umull v20.8h, v0.8b, v6.8b
\add v2.8H, v16.8H, v2.8H \add v2.8h, v16.8h, v2.8h
srshl v2.8H, v2.8H, v18.8H srshl v2.8h, v2.8h, v18.8h
sqxtun v2.8B, v2.8H sqxtun v2.8b, v2.8h
\add v20.8H, v16.8H, v20.8H \add v20.8h, v16.8h, v20.8h
srshl v20.8H, v20.8H, v18.8H srshl v20.8h, v20.8h, v18.8h
sqxtun v4.8B, v20.8H sqxtun v4.8b, v20.8h
st1 {v2.8B}, [x5], x1 st1 {v2.8b}, [x5], x1
st1 {v4.8B}, [x5], x1 st1 {v4.8b}, [x5], x1
b.ne 1b b.ne 1b
ret ret
.endm .endm
.macro weight_4 add .macro weight_4 add
dup v0.8B, w4 dup v0.8b, w4
1: subs w2, w2, #4 1: subs w2, w2, #4
ld1 {v4.S}[0], [x0], x1 ld1 {v4.s}[0], [x0], x1
ld1 {v4.S}[1], [x0], x1 ld1 {v4.s}[1], [x0], x1
umull v2.8H, v0.8B, v4.8B umull v2.8h, v0.8b, v4.8b
b.lt 2f b.lt 2f
ld1 {v6.S}[0], [x0], x1 ld1 {v6.s}[0], [x0], x1
ld1 {v6.S}[1], [x0], x1 ld1 {v6.s}[1], [x0], x1
umull v20.8H, v0.8B, v6.8B umull v20.8h, v0.8b, v6.8b
\add v2.8H, v16.8H, v2.8H \add v2.8h, v16.8h, v2.8h
srshl v2.8H, v2.8H, v18.8H srshl v2.8h, v2.8h, v18.8h
sqxtun v2.8B, v2.8H sqxtun v2.8b, v2.8h
\add v20.8H, v16.8H, v20.8H \add v20.8h, v16.8h, v20.8h
srshl v20.8H, v20.8h, v18.8H srshl v20.8h, v20.8h, v18.8h
sqxtun v4.8B, v20.8H sqxtun v4.8b, v20.8h
st1 {v2.S}[0], [x5], x1 st1 {v2.s}[0], [x5], x1
st1 {v2.S}[1], [x5], x1 st1 {v2.s}[1], [x5], x1
st1 {v4.S}[0], [x5], x1 st1 {v4.s}[0], [x5], x1
st1 {v4.S}[1], [x5], x1 st1 {v4.s}[1], [x5], x1
b.ne 1b b.ne 1b
ret ret
2: \add v2.8H, v16.8H, v2.8H 2: \add v2.8h, v16.8h, v2.8h
srshl v2.8H, v2.8H, v18.8H srshl v2.8h, v2.8h, v18.8h
sqxtun v2.8B, v2.8H sqxtun v2.8b, v2.8h
st1 {v2.S}[0], [x5], x1 st1 {v2.s}[0], [x5], x1
st1 {v2.S}[1], [x5], x1 st1 {v2.s}[1], [x5], x1
ret ret
.endm .endm
@@ -796,18 +796,18 @@ function ff_weight_h264_pixels_\w\()_neon, export=1
cmp w3, #1 cmp w3, #1
mov w6, #1 mov w6, #1
lsl w5, w5, w3 lsl w5, w5, w3
dup v16.8H, w5 dup v16.8h, w5
mov x5, x0 mov x5, x0
b.le 20f b.le 20f
sub w6, w6, w3 sub w6, w6, w3
dup v18.8H, w6 dup v18.8h, w6
cmp w4, #0 cmp w4, #0
b.lt 10f b.lt 10f
weight_\w shadd weight_\w shadd
10: neg w4, w4 10: neg w4, w4
weight_\w shsub weight_\w shsub
20: neg w6, w3 20: neg w6, w3
dup v18.8H, w6 dup v18.8h, w6
cmp w4, #0 cmp w4, #0
b.lt 10f b.lt 10f
weight_\w add weight_\w add
@@ -825,7 +825,7 @@ endfunc
ldr w6, [x4] ldr w6, [x4]
ccmp w3, #0, #0, ne ccmp w3, #0, #0, ne
lsl w2, w2, #2 lsl w2, w2, #2
mov v24.S[0], w6 mov v24.s[0], w6
lsl w3, w3, #2 lsl w3, w3, #2
and w8, w6, w6, lsl #16 and w8, w6, w6, lsl #16
b.eq 1f b.eq 1f

@@ -25,54 +25,54 @@
function ff_h264_idct_add_neon, export=1 function ff_h264_idct_add_neon, export=1
.L_ff_h264_idct_add_neon: .L_ff_h264_idct_add_neon:
AARCH64_VALID_CALL_TARGET AARCH64_VALID_CALL_TARGET
ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1] ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]
sxtw x2, w2 sxtw x2, w2
movi v30.8H, #0 movi v30.8h, #0
add v4.4H, v0.4H, v2.4H add v4.4h, v0.4h, v2.4h
sshr v16.4H, v1.4H, #1 sshr v16.4h, v1.4h, #1
st1 {v30.8H}, [x1], #16 st1 {v30.8h}, [x1], #16
sshr v17.4H, v3.4H, #1 sshr v17.4h, v3.4h, #1
st1 {v30.8H}, [x1], #16 st1 {v30.8h}, [x1], #16
sub v5.4H, v0.4H, v2.4H sub v5.4h, v0.4h, v2.4h
sub v6.4H, v16.4H, v3.4H sub v6.4h, v16.4h, v3.4h
add v7.4H, v1.4H, v17.4H add v7.4h, v1.4h, v17.4h
add v0.4H, v4.4H, v7.4H add v0.4h, v4.4h, v7.4h
add v1.4H, v5.4H, v6.4H add v1.4h, v5.4h, v6.4h
sub v2.4H, v5.4H, v6.4H sub v2.4h, v5.4h, v6.4h
sub v3.4H, v4.4H, v7.4H sub v3.4h, v4.4h, v7.4h
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7 transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
add v4.4H, v0.4H, v2.4H add v4.4h, v0.4h, v2.4h
ld1 {v18.S}[0], [x0], x2 ld1 {v18.s}[0], [x0], x2
sshr v16.4H, v3.4H, #1 sshr v16.4h, v3.4h, #1
sshr v17.4H, v1.4H, #1 sshr v17.4h, v1.4h, #1
ld1 {v18.S}[1], [x0], x2 ld1 {v18.s}[1], [x0], x2
sub v5.4H, v0.4H, v2.4H sub v5.4h, v0.4h, v2.4h
ld1 {v19.S}[1], [x0], x2 ld1 {v19.s}[1], [x0], x2
add v6.4H, v16.4H, v1.4H add v6.4h, v16.4h, v1.4h
ins v4.D[1], v5.D[0] ins v4.d[1], v5.d[0]
sub v7.4H, v17.4H, v3.4H sub v7.4h, v17.4h, v3.4h
ld1 {v19.S}[0], [x0], x2 ld1 {v19.s}[0], [x0], x2
ins v6.D[1], v7.D[0] ins v6.d[1], v7.d[0]
sub x0, x0, x2, lsl #2 sub x0, x0, x2, lsl #2
add v0.8H, v4.8H, v6.8H add v0.8h, v4.8h, v6.8h
sub v1.8H, v4.8H, v6.8H sub v1.8h, v4.8h, v6.8h
srshr v0.8H, v0.8H, #6 srshr v0.8h, v0.8h, #6
srshr v1.8H, v1.8H, #6 srshr v1.8h, v1.8h, #6
uaddw v0.8H, v0.8H, v18.8B uaddw v0.8h, v0.8h, v18.8b
uaddw v1.8H, v1.8H, v19.8B uaddw v1.8h, v1.8h, v19.8b
sqxtun v0.8B, v0.8H sqxtun v0.8b, v0.8h
sqxtun v1.8B, v1.8H sqxtun v1.8b, v1.8h
st1 {v0.S}[0], [x0], x2 st1 {v0.s}[0], [x0], x2
st1 {v0.S}[1], [x0], x2 st1 {v0.s}[1], [x0], x2
st1 {v1.S}[1], [x0], x2 st1 {v1.s}[1], [x0], x2
st1 {v1.S}[0], [x0], x2 st1 {v1.s}[0], [x0], x2
sub x1, x1, #32 sub x1, x1, #32
ret ret
@@ -83,22 +83,22 @@ function ff_h264_idct_dc_add_neon, export=1
AARCH64_VALID_CALL_TARGET AARCH64_VALID_CALL_TARGET
sxtw x2, w2 sxtw x2, w2
mov w3, #0 mov w3, #0
ld1r {v2.8H}, [x1] ld1r {v2.8h}, [x1]
strh w3, [x1] strh w3, [x1]
srshr v2.8H, v2.8H, #6 srshr v2.8h, v2.8h, #6
ld1 {v0.S}[0], [x0], x2 ld1 {v0.s}[0], [x0], x2
ld1 {v0.S}[1], [x0], x2 ld1 {v0.s}[1], [x0], x2
uaddw v3.8H, v2.8H, v0.8B uaddw v3.8h, v2.8h, v0.8b
ld1 {v1.S}[0], [x0], x2 ld1 {v1.s}[0], [x0], x2
ld1 {v1.S}[1], [x0], x2 ld1 {v1.s}[1], [x0], x2
uaddw v4.8H, v2.8H, v1.8B uaddw v4.8h, v2.8h, v1.8b
sqxtun v0.8B, v3.8H sqxtun v0.8b, v3.8h
sqxtun v1.8B, v4.8H sqxtun v1.8b, v4.8h
sub x0, x0, x2, lsl #2 sub x0, x0, x2, lsl #2
st1 {v0.S}[0], [x0], x2 st1 {v0.s}[0], [x0], x2
st1 {v0.S}[1], [x0], x2 st1 {v0.s}[1], [x0], x2
st1 {v1.S}[0], [x0], x2 st1 {v1.s}[0], [x0], x2
st1 {v1.S}[1], [x0], x2 st1 {v1.s}[1], [x0], x2
ret ret
endfunc endfunc
@@ -194,71 +194,71 @@ endfunc
.if \pass == 0 .if \pass == 0
va .req v18 va .req v18
vb .req v30 vb .req v30
sshr v18.8H, v26.8H, #1 sshr v18.8h, v26.8h, #1
add v16.8H, v24.8H, v28.8H add v16.8h, v24.8h, v28.8h
ld1 {v30.8H, v31.8H}, [x1] ld1 {v30.8h, v31.8h}, [x1]
st1 {v19.8H}, [x1], #16 st1 {v19.8h}, [x1], #16
st1 {v19.8H}, [x1], #16 st1 {v19.8h}, [x1], #16
sub v17.8H, v24.8H, v28.8H sub v17.8h, v24.8h, v28.8h
sshr v19.8H, v30.8H, #1 sshr v19.8h, v30.8h, #1
sub v18.8H, v18.8H, v30.8H sub v18.8h, v18.8h, v30.8h
add v19.8H, v19.8H, v26.8H add v19.8h, v19.8h, v26.8h
.else .else
va .req v30 va .req v30
vb .req v18 vb .req v18
sshr v30.8H, v26.8H, #1 sshr v30.8h, v26.8h, #1
sshr v19.8H, v18.8H, #1 sshr v19.8h, v18.8h, #1
add v16.8H, v24.8H, v28.8H add v16.8h, v24.8h, v28.8h
sub v17.8H, v24.8H, v28.8H sub v17.8h, v24.8h, v28.8h
sub v30.8H, v30.8H, v18.8H sub v30.8h, v30.8h, v18.8h
add v19.8H, v19.8H, v26.8H add v19.8h, v19.8h, v26.8h
.endif .endif
add v26.8H, v17.8H, va.8H add v26.8h, v17.8h, va.8h
sub v28.8H, v17.8H, va.8H sub v28.8h, v17.8h, va.8h
add v24.8H, v16.8H, v19.8H add v24.8h, v16.8h, v19.8h
sub vb.8H, v16.8H, v19.8H sub vb.8h, v16.8h, v19.8h
sub v16.8H, v29.8H, v27.8H sub v16.8h, v29.8h, v27.8h
add v17.8H, v31.8H, v25.8H add v17.8h, v31.8h, v25.8h
sub va.8H, v31.8H, v25.8H sub va.8h, v31.8h, v25.8h
add v19.8H, v29.8H, v27.8H add v19.8h, v29.8h, v27.8h
sub v16.8H, v16.8H, v31.8H sub v16.8h, v16.8h, v31.8h
sub v17.8H, v17.8H, v27.8H sub v17.8h, v17.8h, v27.8h
add va.8H, va.8H, v29.8H add va.8h, va.8h, v29.8h
add v19.8H, v19.8H, v25.8H add v19.8h, v19.8h, v25.8h
sshr v25.8H, v25.8H, #1 sshr v25.8h, v25.8h, #1
sshr v27.8H, v27.8H, #1 sshr v27.8h, v27.8h, #1
sshr v29.8H, v29.8H, #1 sshr v29.8h, v29.8h, #1
sshr v31.8H, v31.8H, #1 sshr v31.8h, v31.8h, #1
sub v16.8H, v16.8H, v31.8H sub v16.8h, v16.8h, v31.8h
sub v17.8H, v17.8H, v27.8H sub v17.8h, v17.8h, v27.8h
add va.8H, va.8H, v29.8H add va.8h, va.8h, v29.8h
add v19.8H, v19.8H, v25.8H add v19.8h, v19.8h, v25.8h
sshr v25.8H, v16.8H, #2 sshr v25.8h, v16.8h, #2
sshr v27.8H, v17.8H, #2 sshr v27.8h, v17.8h, #2
sshr v29.8H, va.8H, #2 sshr v29.8h, va.8h, #2
sshr v31.8H, v19.8H, #2 sshr v31.8h, v19.8h, #2
sub v19.8H, v19.8H, v25.8H sub v19.8h, v19.8h, v25.8h
sub va.8H, v27.8H, va.8H sub va.8h, v27.8h, va.8h
add v17.8H, v17.8H, v29.8H add v17.8h, v17.8h, v29.8h
add v16.8H, v16.8H, v31.8H add v16.8h, v16.8h, v31.8h
.if \pass == 0 .if \pass == 0
sub v31.8H, v24.8H, v19.8H sub v31.8h, v24.8h, v19.8h
add v24.8H, v24.8H, v19.8H add v24.8h, v24.8h, v19.8h
add v25.8H, v26.8H, v18.8H add v25.8h, v26.8h, v18.8h
sub v18.8H, v26.8H, v18.8H sub v18.8h, v26.8h, v18.8h
add v26.8H, v28.8H, v17.8H add v26.8h, v28.8h, v17.8h
add v27.8H, v30.8H, v16.8H add v27.8h, v30.8h, v16.8h
sub v29.8H, v28.8H, v17.8H sub v29.8h, v28.8h, v17.8h
sub v28.8H, v30.8H, v16.8H sub v28.8h, v30.8h, v16.8h
.else .else
sub v31.8H, v24.8H, v19.8H sub v31.8h, v24.8h, v19.8h
add v24.8H, v24.8H, v19.8H add v24.8h, v24.8h, v19.8h
add v25.8H, v26.8H, v30.8H add v25.8h, v26.8h, v30.8h
sub v30.8H, v26.8H, v30.8H sub v30.8h, v26.8h, v30.8h
add v26.8H, v28.8H, v17.8H add v26.8h, v28.8h, v17.8h
sub v29.8H, v28.8H, v17.8H sub v29.8h, v28.8h, v17.8h
add v27.8H, v18.8H, v16.8H add v27.8h, v18.8h, v16.8h
sub v28.8H, v18.8H, v16.8H sub v28.8h, v18.8h, v16.8h
.endif .endif
.unreq va .unreq va
.unreq vb .unreq vb
@@ -267,63 +267,63 @@ endfunc
function ff_h264_idct8_add_neon, export=1 function ff_h264_idct8_add_neon, export=1
.L_ff_h264_idct8_add_neon: .L_ff_h264_idct8_add_neon:
AARCH64_VALID_CALL_TARGET AARCH64_VALID_CALL_TARGET
movi v19.8H, #0 movi v19.8h, #0
sxtw x2, w2 sxtw x2, w2
ld1 {v24.8H, v25.8H}, [x1] ld1 {v24.8h, v25.8h}, [x1]
st1 {v19.8H}, [x1], #16 st1 {v19.8h}, [x1], #16
st1 {v19.8H}, [x1], #16 st1 {v19.8h}, [x1], #16
ld1 {v26.8H, v27.8H}, [x1] ld1 {v26.8h, v27.8h}, [x1]
st1 {v19.8H}, [x1], #16 st1 {v19.8h}, [x1], #16
st1 {v19.8H}, [x1], #16 st1 {v19.8h}, [x1], #16
ld1 {v28.8H, v29.8H}, [x1] ld1 {v28.8h, v29.8h}, [x1]
st1 {v19.8H}, [x1], #16 st1 {v19.8h}, [x1], #16
st1 {v19.8H}, [x1], #16 st1 {v19.8h}, [x1], #16
idct8x8_cols 0 idct8x8_cols 0
transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7 transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
idct8x8_cols 1 idct8x8_cols 1
mov x3, x0 mov x3, x0
srshr v24.8H, v24.8H, #6 srshr v24.8h, v24.8h, #6
ld1 {v0.8B}, [x0], x2 ld1 {v0.8b}, [x0], x2
srshr v25.8H, v25.8H, #6 srshr v25.8h, v25.8h, #6
ld1 {v1.8B}, [x0], x2 ld1 {v1.8b}, [x0], x2
srshr v26.8H, v26.8H, #6 srshr v26.8h, v26.8h, #6
ld1 {v2.8B}, [x0], x2 ld1 {v2.8b}, [x0], x2
srshr v27.8H, v27.8H, #6 srshr v27.8h, v27.8h, #6
ld1 {v3.8B}, [x0], x2 ld1 {v3.8b}, [x0], x2
srshr v28.8H, v28.8H, #6 srshr v28.8h, v28.8h, #6
ld1 {v4.8B}, [x0], x2 ld1 {v4.8b}, [x0], x2
srshr v29.8H, v29.8H, #6 srshr v29.8h, v29.8h, #6
ld1 {v5.8B}, [x0], x2 ld1 {v5.8b}, [x0], x2
srshr v30.8H, v30.8H, #6 srshr v30.8h, v30.8h, #6
ld1 {v6.8B}, [x0], x2 ld1 {v6.8b}, [x0], x2
srshr v31.8H, v31.8H, #6 srshr v31.8h, v31.8h, #6
ld1 {v7.8B}, [x0], x2 ld1 {v7.8b}, [x0], x2
uaddw v24.8H, v24.8H, v0.8B uaddw v24.8h, v24.8h, v0.8b
uaddw v25.8H, v25.8H, v1.8B uaddw v25.8h, v25.8h, v1.8b
uaddw v26.8H, v26.8H, v2.8B uaddw v26.8h, v26.8h, v2.8b
sqxtun v0.8B, v24.8H sqxtun v0.8b, v24.8h
uaddw v27.8H, v27.8H, v3.8B uaddw v27.8h, v27.8h, v3.8b
sqxtun v1.8B, v25.8H sqxtun v1.8b, v25.8h
uaddw v28.8H, v28.8H, v4.8B uaddw v28.8h, v28.8h, v4.8b
sqxtun v2.8B, v26.8H sqxtun v2.8b, v26.8h
st1 {v0.8B}, [x3], x2 st1 {v0.8b}, [x3], x2
uaddw v29.8H, v29.8H, v5.8B uaddw v29.8h, v29.8h, v5.8b
sqxtun v3.8B, v27.8H sqxtun v3.8b, v27.8h
st1 {v1.8B}, [x3], x2 st1 {v1.8b}, [x3], x2
uaddw v30.8H, v30.8H, v6.8B uaddw v30.8h, v30.8h, v6.8b
sqxtun v4.8B, v28.8H sqxtun v4.8b, v28.8h
st1 {v2.8B}, [x3], x2 st1 {v2.8b}, [x3], x2
uaddw v31.8H, v31.8H, v7.8B uaddw v31.8h, v31.8h, v7.8b
sqxtun v5.8B, v29.8H sqxtun v5.8b, v29.8h
st1 {v3.8B}, [x3], x2 st1 {v3.8b}, [x3], x2
sqxtun v6.8B, v30.8H sqxtun v6.8b, v30.8h
sqxtun v7.8B, v31.8H sqxtun v7.8b, v31.8h
st1 {v4.8B}, [x3], x2 st1 {v4.8b}, [x3], x2
st1 {v5.8B}, [x3], x2 st1 {v5.8b}, [x3], x2
st1 {v6.8B}, [x3], x2 st1 {v6.8b}, [x3], x2
st1 {v7.8B}, [x3], x2 st1 {v7.8b}, [x3], x2
sub x1, x1, #128 sub x1, x1, #128
ret ret
@@ -334,42 +334,42 @@ function ff_h264_idct8_dc_add_neon, export=1
AARCH64_VALID_CALL_TARGET AARCH64_VALID_CALL_TARGET
mov w3, #0 mov w3, #0
sxtw x2, w2 sxtw x2, w2
ld1r {v31.8H}, [x1] ld1r {v31.8h}, [x1]
strh w3, [x1] strh w3, [x1]
ld1 {v0.8B}, [x0], x2 ld1 {v0.8b}, [x0], x2
srshr v31.8H, v31.8H, #6 srshr v31.8h, v31.8h, #6
ld1 {v1.8B}, [x0], x2 ld1 {v1.8b}, [x0], x2
ld1 {v2.8B}, [x0], x2 ld1 {v2.8b}, [x0], x2
uaddw v24.8H, v31.8H, v0.8B uaddw v24.8h, v31.8h, v0.8b
ld1 {v3.8B}, [x0], x2 ld1 {v3.8b}, [x0], x2
uaddw v25.8H, v31.8H, v1.8B uaddw v25.8h, v31.8h, v1.8b
ld1 {v4.8B}, [x0], x2 ld1 {v4.8b}, [x0], x2
uaddw v26.8H, v31.8H, v2.8B uaddw v26.8h, v31.8h, v2.8b
ld1 {v5.8B}, [x0], x2 ld1 {v5.8b}, [x0], x2
uaddw v27.8H, v31.8H, v3.8B uaddw v27.8h, v31.8h, v3.8b
ld1 {v6.8B}, [x0], x2 ld1 {v6.8b}, [x0], x2
uaddw v28.8H, v31.8H, v4.8B uaddw v28.8h, v31.8h, v4.8b
ld1 {v7.8B}, [x0], x2 ld1 {v7.8b}, [x0], x2
uaddw v29.8H, v31.8H, v5.8B uaddw v29.8h, v31.8h, v5.8b
uaddw v30.8H, v31.8H, v6.8B uaddw v30.8h, v31.8h, v6.8b
uaddw v31.8H, v31.8H, v7.8B uaddw v31.8h, v31.8h, v7.8b
sqxtun v0.8B, v24.8H sqxtun v0.8b, v24.8h
sqxtun v1.8B, v25.8H sqxtun v1.8b, v25.8h
sqxtun v2.8B, v26.8H sqxtun v2.8b, v26.8h
sqxtun v3.8B, v27.8H sqxtun v3.8b, v27.8h
sub x0, x0, x2, lsl #3 sub x0, x0, x2, lsl #3
st1 {v0.8B}, [x0], x2 st1 {v0.8b}, [x0], x2
sqxtun v4.8B, v28.8H sqxtun v4.8b, v28.8h
st1 {v1.8B}, [x0], x2 st1 {v1.8b}, [x0], x2
sqxtun v5.8B, v29.8H sqxtun v5.8b, v29.8h
st1 {v2.8B}, [x0], x2 st1 {v2.8b}, [x0], x2
sqxtun v6.8B, v30.8H sqxtun v6.8b, v30.8h
st1 {v3.8B}, [x0], x2 st1 {v3.8b}, [x0], x2
sqxtun v7.8B, v31.8H sqxtun v7.8b, v31.8h
st1 {v4.8B}, [x0], x2 st1 {v4.8b}, [x0], x2
st1 {v5.8B}, [x0], x2 st1 {v5.8b}, [x0], x2
st1 {v6.8B}, [x0], x2 st1 {v6.8b}, [x0], x2
st1 {v7.8B}, [x0], x2 st1 {v7.8b}, [x0], x2
ret ret
endfunc endfunc

@@ -27,127 +27,127 @@
.macro lowpass_const r .macro lowpass_const r
movz \r, #20, lsl #16 movz \r, #20, lsl #16
movk \r, #5 movk \r, #5
mov v6.S[0], \r mov v6.s[0], \r
.endm .endm
//trashes v0-v5 //trashes v0-v5
.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
ext v2.8B, \r0\().8B, \r1\().8B, #2 ext v2.8b, \r0\().8b, \r1\().8b, #2
ext v3.8B, \r0\().8B, \r1\().8B, #3 ext v3.8b, \r0\().8b, \r1\().8b, #3
uaddl v2.8H, v2.8B, v3.8B uaddl v2.8h, v2.8b, v3.8b
ext v4.8B, \r0\().8B, \r1\().8B, #1 ext v4.8b, \r0\().8b, \r1\().8b, #1
ext v5.8B, \r0\().8B, \r1\().8B, #4 ext v5.8b, \r0\().8b, \r1\().8b, #4
uaddl v4.8H, v4.8B, v5.8B uaddl v4.8h, v4.8b, v5.8b
ext v1.8B, \r0\().8B, \r1\().8B, #5 ext v1.8b, \r0\().8b, \r1\().8b, #5
uaddl \d0\().8H, \r0\().8B, v1.8B uaddl \d0\().8h, \r0\().8b, v1.8b
ext v0.8B, \r2\().8B, \r3\().8B, #2 ext v0.8b, \r2\().8b, \r3\().8b, #2
mla \d0\().8H, v2.8H, v6.H[1] mla \d0\().8h, v2.8h, v6.h[1]
ext v1.8B, \r2\().8B, \r3\().8B, #3 ext v1.8b, \r2\().8b, \r3\().8b, #3
uaddl v0.8H, v0.8B, v1.8B uaddl v0.8h, v0.8b, v1.8b
ext v1.8B, \r2\().8B, \r3\().8B, #1 ext v1.8b, \r2\().8b, \r3\().8b, #1
mls \d0\().8H, v4.8H, v6.H[0] mls \d0\().8h, v4.8h, v6.h[0]
ext v3.8B, \r2\().8B, \r3\().8B, #4 ext v3.8b, \r2\().8b, \r3\().8b, #4
uaddl v1.8H, v1.8B, v3.8B uaddl v1.8h, v1.8b, v3.8b
ext v2.8B, \r2\().8B, \r3\().8B, #5 ext v2.8b, \r2\().8b, \r3\().8b, #5
uaddl \d1\().8H, \r2\().8B, v2.8B uaddl \d1\().8h, \r2\().8b, v2.8b
mla \d1\().8H, v0.8H, v6.H[1] mla \d1\().8h, v0.8h, v6.h[1]
mls \d1\().8H, v1.8H, v6.H[0] mls \d1\().8h, v1.8h, v6.h[0]
.if \narrow .if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5 sqrshrun \d0\().8b, \d0\().8h, #5
sqrshrun \d1\().8B, \d1\().8H, #5 sqrshrun \d1\().8b, \d1\().8h, #5
.endif .endif
.endm .endm
//trashes v0-v4 //trashes v0-v4
.macro lowpass_8_v r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1 .macro lowpass_8_v r0, r1, r2, r3, r4, r5, r6, d0, d1, narrow=1
uaddl v2.8H, \r2\().8B, \r3\().8B uaddl v2.8h, \r2\().8b, \r3\().8b
uaddl v0.8H, \r3\().8B, \r4\().8B uaddl v0.8h, \r3\().8b, \r4\().8b
uaddl v4.8H, \r1\().8B, \r4\().8B uaddl v4.8h, \r1\().8b, \r4\().8b
uaddl v1.8H, \r2\().8B, \r5\().8B uaddl v1.8h, \r2\().8b, \r5\().8b
uaddl \d0\().8H, \r0\().8B, \r5\().8B uaddl \d0\().8h, \r0\().8b, \r5\().8b
uaddl \d1\().8H, \r1\().8B, \r6\().8B uaddl \d1\().8h, \r1\().8b, \r6\().8b
mla \d0\().8H, v2.8H, v6.H[1] mla \d0\().8h, v2.8h, v6.h[1]
mls \d0\().8H, v4.8H, v6.H[0] mls \d0\().8h, v4.8h, v6.h[0]
mla \d1\().8H, v0.8H, v6.H[1] mla \d1\().8h, v0.8h, v6.h[1]
mls \d1\().8H, v1.8H, v6.H[0] mls \d1\().8h, v1.8h, v6.h[0]
.if \narrow .if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5 sqrshrun \d0\().8b, \d0\().8h, #5
sqrshrun \d1\().8B, \d1\().8H, #5 sqrshrun \d1\().8b, \d1\().8h, #5
.endif .endif
.endm .endm
//trashes v0-v5, v7, v30-v31 //trashes v0-v5, v7, v30-v31
.macro lowpass_8H r0, r1 .macro lowpass_8H r0, r1
ext v0.16B, \r0\().16B, \r0\().16B, #2 ext v0.16b, \r0\().16b, \r0\().16b, #2
ext v1.16B, \r0\().16B, \r0\().16B, #3 ext v1.16b, \r0\().16b, \r0\().16b, #3
uaddl v0.8H, v0.8B, v1.8B uaddl v0.8h, v0.8b, v1.8b
ext v2.16B, \r0\().16B, \r0\().16B, #1 ext v2.16b, \r0\().16b, \r0\().16b, #1
ext v3.16B, \r0\().16B, \r0\().16B, #4 ext v3.16b, \r0\().16b, \r0\().16b, #4
uaddl v2.8H, v2.8B, v3.8B uaddl v2.8h, v2.8b, v3.8b
ext v30.16B, \r0\().16B, \r0\().16B, #5 ext v30.16b, \r0\().16b, \r0\().16b, #5
uaddl \r0\().8H, \r0\().8B, v30.8B uaddl \r0\().8h, \r0\().8b, v30.8b
ext v4.16B, \r1\().16B, \r1\().16B, #2 ext v4.16b, \r1\().16b, \r1\().16b, #2
mla \r0\().8H, v0.8H, v6.H[1] mla \r0\().8h, v0.8h, v6.h[1]
ext v5.16B, \r1\().16B, \r1\().16B, #3 ext v5.16b, \r1\().16b, \r1\().16b, #3
uaddl v4.8H, v4.8B, v5.8B uaddl v4.8h, v4.8b, v5.8b
ext v7.16B, \r1\().16B, \r1\().16B, #1 ext v7.16b, \r1\().16b, \r1\().16b, #1
mls \r0\().8H, v2.8H, v6.H[0] mls \r0\().8h, v2.8h, v6.h[0]
ext v0.16B, \r1\().16B, \r1\().16B, #4 ext v0.16b, \r1\().16b, \r1\().16b, #4
uaddl v7.8H, v7.8B, v0.8B uaddl v7.8h, v7.8b, v0.8b
ext v31.16B, \r1\().16B, \r1\().16B, #5 ext v31.16b, \r1\().16b, \r1\().16b, #5
uaddl \r1\().8H, \r1\().8B, v31.8B uaddl \r1\().8h, \r1\().8b, v31.8b
mla \r1\().8H, v4.8H, v6.H[1] mla \r1\().8h, v4.8h, v6.h[1]
mls \r1\().8H, v7.8H, v6.H[0] mls \r1\().8h, v7.8h, v6.h[0]
.endm .endm
// trashes v2-v5, v30 // trashes v2-v5, v30
.macro lowpass_8_1 r0, r1, d0, narrow=1 .macro lowpass_8_1 r0, r1, d0, narrow=1
ext v2.8B, \r0\().8B, \r1\().8B, #2 ext v2.8b, \r0\().8b, \r1\().8b, #2
ext v3.8B, \r0\().8B, \r1\().8B, #3 ext v3.8b, \r0\().8b, \r1\().8b, #3
uaddl v2.8H, v2.8B, v3.8B uaddl v2.8h, v2.8b, v3.8b
ext v4.8B, \r0\().8B, \r1\().8B, #1 ext v4.8b, \r0\().8b, \r1\().8b, #1
ext v5.8B, \r0\().8B, \r1\().8B, #4 ext v5.8b, \r0\().8b, \r1\().8b, #4
uaddl v4.8H, v4.8B, v5.8B uaddl v4.8h, v4.8b, v5.8b
ext v30.8B, \r0\().8B, \r1\().8B, #5 ext v30.8b, \r0\().8b, \r1\().8b, #5
uaddl \d0\().8H, \r0\().8B, v30.8B uaddl \d0\().8h, \r0\().8b, v30.8b
mla \d0\().8H, v2.8H, v6.H[1] mla \d0\().8h, v2.8h, v6.h[1]
mls \d0\().8H, v4.8H, v6.H[0] mls \d0\().8h, v4.8h, v6.h[0]
.if \narrow .if \narrow
sqrshrun \d0\().8B, \d0\().8H, #5 sqrshrun \d0\().8b, \d0\().8h, #5
.endif .endif
.endm .endm
// trashed v0-v7 // trashed v0-v7
.macro lowpass_8.16 r0, r1, r2, r3, r4, r5 .macro lowpass_8.16 r0, r1, r2, r3, r4, r5
saddl v5.4S, \r2\().4H, \r3\().4H saddl v5.4s, \r2\().4h, \r3\().4h
saddl2 v1.4S, \r2\().8H, \r3\().8H saddl2 v1.4s, \r2\().8h, \r3\().8h
saddl v6.4S, \r1\().4H, \r4\().4H saddl v6.4s, \r1\().4h, \r4\().4h
saddl2 v2.4S, \r1\().8H, \r4\().8H saddl2 v2.4s, \r1\().8h, \r4\().8h
saddl v0.4S, \r0\().4H, \r5\().4H saddl v0.4s, \r0\().4h, \r5\().4h
saddl2 v4.4S, \r0\().8H, \r5\().8H saddl2 v4.4s, \r0\().8h, \r5\().8h
shl v3.4S, v5.4S, #4 shl v3.4s, v5.4s, #4
shl v5.4S, v5.4S, #2 shl v5.4s, v5.4s, #2
shl v7.4S, v6.4S, #2 shl v7.4s, v6.4s, #2
add v5.4S, v5.4S, v3.4S add v5.4s, v5.4s, v3.4s
add v6.4S, v6.4S, v7.4S add v6.4s, v6.4s, v7.4s
shl v3.4S, v1.4S, #4 shl v3.4s, v1.4s, #4
shl v1.4S, v1.4S, #2 shl v1.4s, v1.4s, #2
shl v7.4S, v2.4S, #2 shl v7.4s, v2.4s, #2
add v1.4S, v1.4S, v3.4S add v1.4s, v1.4s, v3.4s
add v2.4S, v2.4S, v7.4S add v2.4s, v2.4s, v7.4s
add v5.4S, v5.4S, v0.4S add v5.4s, v5.4s, v0.4s
sub v5.4S, v5.4S, v6.4S sub v5.4s, v5.4s, v6.4s
add v1.4S, v1.4S, v4.4S add v1.4s, v1.4s, v4.4s
sub v1.4S, v1.4S, v2.4S sub v1.4s, v1.4s, v2.4s
rshrn v5.4H, v5.4S, #10 rshrn v5.4h, v5.4s, #10
rshrn2 v5.8H, v1.4S, #10 rshrn2 v5.8h, v1.4s, #10
sqxtun \r0\().8B, v5.8H sqxtun \r0\().8b, v5.8h
.endm .endm
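// The lowpass macros above implement the H.264 six-tap half-pel filter
// (1, -5, 20, 20, -5, 1): the ext offsets pair the symmetric taps, the
// centre and outer pairs are weighted by 20 and 5 (built as 16x + 4x and
// 4x + x in lowpass_8.16, and presumably held in v6.h[1]/v6.h[0] by
// lowpass_const for the mla/mls forms), and the result is rounded and
// narrowed by 5 bits (10 bits on the 32-bit intermediate path). Roughly,
// for the horizontal case:
//   dst[x] = clip((src[x-2] + src[x+3] - 5*(src[x-1] + src[x+2])
//                  + 20*(src[x] + src[x+1]) + 16) >> 5)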
function put_h264_qpel16_h_lowpass_neon_packed function put_h264_qpel16_h_lowpass_neon_packed
@ -176,19 +176,19 @@ function \type\()_h264_qpel16_h_lowpass_neon
endfunc endfunc
function \type\()_h264_qpel8_h_lowpass_neon function \type\()_h264_qpel8_h_lowpass_neon
1: ld1 {v28.8B, v29.8B}, [x1], x2 1: ld1 {v28.8b, v29.8b}, [x1], x2
ld1 {v16.8B, v17.8B}, [x1], x2 ld1 {v16.8b, v17.8b}, [x1], x2
subs x12, x12, #2 subs x12, x12, #2
lowpass_8 v28, v29, v16, v17, v28, v16 lowpass_8 v28, v29, v16, v17, v28, v16
.ifc \type,avg .ifc \type,avg
ld1 {v2.8B}, [x0], x3 ld1 {v2.8b}, [x0], x3
ld1 {v3.8B}, [x0] ld1 {v3.8b}, [x0]
urhadd v28.8B, v28.8B, v2.8B urhadd v28.8b, v28.8b, v2.8b
urhadd v16.8B, v16.8B, v3.8B urhadd v16.8b, v16.8b, v3.8b
sub x0, x0, x3 sub x0, x0, x3
.endif .endif
st1 {v28.8B}, [x0], x3 st1 {v28.8b}, [x0], x3
st1 {v16.8B}, [x0], x3 st1 {v16.8b}, [x0], x3
b.ne 1b b.ne 1b
ret ret
endfunc endfunc
@ -213,23 +213,23 @@ function \type\()_h264_qpel16_h_lowpass_l2_neon
endfunc endfunc
function \type\()_h264_qpel8_h_lowpass_l2_neon function \type\()_h264_qpel8_h_lowpass_l2_neon
1: ld1 {v26.8B, v27.8B}, [x1], x2 1: ld1 {v26.8b, v27.8b}, [x1], x2
ld1 {v16.8B, v17.8B}, [x1], x2 ld1 {v16.8b, v17.8b}, [x1], x2
ld1 {v28.8B}, [x3], x2 ld1 {v28.8b}, [x3], x2
ld1 {v29.8B}, [x3], x2 ld1 {v29.8b}, [x3], x2
subs x12, x12, #2 subs x12, x12, #2
lowpass_8 v26, v27, v16, v17, v26, v27 lowpass_8 v26, v27, v16, v17, v26, v27
urhadd v26.8B, v26.8B, v28.8B urhadd v26.8b, v26.8b, v28.8b
urhadd v27.8B, v27.8B, v29.8B urhadd v27.8b, v27.8b, v29.8b
.ifc \type,avg .ifc \type,avg
ld1 {v2.8B}, [x0], x2 ld1 {v2.8b}, [x0], x2
ld1 {v3.8B}, [x0] ld1 {v3.8b}, [x0]
urhadd v26.8B, v26.8B, v2.8B urhadd v26.8b, v26.8b, v2.8b
urhadd v27.8B, v27.8B, v3.8B urhadd v27.8b, v27.8b, v3.8b
sub x0, x0, x2 sub x0, x0, x2
.endif .endif
st1 {v26.8B}, [x0], x2 st1 {v26.8b}, [x0], x2
st1 {v27.8B}, [x0], x2 st1 {v27.8b}, [x0], x2
b.ne 1b b.ne 1b
ret ret
endfunc endfunc
@ -270,52 +270,52 @@ function \type\()_h264_qpel16_v_lowpass_neon
endfunc endfunc
function \type\()_h264_qpel8_v_lowpass_neon function \type\()_h264_qpel8_v_lowpass_neon
ld1 {v16.8B}, [x1], x3 ld1 {v16.8b}, [x1], x3
ld1 {v17.8B}, [x1], x3 ld1 {v17.8b}, [x1], x3
ld1 {v18.8B}, [x1], x3 ld1 {v18.8b}, [x1], x3
ld1 {v19.8B}, [x1], x3 ld1 {v19.8b}, [x1], x3
ld1 {v20.8B}, [x1], x3 ld1 {v20.8b}, [x1], x3
ld1 {v21.8B}, [x1], x3 ld1 {v21.8b}, [x1], x3
ld1 {v22.8B}, [x1], x3 ld1 {v22.8b}, [x1], x3
ld1 {v23.8B}, [x1], x3 ld1 {v23.8b}, [x1], x3
ld1 {v24.8B}, [x1], x3 ld1 {v24.8b}, [x1], x3
ld1 {v25.8B}, [x1], x3 ld1 {v25.8b}, [x1], x3
ld1 {v26.8B}, [x1], x3 ld1 {v26.8b}, [x1], x3
ld1 {v27.8B}, [x1], x3 ld1 {v27.8b}, [x1], x3
ld1 {v28.8B}, [x1] ld1 {v28.8b}, [x1]
lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17 lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19 lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21 lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23 lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23
.ifc \type,avg .ifc \type,avg
ld1 {v24.8B}, [x0], x2 ld1 {v24.8b}, [x0], x2
ld1 {v25.8B}, [x0], x2 ld1 {v25.8b}, [x0], x2
ld1 {v26.8B}, [x0], x2 ld1 {v26.8b}, [x0], x2
urhadd v16.8B, v16.8B, v24.8B urhadd v16.8b, v16.8b, v24.8b
ld1 {v27.8B}, [x0], x2 ld1 {v27.8b}, [x0], x2
urhadd v17.8B, v17.8B, v25.8B urhadd v17.8b, v17.8b, v25.8b
ld1 {v28.8B}, [x0], x2 ld1 {v28.8b}, [x0], x2
urhadd v18.8B, v18.8B, v26.8B urhadd v18.8b, v18.8b, v26.8b
ld1 {v29.8B}, [x0], x2 ld1 {v29.8b}, [x0], x2
urhadd v19.8B, v19.8B, v27.8B urhadd v19.8b, v19.8b, v27.8b
ld1 {v30.8B}, [x0], x2 ld1 {v30.8b}, [x0], x2
urhadd v20.8B, v20.8B, v28.8B urhadd v20.8b, v20.8b, v28.8b
ld1 {v31.8B}, [x0], x2 ld1 {v31.8b}, [x0], x2
urhadd v21.8B, v21.8B, v29.8B urhadd v21.8b, v21.8b, v29.8b
urhadd v22.8B, v22.8B, v30.8B urhadd v22.8b, v22.8b, v30.8b
urhadd v23.8B, v23.8B, v31.8B urhadd v23.8b, v23.8b, v31.8b
sub x0, x0, x2, lsl #3 sub x0, x0, x2, lsl #3
.endif .endif
st1 {v16.8B}, [x0], x2 st1 {v16.8b}, [x0], x2
st1 {v17.8B}, [x0], x2 st1 {v17.8b}, [x0], x2
st1 {v18.8B}, [x0], x2 st1 {v18.8b}, [x0], x2
st1 {v19.8B}, [x0], x2 st1 {v19.8b}, [x0], x2
st1 {v20.8B}, [x0], x2 st1 {v20.8b}, [x0], x2
st1 {v21.8B}, [x0], x2 st1 {v21.8b}, [x0], x2
st1 {v22.8B}, [x0], x2 st1 {v22.8b}, [x0], x2
st1 {v23.8B}, [x0], x2 st1 {v23.8b}, [x0], x2
ret ret
endfunc endfunc
@ -343,70 +343,70 @@ function \type\()_h264_qpel16_v_lowpass_l2_neon
endfunc endfunc
function \type\()_h264_qpel8_v_lowpass_l2_neon function \type\()_h264_qpel8_v_lowpass_l2_neon
ld1 {v16.8B}, [x1], x3 ld1 {v16.8b}, [x1], x3
ld1 {v17.8B}, [x1], x3 ld1 {v17.8b}, [x1], x3
ld1 {v18.8B}, [x1], x3 ld1 {v18.8b}, [x1], x3
ld1 {v19.8B}, [x1], x3 ld1 {v19.8b}, [x1], x3
ld1 {v20.8B}, [x1], x3 ld1 {v20.8b}, [x1], x3
ld1 {v21.8B}, [x1], x3 ld1 {v21.8b}, [x1], x3
ld1 {v22.8B}, [x1], x3 ld1 {v22.8b}, [x1], x3
ld1 {v23.8B}, [x1], x3 ld1 {v23.8b}, [x1], x3
ld1 {v24.8B}, [x1], x3 ld1 {v24.8b}, [x1], x3
ld1 {v25.8B}, [x1], x3 ld1 {v25.8b}, [x1], x3
ld1 {v26.8B}, [x1], x3 ld1 {v26.8b}, [x1], x3
ld1 {v27.8B}, [x1], x3 ld1 {v27.8b}, [x1], x3
ld1 {v28.8B}, [x1] ld1 {v28.8b}, [x1]
lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17 lowpass_8_v v16, v17, v18, v19, v20, v21, v22, v16, v17
lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19 lowpass_8_v v18, v19, v20, v21, v22, v23, v24, v18, v19
lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21 lowpass_8_v v20, v21, v22, v23, v24, v25, v26, v20, v21
lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23 lowpass_8_v v22, v23, v24, v25, v26, v27, v28, v22, v23
ld1 {v24.8B}, [x12], x2 ld1 {v24.8b}, [x12], x2
ld1 {v25.8B}, [x12], x2 ld1 {v25.8b}, [x12], x2
ld1 {v26.8B}, [x12], x2 ld1 {v26.8b}, [x12], x2
ld1 {v27.8B}, [x12], x2 ld1 {v27.8b}, [x12], x2
ld1 {v28.8B}, [x12], x2 ld1 {v28.8b}, [x12], x2
urhadd v16.8B, v24.8B, v16.8B urhadd v16.8b, v24.8b, v16.8b
urhadd v17.8B, v25.8B, v17.8B urhadd v17.8b, v25.8b, v17.8b
ld1 {v29.8B}, [x12], x2 ld1 {v29.8b}, [x12], x2
urhadd v18.8B, v26.8B, v18.8B urhadd v18.8b, v26.8b, v18.8b
urhadd v19.8B, v27.8B, v19.8B urhadd v19.8b, v27.8b, v19.8b
ld1 {v30.8B}, [x12], x2 ld1 {v30.8b}, [x12], x2
urhadd v20.8B, v28.8B, v20.8B urhadd v20.8b, v28.8b, v20.8b
urhadd v21.8B, v29.8B, v21.8B urhadd v21.8b, v29.8b, v21.8b
ld1 {v31.8B}, [x12], x2 ld1 {v31.8b}, [x12], x2
urhadd v22.8B, v30.8B, v22.8B urhadd v22.8b, v30.8b, v22.8b
urhadd v23.8B, v31.8B, v23.8B urhadd v23.8b, v31.8b, v23.8b
.ifc \type,avg .ifc \type,avg
ld1 {v24.8B}, [x0], x3 ld1 {v24.8b}, [x0], x3
ld1 {v25.8B}, [x0], x3 ld1 {v25.8b}, [x0], x3
ld1 {v26.8B}, [x0], x3 ld1 {v26.8b}, [x0], x3
urhadd v16.8B, v16.8B, v24.8B urhadd v16.8b, v16.8b, v24.8b
ld1 {v27.8B}, [x0], x3 ld1 {v27.8b}, [x0], x3
urhadd v17.8B, v17.8B, v25.8B urhadd v17.8b, v17.8b, v25.8b
ld1 {v28.8B}, [x0], x3 ld1 {v28.8b}, [x0], x3
urhadd v18.8B, v18.8B, v26.8B urhadd v18.8b, v18.8b, v26.8b
ld1 {v29.8B}, [x0], x3 ld1 {v29.8b}, [x0], x3
urhadd v19.8B, v19.8B, v27.8B urhadd v19.8b, v19.8b, v27.8b
ld1 {v30.8B}, [x0], x3 ld1 {v30.8b}, [x0], x3
urhadd v20.8B, v20.8B, v28.8B urhadd v20.8b, v20.8b, v28.8b
ld1 {v31.8B}, [x0], x3 ld1 {v31.8b}, [x0], x3
urhadd v21.8B, v21.8B, v29.8B urhadd v21.8b, v21.8b, v29.8b
urhadd v22.8B, v22.8B, v30.8B urhadd v22.8b, v22.8b, v30.8b
urhadd v23.8B, v23.8B, v31.8B urhadd v23.8b, v23.8b, v31.8b
sub x0, x0, x3, lsl #3 sub x0, x0, x3, lsl #3
.endif .endif
st1 {v16.8B}, [x0], x3 st1 {v16.8b}, [x0], x3
st1 {v17.8B}, [x0], x3 st1 {v17.8b}, [x0], x3
st1 {v18.8B}, [x0], x3 st1 {v18.8b}, [x0], x3
st1 {v19.8B}, [x0], x3 st1 {v19.8b}, [x0], x3
st1 {v20.8B}, [x0], x3 st1 {v20.8b}, [x0], x3
st1 {v21.8B}, [x0], x3 st1 {v21.8b}, [x0], x3
st1 {v22.8B}, [x0], x3 st1 {v22.8b}, [x0], x3
st1 {v23.8B}, [x0], x3 st1 {v23.8b}, [x0], x3
ret ret
endfunc endfunc
@ -417,19 +417,19 @@ endfunc
function put_h264_qpel8_hv_lowpass_neon_top function put_h264_qpel8_hv_lowpass_neon_top
lowpass_const w12 lowpass_const w12
ld1 {v16.8H}, [x1], x3 ld1 {v16.8h}, [x1], x3
ld1 {v17.8H}, [x1], x3 ld1 {v17.8h}, [x1], x3
ld1 {v18.8H}, [x1], x3 ld1 {v18.8h}, [x1], x3
ld1 {v19.8H}, [x1], x3 ld1 {v19.8h}, [x1], x3
ld1 {v20.8H}, [x1], x3 ld1 {v20.8h}, [x1], x3
ld1 {v21.8H}, [x1], x3 ld1 {v21.8h}, [x1], x3
ld1 {v22.8H}, [x1], x3 ld1 {v22.8h}, [x1], x3
ld1 {v23.8H}, [x1], x3 ld1 {v23.8h}, [x1], x3
ld1 {v24.8H}, [x1], x3 ld1 {v24.8h}, [x1], x3
ld1 {v25.8H}, [x1], x3 ld1 {v25.8h}, [x1], x3
ld1 {v26.8H}, [x1], x3 ld1 {v26.8h}, [x1], x3
ld1 {v27.8H}, [x1], x3 ld1 {v27.8h}, [x1], x3
ld1 {v28.8H}, [x1] ld1 {v28.8h}, [x1]
lowpass_8H v16, v17 lowpass_8H v16, v17
lowpass_8H v18, v19 lowpass_8H v18, v19
lowpass_8H v20, v21 lowpass_8H v20, v21
@ -458,33 +458,33 @@ function \type\()_h264_qpel8_hv_lowpass_neon
mov x10, x30 mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top bl put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg .ifc \type,avg
ld1 {v0.8B}, [x0], x2 ld1 {v0.8b}, [x0], x2
ld1 {v1.8B}, [x0], x2 ld1 {v1.8b}, [x0], x2
ld1 {v2.8B}, [x0], x2 ld1 {v2.8b}, [x0], x2
urhadd v16.8B, v16.8B, v0.8B urhadd v16.8b, v16.8b, v0.8b
ld1 {v3.8B}, [x0], x2 ld1 {v3.8b}, [x0], x2
urhadd v17.8B, v17.8B, v1.8B urhadd v17.8b, v17.8b, v1.8b
ld1 {v4.8B}, [x0], x2 ld1 {v4.8b}, [x0], x2
urhadd v18.8B, v18.8B, v2.8B urhadd v18.8b, v18.8b, v2.8b
ld1 {v5.8B}, [x0], x2 ld1 {v5.8b}, [x0], x2
urhadd v19.8B, v19.8B, v3.8B urhadd v19.8b, v19.8b, v3.8b
ld1 {v6.8B}, [x0], x2 ld1 {v6.8b}, [x0], x2
urhadd v20.8B, v20.8B, v4.8B urhadd v20.8b, v20.8b, v4.8b
ld1 {v7.8B}, [x0], x2 ld1 {v7.8b}, [x0], x2
urhadd v21.8B, v21.8B, v5.8B urhadd v21.8b, v21.8b, v5.8b
urhadd v22.8B, v22.8B, v6.8B urhadd v22.8b, v22.8b, v6.8b
urhadd v23.8B, v23.8B, v7.8B urhadd v23.8b, v23.8b, v7.8b
sub x0, x0, x2, lsl #3 sub x0, x0, x2, lsl #3
.endif .endif
st1 {v16.8B}, [x0], x2 st1 {v16.8b}, [x0], x2
st1 {v17.8B}, [x0], x2 st1 {v17.8b}, [x0], x2
st1 {v18.8B}, [x0], x2 st1 {v18.8b}, [x0], x2
st1 {v19.8B}, [x0], x2 st1 {v19.8b}, [x0], x2
st1 {v20.8B}, [x0], x2 st1 {v20.8b}, [x0], x2
st1 {v21.8B}, [x0], x2 st1 {v21.8b}, [x0], x2
st1 {v22.8B}, [x0], x2 st1 {v22.8b}, [x0], x2
st1 {v23.8B}, [x0], x2 st1 {v23.8b}, [x0], x2
ret x10 ret x10
endfunc endfunc
@ -498,45 +498,45 @@ function \type\()_h264_qpel8_hv_lowpass_l2_neon
mov x10, x30 mov x10, x30
bl put_h264_qpel8_hv_lowpass_neon_top bl put_h264_qpel8_hv_lowpass_neon_top
ld1 {v0.8B, v1.8B}, [x2], #16 ld1 {v0.8b, v1.8b}, [x2], #16
ld1 {v2.8B, v3.8B}, [x2], #16 ld1 {v2.8b, v3.8b}, [x2], #16
urhadd v0.8B, v0.8B, v16.8B urhadd v0.8b, v0.8b, v16.8b
urhadd v1.8B, v1.8B, v17.8B urhadd v1.8b, v1.8b, v17.8b
ld1 {v4.8B, v5.8B}, [x2], #16 ld1 {v4.8b, v5.8b}, [x2], #16
urhadd v2.8B, v2.8B, v18.8B urhadd v2.8b, v2.8b, v18.8b
urhadd v3.8B, v3.8B, v19.8B urhadd v3.8b, v3.8b, v19.8b
ld1 {v6.8B, v7.8B}, [x2], #16 ld1 {v6.8b, v7.8b}, [x2], #16
urhadd v4.8B, v4.8B, v20.8B urhadd v4.8b, v4.8b, v20.8b
urhadd v5.8B, v5.8B, v21.8B urhadd v5.8b, v5.8b, v21.8b
urhadd v6.8B, v6.8B, v22.8B urhadd v6.8b, v6.8b, v22.8b
urhadd v7.8B, v7.8B, v23.8B urhadd v7.8b, v7.8b, v23.8b
.ifc \type,avg .ifc \type,avg
ld1 {v16.8B}, [x0], x3 ld1 {v16.8b}, [x0], x3
ld1 {v17.8B}, [x0], x3 ld1 {v17.8b}, [x0], x3
ld1 {v18.8B}, [x0], x3 ld1 {v18.8b}, [x0], x3
urhadd v0.8B, v0.8B, v16.8B urhadd v0.8b, v0.8b, v16.8b
ld1 {v19.8B}, [x0], x3 ld1 {v19.8b}, [x0], x3
urhadd v1.8B, v1.8B, v17.8B urhadd v1.8b, v1.8b, v17.8b
ld1 {v20.8B}, [x0], x3 ld1 {v20.8b}, [x0], x3
urhadd v2.8B, v2.8B, v18.8B urhadd v2.8b, v2.8b, v18.8b
ld1 {v21.8B}, [x0], x3 ld1 {v21.8b}, [x0], x3
urhadd v3.8B, v3.8B, v19.8B urhadd v3.8b, v3.8b, v19.8b
ld1 {v22.8B}, [x0], x3 ld1 {v22.8b}, [x0], x3
urhadd v4.8B, v4.8B, v20.8B urhadd v4.8b, v4.8b, v20.8b
ld1 {v23.8B}, [x0], x3 ld1 {v23.8b}, [x0], x3
urhadd v5.8B, v5.8B, v21.8B urhadd v5.8b, v5.8b, v21.8b
urhadd v6.8B, v6.8B, v22.8B urhadd v6.8b, v6.8b, v22.8b
urhadd v7.8B, v7.8B, v23.8B urhadd v7.8b, v7.8b, v23.8b
sub x0, x0, x3, lsl #3 sub x0, x0, x3, lsl #3
.endif .endif
st1 {v0.8B}, [x0], x3 st1 {v0.8b}, [x0], x3
st1 {v1.8B}, [x0], x3 st1 {v1.8b}, [x0], x3
st1 {v2.8B}, [x0], x3 st1 {v2.8b}, [x0], x3
st1 {v3.8B}, [x0], x3 st1 {v3.8b}, [x0], x3
st1 {v4.8B}, [x0], x3 st1 {v4.8b}, [x0], x3
st1 {v5.8B}, [x0], x3 st1 {v5.8b}, [x0], x3
st1 {v6.8B}, [x0], x3 st1 {v6.8b}, [x0], x3
st1 {v7.8B}, [x0], x3 st1 {v7.8b}, [x0], x3
ret x10 ret x10
endfunc endfunc
@ -26,295 +26,295 @@
.if \avg .if \avg
mov x12, x0 mov x12, x0
.endif .endif
1: ld1 {v0.16B}, [x1], x2 1: ld1 {v0.16b}, [x1], x2
ld1 {v1.16B}, [x1], x2 ld1 {v1.16b}, [x1], x2
ld1 {v2.16B}, [x1], x2 ld1 {v2.16b}, [x1], x2
ld1 {v3.16B}, [x1], x2 ld1 {v3.16b}, [x1], x2
.if \avg .if \avg
ld1 {v4.16B}, [x12], x2 ld1 {v4.16b}, [x12], x2
urhadd v0.16B, v0.16B, v4.16B urhadd v0.16b, v0.16b, v4.16b
ld1 {v5.16B}, [x12], x2 ld1 {v5.16b}, [x12], x2
urhadd v1.16B, v1.16B, v5.16B urhadd v1.16b, v1.16b, v5.16b
ld1 {v6.16B}, [x12], x2 ld1 {v6.16b}, [x12], x2
urhadd v2.16B, v2.16B, v6.16B urhadd v2.16b, v2.16b, v6.16b
ld1 {v7.16B}, [x12], x2 ld1 {v7.16b}, [x12], x2
urhadd v3.16B, v3.16B, v7.16B urhadd v3.16b, v3.16b, v7.16b
.endif .endif
subs w3, w3, #4 subs w3, w3, #4
st1 {v0.16B}, [x0], x2 st1 {v0.16b}, [x0], x2
st1 {v1.16B}, [x0], x2 st1 {v1.16b}, [x0], x2
st1 {v2.16B}, [x0], x2 st1 {v2.16b}, [x0], x2
st1 {v3.16B}, [x0], x2 st1 {v3.16b}, [x0], x2
b.ne 1b b.ne 1b
ret ret
.endm .endm
.macro pixels16_x2 rnd=1, avg=0 .macro pixels16_x2 rnd=1, avg=0
1: ld1 {v0.16B, v1.16B}, [x1], x2 1: ld1 {v0.16b, v1.16b}, [x1], x2
ld1 {v2.16B, v3.16B}, [x1], x2 ld1 {v2.16b, v3.16b}, [x1], x2
subs w3, w3, #2 subs w3, w3, #2
ext v1.16B, v0.16B, v1.16B, #1 ext v1.16b, v0.16b, v1.16b, #1
avg v0.16B, v0.16B, v1.16B avg v0.16b, v0.16b, v1.16b
ext v3.16B, v2.16B, v3.16B, #1 ext v3.16b, v2.16b, v3.16b, #1
avg v2.16B, v2.16B, v3.16B avg v2.16b, v2.16b, v3.16b
.if \avg .if \avg
ld1 {v1.16B}, [x0], x2 ld1 {v1.16b}, [x0], x2
ld1 {v3.16B}, [x0] ld1 {v3.16b}, [x0]
urhadd v0.16B, v0.16B, v1.16B urhadd v0.16b, v0.16b, v1.16b
urhadd v2.16B, v2.16B, v3.16B urhadd v2.16b, v2.16b, v3.16b
sub x0, x0, x2 sub x0, x0, x2
.endif .endif
st1 {v0.16B}, [x0], x2 st1 {v0.16b}, [x0], x2
st1 {v2.16B}, [x0], x2 st1 {v2.16b}, [x0], x2
b.ne 1b b.ne 1b
ret ret
.endm .endm
.macro pixels16_y2 rnd=1, avg=0 .macro pixels16_y2 rnd=1, avg=0
sub w3, w3, #2 sub w3, w3, #2
ld1 {v0.16B}, [x1], x2 ld1 {v0.16b}, [x1], x2
ld1 {v1.16B}, [x1], x2 ld1 {v1.16b}, [x1], x2
1: subs w3, w3, #2 1: subs w3, w3, #2
avg v2.16B, v0.16B, v1.16B avg v2.16b, v0.16b, v1.16b
ld1 {v0.16B}, [x1], x2 ld1 {v0.16b}, [x1], x2
avg v3.16B, v0.16B, v1.16B avg v3.16b, v0.16b, v1.16b
ld1 {v1.16B}, [x1], x2 ld1 {v1.16b}, [x1], x2
.if \avg .if \avg
ld1 {v4.16B}, [x0], x2 ld1 {v4.16b}, [x0], x2
ld1 {v5.16B}, [x0] ld1 {v5.16b}, [x0]
urhadd v2.16B, v2.16B, v4.16B urhadd v2.16b, v2.16b, v4.16b
urhadd v3.16B, v3.16B, v5.16B urhadd v3.16b, v3.16b, v5.16b
sub x0, x0, x2 sub x0, x0, x2
.endif .endif
st1 {v2.16B}, [x0], x2 st1 {v2.16b}, [x0], x2
st1 {v3.16B}, [x0], x2 st1 {v3.16b}, [x0], x2
b.ne 1b b.ne 1b
avg v2.16B, v0.16B, v1.16B avg v2.16b, v0.16b, v1.16b
ld1 {v0.16B}, [x1], x2 ld1 {v0.16b}, [x1], x2
avg v3.16B, v0.16B, v1.16B avg v3.16b, v0.16b, v1.16b
.if \avg .if \avg
ld1 {v4.16B}, [x0], x2 ld1 {v4.16b}, [x0], x2
ld1 {v5.16B}, [x0] ld1 {v5.16b}, [x0]
urhadd v2.16B, v2.16B, v4.16B urhadd v2.16b, v2.16b, v4.16b
urhadd v3.16B, v3.16B, v5.16B urhadd v3.16b, v3.16b, v5.16b
sub x0, x0, x2 sub x0, x0, x2
.endif .endif
st1 {v2.16B}, [x0], x2 st1 {v2.16b}, [x0], x2
st1 {v3.16B}, [x0], x2 st1 {v3.16b}, [x0], x2
ret ret
.endm .endm
.macro pixels16_xy2 rnd=1, avg=0 .macro pixels16_xy2 rnd=1, avg=0
sub w3, w3, #2 sub w3, w3, #2
ld1 {v0.16B, v1.16B}, [x1], x2 ld1 {v0.16b, v1.16b}, [x1], x2
ld1 {v4.16B, v5.16B}, [x1], x2 ld1 {v4.16b, v5.16b}, [x1], x2
NRND movi v26.8H, #1 NRND movi v26.8H, #1
ext v1.16B, v0.16B, v1.16B, #1 ext v1.16b, v0.16b, v1.16b, #1
ext v5.16B, v4.16B, v5.16B, #1 ext v5.16b, v4.16b, v5.16b, #1
uaddl v16.8H, v0.8B, v1.8B uaddl v16.8h, v0.8b, v1.8b
uaddl2 v20.8H, v0.16B, v1.16B uaddl2 v20.8h, v0.16b, v1.16b
uaddl v18.8H, v4.8B, v5.8B uaddl v18.8h, v4.8b, v5.8b
uaddl2 v22.8H, v4.16B, v5.16B uaddl2 v22.8h, v4.16b, v5.16b
1: subs w3, w3, #2 1: subs w3, w3, #2
ld1 {v0.16B, v1.16B}, [x1], x2 ld1 {v0.16b, v1.16b}, [x1], x2
add v24.8H, v16.8H, v18.8H add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H NRND add v24.8H, v24.8H, v26.8H
ext v30.16B, v0.16B, v1.16B, #1 ext v30.16b, v0.16b, v1.16b, #1
add v1.8H, v20.8H, v22.8H add v1.8h, v20.8h, v22.8h
mshrn v28.8B, v24.8H, #2 mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H NRND add v1.8H, v1.8H, v26.8H
mshrn2 v28.16B, v1.8H, #2 mshrn2 v28.16b, v1.8h, #2
.if \avg .if \avg
ld1 {v16.16B}, [x0] ld1 {v16.16b}, [x0]
urhadd v28.16B, v28.16B, v16.16B urhadd v28.16b, v28.16b, v16.16b
.endif .endif
uaddl v16.8H, v0.8B, v30.8B uaddl v16.8h, v0.8b, v30.8b
ld1 {v2.16B, v3.16B}, [x1], x2 ld1 {v2.16b, v3.16b}, [x1], x2
uaddl2 v20.8H, v0.16B, v30.16B uaddl2 v20.8h, v0.16b, v30.16b
st1 {v28.16B}, [x0], x2 st1 {v28.16b}, [x0], x2
add v24.8H, v16.8H, v18.8H add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H NRND add v24.8H, v24.8H, v26.8H
ext v3.16B, v2.16B, v3.16B, #1 ext v3.16b, v2.16b, v3.16b, #1
add v0.8H, v20.8H, v22.8H add v0.8h, v20.8h, v22.8h
mshrn v30.8B, v24.8H, #2 mshrn v30.8b, v24.8h, #2
NRND add v0.8H, v0.8H, v26.8H NRND add v0.8H, v0.8H, v26.8H
mshrn2 v30.16B, v0.8H, #2 mshrn2 v30.16b, v0.8h, #2
.if \avg .if \avg
ld1 {v18.16B}, [x0] ld1 {v18.16b}, [x0]
urhadd v30.16B, v30.16B, v18.16B urhadd v30.16b, v30.16b, v18.16b
.endif .endif
uaddl v18.8H, v2.8B, v3.8B uaddl v18.8h, v2.8b, v3.8b
uaddl2 v22.8H, v2.16B, v3.16B uaddl2 v22.8h, v2.16b, v3.16b
st1 {v30.16B}, [x0], x2 st1 {v30.16b}, [x0], x2
b.gt 1b b.gt 1b
ld1 {v0.16B, v1.16B}, [x1], x2 ld1 {v0.16b, v1.16b}, [x1], x2
add v24.8H, v16.8H, v18.8H add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H NRND add v24.8H, v24.8H, v26.8H
ext v30.16B, v0.16B, v1.16B, #1 ext v30.16b, v0.16b, v1.16b, #1
add v1.8H, v20.8H, v22.8H add v1.8h, v20.8h, v22.8h
mshrn v28.8B, v24.8H, #2 mshrn v28.8b, v24.8h, #2
NRND add v1.8H, v1.8H, v26.8H NRND add v1.8H, v1.8H, v26.8H
mshrn2 v28.16B, v1.8H, #2 mshrn2 v28.16b, v1.8h, #2
.if \avg .if \avg
ld1 {v16.16B}, [x0] ld1 {v16.16b}, [x0]
urhadd v28.16B, v28.16B, v16.16B urhadd v28.16b, v28.16b, v16.16b
.endif .endif
uaddl v16.8H, v0.8B, v30.8B uaddl v16.8h, v0.8b, v30.8b
uaddl2 v20.8H, v0.16B, v30.16B uaddl2 v20.8h, v0.16b, v30.16b
st1 {v28.16B}, [x0], x2 st1 {v28.16b}, [x0], x2
add v24.8H, v16.8H, v18.8H add v24.8h, v16.8h, v18.8h
NRND add v24.8H, v24.8H, v26.8H NRND add v24.8H, v24.8H, v26.8H
add v0.8H, v20.8H, v22.8H add v0.8h, v20.8h, v22.8h
mshrn v30.8B, v24.8H, #2 mshrn v30.8b, v24.8h, #2
NRND add v0.8H, v0.8H, v26.8H NRND add v0.8H, v0.8H, v26.8H
mshrn2 v30.16B, v0.8H, #2 mshrn2 v30.16b, v0.8h, #2
.if \avg .if \avg
ld1 {v18.16B}, [x0] ld1 {v18.16b}, [x0]
urhadd v30.16B, v30.16B, v18.16B urhadd v30.16b, v30.16b, v18.16b
.endif .endif
st1 {v30.16B}, [x0], x2 st1 {v30.16b}, [x0], x2
ret ret
.endm .endm
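// The pixels16_xy2/pixels8_xy2 macros compute the four-pixel bilinear
// average (a + b + c + d + 2) >> 2: uaddl/uaddl2 sum horizontally adjacent
// bytes into 16 bits, two successive rows are added, and mshrn narrows
// with a rounding shift by 2. In the no-rounding variants the
// NRND-prefixed adds are presumably assembled instead, giving
// (a + b + c + d + 1) >> 2 with a plain (non-rounding) narrowing shift.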
.macro pixels8 rnd=1, avg=0 .macro pixels8 rnd=1, avg=0
1: ld1 {v0.8B}, [x1], x2 1: ld1 {v0.8b}, [x1], x2
ld1 {v1.8B}, [x1], x2 ld1 {v1.8b}, [x1], x2
ld1 {v2.8B}, [x1], x2 ld1 {v2.8b}, [x1], x2
ld1 {v3.8B}, [x1], x2 ld1 {v3.8b}, [x1], x2
.if \avg .if \avg
ld1 {v4.8B}, [x0], x2 ld1 {v4.8b}, [x0], x2
urhadd v0.8B, v0.8B, v4.8B urhadd v0.8b, v0.8b, v4.8b
ld1 {v5.8B}, [x0], x2 ld1 {v5.8b}, [x0], x2
urhadd v1.8B, v1.8B, v5.8B urhadd v1.8b, v1.8b, v5.8b
ld1 {v6.8B}, [x0], x2 ld1 {v6.8b}, [x0], x2
urhadd v2.8B, v2.8B, v6.8B urhadd v2.8b, v2.8b, v6.8b
ld1 {v7.8B}, [x0], x2 ld1 {v7.8b}, [x0], x2
urhadd v3.8B, v3.8B, v7.8B urhadd v3.8b, v3.8b, v7.8b
sub x0, x0, x2, lsl #2 sub x0, x0, x2, lsl #2
.endif .endif
subs w3, w3, #4 subs w3, w3, #4
st1 {v0.8B}, [x0], x2 st1 {v0.8b}, [x0], x2
st1 {v1.8B}, [x0], x2 st1 {v1.8b}, [x0], x2
st1 {v2.8B}, [x0], x2 st1 {v2.8b}, [x0], x2
st1 {v3.8B}, [x0], x2 st1 {v3.8b}, [x0], x2
b.ne 1b b.ne 1b
ret ret
.endm .endm
.macro pixels8_x2 rnd=1, avg=0 .macro pixels8_x2 rnd=1, avg=0
1: ld1 {v0.8B, v1.8B}, [x1], x2 1: ld1 {v0.8b, v1.8b}, [x1], x2
ext v1.8B, v0.8B, v1.8B, #1 ext v1.8b, v0.8b, v1.8b, #1
ld1 {v2.8B, v3.8B}, [x1], x2 ld1 {v2.8b, v3.8b}, [x1], x2
ext v3.8B, v2.8B, v3.8B, #1 ext v3.8b, v2.8b, v3.8b, #1
subs w3, w3, #2 subs w3, w3, #2
avg v0.8B, v0.8B, v1.8B avg v0.8b, v0.8b, v1.8b
avg v2.8B, v2.8B, v3.8B avg v2.8b, v2.8b, v3.8b
.if \avg .if \avg
ld1 {v4.8B}, [x0], x2 ld1 {v4.8b}, [x0], x2
ld1 {v5.8B}, [x0] ld1 {v5.8b}, [x0]
urhadd v0.8B, v0.8B, v4.8B urhadd v0.8b, v0.8b, v4.8b
urhadd v2.8B, v2.8B, v5.8B urhadd v2.8b, v2.8b, v5.8b
sub x0, x0, x2 sub x0, x0, x2
.endif .endif
st1 {v0.8B}, [x0], x2 st1 {v0.8b}, [x0], x2
st1 {v2.8B}, [x0], x2 st1 {v2.8b}, [x0], x2
b.ne 1b b.ne 1b
ret ret
.endm .endm
.macro pixels8_y2 rnd=1, avg=0 .macro pixels8_y2 rnd=1, avg=0
sub w3, w3, #2 sub w3, w3, #2
ld1 {v0.8B}, [x1], x2 ld1 {v0.8b}, [x1], x2
ld1 {v1.8B}, [x1], x2 ld1 {v1.8b}, [x1], x2
1: subs w3, w3, #2 1: subs w3, w3, #2
avg v4.8B, v0.8B, v1.8B avg v4.8b, v0.8b, v1.8b
ld1 {v0.8B}, [x1], x2 ld1 {v0.8b}, [x1], x2
avg v5.8B, v0.8B, v1.8B avg v5.8b, v0.8b, v1.8b
ld1 {v1.8B}, [x1], x2 ld1 {v1.8b}, [x1], x2
.if \avg .if \avg
ld1 {v2.8B}, [x0], x2 ld1 {v2.8b}, [x0], x2
ld1 {v3.8B}, [x0] ld1 {v3.8b}, [x0]
urhadd v4.8B, v4.8B, v2.8B urhadd v4.8b, v4.8b, v2.8b
urhadd v5.8B, v5.8B, v3.8B urhadd v5.8b, v5.8b, v3.8b
sub x0, x0, x2 sub x0, x0, x2
.endif .endif
st1 {v4.8B}, [x0], x2 st1 {v4.8b}, [x0], x2
st1 {v5.8B}, [x0], x2 st1 {v5.8b}, [x0], x2
b.ne 1b b.ne 1b
avg v4.8B, v0.8B, v1.8B avg v4.8b, v0.8b, v1.8b
ld1 {v0.8B}, [x1], x2 ld1 {v0.8b}, [x1], x2
avg v5.8B, v0.8B, v1.8B avg v5.8b, v0.8b, v1.8b
.if \avg .if \avg
ld1 {v2.8B}, [x0], x2 ld1 {v2.8b}, [x0], x2
ld1 {v3.8B}, [x0] ld1 {v3.8b}, [x0]
urhadd v4.8B, v4.8B, v2.8B urhadd v4.8b, v4.8b, v2.8b
urhadd v5.8B, v5.8B, v3.8B urhadd v5.8b, v5.8b, v3.8b
sub x0, x0, x2 sub x0, x0, x2
.endif .endif
st1 {v4.8B}, [x0], x2 st1 {v4.8b}, [x0], x2
st1 {v5.8B}, [x0], x2 st1 {v5.8b}, [x0], x2
ret ret
.endm .endm
.macro pixels8_xy2 rnd=1, avg=0 .macro pixels8_xy2 rnd=1, avg=0
sub w3, w3, #2 sub w3, w3, #2
ld1 {v0.16B}, [x1], x2 ld1 {v0.16b}, [x1], x2
ld1 {v1.16B}, [x1], x2 ld1 {v1.16b}, [x1], x2
NRND movi v19.8H, #1 NRND movi v19.8H, #1
ext v4.16B, v0.16B, v4.16B, #1 ext v4.16b, v0.16b, v4.16b, #1
ext v6.16B, v1.16B, v6.16B, #1 ext v6.16b, v1.16b, v6.16b, #1
uaddl v16.8H, v0.8B, v4.8B uaddl v16.8h, v0.8b, v4.8b
uaddl v17.8H, v1.8B, v6.8B uaddl v17.8h, v1.8b, v6.8b
1: subs w3, w3, #2 1: subs w3, w3, #2
ld1 {v0.16B}, [x1], x2 ld1 {v0.16b}, [x1], x2
add v18.8H, v16.8H, v17.8H add v18.8h, v16.8h, v17.8h
ext v4.16B, v0.16B, v4.16B, #1 ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8H, v0.8B, v4.8B uaddl v16.8h, v0.8b, v4.8b
mshrn v5.8B, v18.8H, #2 mshrn v5.8b, v18.8h, #2
ld1 {v1.16B}, [x1], x2 ld1 {v1.16b}, [x1], x2
add v18.8H, v16.8H, v17.8H add v18.8h, v16.8h, v17.8h
.if \avg .if \avg
ld1 {v7.8B}, [x0] ld1 {v7.8b}, [x0]
urhadd v5.8B, v5.8B, v7.8B urhadd v5.8b, v5.8b, v7.8b
.endif .endif
NRND add v18.8H, v18.8H, v19.8H NRND add v18.8H, v18.8H, v19.8H
st1 {v5.8B}, [x0], x2 st1 {v5.8b}, [x0], x2
mshrn v7.8B, v18.8H, #2 mshrn v7.8b, v18.8h, #2
.if \avg .if \avg
ld1 {v5.8B}, [x0] ld1 {v5.8b}, [x0]
urhadd v7.8B, v7.8B, v5.8B urhadd v7.8b, v7.8b, v5.8b
.endif .endif
ext v6.16B, v1.16B, v6.16B, #1 ext v6.16b, v1.16b, v6.16b, #1
uaddl v17.8H, v1.8B, v6.8B uaddl v17.8h, v1.8b, v6.8b
st1 {v7.8B}, [x0], x2 st1 {v7.8b}, [x0], x2
b.gt 1b b.gt 1b
ld1 {v0.16B}, [x1], x2 ld1 {v0.16b}, [x1], x2
add v18.8H, v16.8H, v17.8H add v18.8h, v16.8h, v17.8h
ext v4.16B, v0.16B, v4.16B, #1 ext v4.16b, v0.16b, v4.16b, #1
NRND add v18.8H, v18.8H, v19.8H NRND add v18.8H, v18.8H, v19.8H
uaddl v16.8H, v0.8B, v4.8B uaddl v16.8h, v0.8b, v4.8b
mshrn v5.8B, v18.8H, #2 mshrn v5.8b, v18.8h, #2
add v18.8H, v16.8H, v17.8H add v18.8h, v16.8h, v17.8h
.if \avg .if \avg
ld1 {v7.8B}, [x0] ld1 {v7.8b}, [x0]
urhadd v5.8B, v5.8B, v7.8B urhadd v5.8b, v5.8b, v7.8b
.endif .endif
NRND add v18.8H, v18.8H, v19.8H NRND add v18.8H, v18.8H, v19.8H
st1 {v5.8B}, [x0], x2 st1 {v5.8b}, [x0], x2
mshrn v7.8B, v18.8H, #2 mshrn v7.8b, v18.8h, #2
.if \avg .if \avg
ld1 {v5.8B}, [x0] ld1 {v5.8b}, [x0]
urhadd v7.8B, v7.8B, v5.8B urhadd v7.8b, v7.8b, v5.8b
.endif .endif
st1 {v7.8B}, [x0], x2 st1 {v7.8b}, [x0], x2
ret ret
.endm .endm
@ -1099,7 +1099,7 @@ function vsse_intra16_neon, export=1
cbnz w4, 2b cbnz w4, 2b
3: 3:
add v16.4s, v16.4s, v17.4S add v16.4s, v16.4s, v17.4s
uaddlv d17, v16.4s uaddlv d17, v16.4s
fmov w0, s17 fmov w0, s17
@ -28,146 +28,146 @@
.endm .endm
.macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 .macro transpose_8x8B r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8B, \r0\().8B, \r1\().8B trn1 \r8\().8b, \r0\().8b, \r1\().8b
trn2 \r9\().8B, \r0\().8B, \r1\().8B trn2 \r9\().8b, \r0\().8b, \r1\().8b
trn1 \r1\().8B, \r2\().8B, \r3\().8B trn1 \r1\().8b, \r2\().8b, \r3\().8b
trn2 \r3\().8B, \r2\().8B, \r3\().8B trn2 \r3\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().8B, \r4\().8B, \r5\().8B trn1 \r0\().8b, \r4\().8b, \r5\().8b
trn2 \r5\().8B, \r4\().8B, \r5\().8B trn2 \r5\().8b, \r4\().8b, \r5\().8b
trn1 \r2\().8B, \r6\().8B, \r7\().8B trn1 \r2\().8b, \r6\().8b, \r7\().8b
trn2 \r7\().8B, \r6\().8B, \r7\().8B trn2 \r7\().8b, \r6\().8b, \r7\().8b
trn1 \r4\().4H, \r0\().4H, \r2\().4H trn1 \r4\().4h, \r0\().4h, \r2\().4h
trn2 \r2\().4H, \r0\().4H, \r2\().4H trn2 \r2\().4h, \r0\().4h, \r2\().4h
trn1 \r6\().4H, \r5\().4H, \r7\().4H trn1 \r6\().4h, \r5\().4h, \r7\().4h
trn2 \r7\().4H, \r5\().4H, \r7\().4H trn2 \r7\().4h, \r5\().4h, \r7\().4h
trn1 \r5\().4H, \r9\().4H, \r3\().4H trn1 \r5\().4h, \r9\().4h, \r3\().4h
trn2 \r9\().4H, \r9\().4H, \r3\().4H trn2 \r9\().4h, \r9\().4h, \r3\().4h
trn1 \r3\().4H, \r8\().4H, \r1\().4H trn1 \r3\().4h, \r8\().4h, \r1\().4h
trn2 \r8\().4H, \r8\().4H, \r1\().4H trn2 \r8\().4h, \r8\().4h, \r1\().4h
trn1 \r0\().2S, \r3\().2S, \r4\().2S trn1 \r0\().2s, \r3\().2s, \r4\().2s
trn2 \r4\().2S, \r3\().2S, \r4\().2S trn2 \r4\().2s, \r3\().2s, \r4\().2s
trn1 \r1\().2S, \r5\().2S, \r6\().2S trn1 \r1\().2s, \r5\().2s, \r6\().2s
trn2 \r5\().2S, \r5\().2S, \r6\().2S trn2 \r5\().2s, \r5\().2s, \r6\().2s
trn2 \r6\().2S, \r8\().2S, \r2\().2S trn2 \r6\().2s, \r8\().2s, \r2\().2s
trn1 \r2\().2S, \r8\().2S, \r2\().2S trn1 \r2\().2s, \r8\().2s, \r2\().2s
trn1 \r3\().2S, \r9\().2S, \r7\().2S trn1 \r3\().2s, \r9\().2s, \r7\().2s
trn2 \r7\().2S, \r9\().2S, \r7\().2S trn2 \r7\().2s, \r9\().2s, \r7\().2s
.endm .endm
.macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 .macro transpose_8x16B r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
trn1 \t0\().16B, \r0\().16B, \r1\().16B trn1 \t0\().16b, \r0\().16b, \r1\().16b
trn2 \t1\().16B, \r0\().16B, \r1\().16B trn2 \t1\().16b, \r0\().16b, \r1\().16b
trn1 \r1\().16B, \r2\().16B, \r3\().16B trn1 \r1\().16b, \r2\().16b, \r3\().16b
trn2 \r3\().16B, \r2\().16B, \r3\().16B trn2 \r3\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().16B, \r4\().16B, \r5\().16B trn1 \r0\().16b, \r4\().16b, \r5\().16b
trn2 \r5\().16B, \r4\().16B, \r5\().16B trn2 \r5\().16b, \r4\().16b, \r5\().16b
trn1 \r2\().16B, \r6\().16B, \r7\().16B trn1 \r2\().16b, \r6\().16b, \r7\().16b
trn2 \r7\().16B, \r6\().16B, \r7\().16B trn2 \r7\().16b, \r6\().16b, \r7\().16b
trn1 \r4\().8H, \r0\().8H, \r2\().8H trn1 \r4\().8h, \r0\().8h, \r2\().8h
trn2 \r2\().8H, \r0\().8H, \r2\().8H trn2 \r2\().8h, \r0\().8h, \r2\().8h
trn1 \r6\().8H, \r5\().8H, \r7\().8H trn1 \r6\().8h, \r5\().8h, \r7\().8h
trn2 \r7\().8H, \r5\().8H, \r7\().8H trn2 \r7\().8h, \r5\().8h, \r7\().8h
trn1 \r5\().8H, \t1\().8H, \r3\().8H trn1 \r5\().8h, \t1\().8h, \r3\().8h
trn2 \t1\().8H, \t1\().8H, \r3\().8H trn2 \t1\().8h, \t1\().8h, \r3\().8h
trn1 \r3\().8H, \t0\().8H, \r1\().8H trn1 \r3\().8h, \t0\().8h, \r1\().8h
trn2 \t0\().8H, \t0\().8H, \r1\().8H trn2 \t0\().8h, \t0\().8h, \r1\().8h
trn1 \r0\().4S, \r3\().4S, \r4\().4S trn1 \r0\().4s, \r3\().4s, \r4\().4s
trn2 \r4\().4S, \r3\().4S, \r4\().4S trn2 \r4\().4s, \r3\().4s, \r4\().4s
trn1 \r1\().4S, \r5\().4S, \r6\().4S trn1 \r1\().4s, \r5\().4s, \r6\().4s
trn2 \r5\().4S, \r5\().4S, \r6\().4S trn2 \r5\().4s, \r5\().4s, \r6\().4s
trn2 \r6\().4S, \t0\().4S, \r2\().4S trn2 \r6\().4s, \t0\().4s, \r2\().4s
trn1 \r2\().4S, \t0\().4S, \r2\().4S trn1 \r2\().4s, \t0\().4s, \r2\().4s
trn1 \r3\().4S, \t1\().4S, \r7\().4S trn1 \r3\().4s, \t1\().4s, \r7\().4s
trn2 \r7\().4S, \t1\().4S, \r7\().4S trn2 \r7\().4s, \t1\().4s, \r7\().4s
.endm .endm
.macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7 .macro transpose_4x16B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().16B, \r0\().16B, \r1\().16B trn1 \t4\().16b, \r0\().16b, \r1\().16b
trn2 \t5\().16B, \r0\().16B, \r1\().16B trn2 \t5\().16b, \r0\().16b, \r1\().16b
trn1 \t6\().16B, \r2\().16B, \r3\().16B trn1 \t6\().16b, \r2\().16b, \r3\().16b
trn2 \t7\().16B, \r2\().16B, \r3\().16B trn2 \t7\().16b, \r2\().16b, \r3\().16b
trn1 \r0\().8H, \t4\().8H, \t6\().8H trn1 \r0\().8h, \t4\().8h, \t6\().8h
trn2 \r2\().8H, \t4\().8H, \t6\().8H trn2 \r2\().8h, \t4\().8h, \t6\().8h
trn1 \r1\().8H, \t5\().8H, \t7\().8H trn1 \r1\().8h, \t5\().8h, \t7\().8h
trn2 \r3\().8H, \t5\().8H, \t7\().8H trn2 \r3\().8h, \t5\().8h, \t7\().8h
.endm .endm
.macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7 .macro transpose_4x8B r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8B, \r0\().8B, \r1\().8B trn1 \t4\().8b, \r0\().8b, \r1\().8b
trn2 \t5\().8B, \r0\().8B, \r1\().8B trn2 \t5\().8b, \r0\().8b, \r1\().8b
trn1 \t6\().8B, \r2\().8B, \r3\().8B trn1 \t6\().8b, \r2\().8b, \r3\().8b
trn2 \t7\().8B, \r2\().8B, \r3\().8B trn2 \t7\().8b, \r2\().8b, \r3\().8b
trn1 \r0\().4H, \t4\().4H, \t6\().4H trn1 \r0\().4h, \t4\().4h, \t6\().4h
trn2 \r2\().4H, \t4\().4H, \t6\().4H trn2 \r2\().4h, \t4\().4h, \t6\().4h
trn1 \r1\().4H, \t5\().4H, \t7\().4H trn1 \r1\().4h, \t5\().4h, \t7\().4h
trn2 \r3\().4H, \t5\().4H, \t7\().4H trn2 \r3\().4h, \t5\().4h, \t7\().4h
.endm .endm
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7 .macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
trn1 \r4\().4H, \r0\().4H, \r1\().4H trn1 \r4\().4h, \r0\().4h, \r1\().4h
trn2 \r5\().4H, \r0\().4H, \r1\().4H trn2 \r5\().4h, \r0\().4h, \r1\().4h
trn1 \r6\().4H, \r2\().4H, \r3\().4H trn1 \r6\().4h, \r2\().4h, \r3\().4h
trn2 \r7\().4H, \r2\().4H, \r3\().4H trn2 \r7\().4h, \r2\().4h, \r3\().4h
trn1 \r0\().2S, \r4\().2S, \r6\().2S trn1 \r0\().2s, \r4\().2s, \r6\().2s
trn2 \r2\().2S, \r4\().2S, \r6\().2S trn2 \r2\().2s, \r4\().2s, \r6\().2s
trn1 \r1\().2S, \r5\().2S, \r7\().2S trn1 \r1\().2s, \r5\().2s, \r7\().2s
trn2 \r3\().2S, \r5\().2S, \r7\().2S trn2 \r3\().2s, \r5\().2s, \r7\().2s
.endm .endm
.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7 .macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8H, \r0\().8H, \r1\().8H trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8H, \r0\().8H, \r1\().8H trn2 \t5\().8h, \r0\().8h, \r1\().8h
trn1 \t6\().8H, \r2\().8H, \r3\().8H trn1 \t6\().8h, \r2\().8h, \r3\().8h
trn2 \t7\().8H, \r2\().8H, \r3\().8H trn2 \t7\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().4S, \t4\().4S, \t6\().4S trn1 \r0\().4s, \t4\().4s, \t6\().4s
trn2 \r2\().4S, \t4\().4S, \t6\().4S trn2 \r2\().4s, \t4\().4s, \t6\().4s
trn1 \r1\().4S, \t5\().4S, \t7\().4S trn1 \r1\().4s, \t5\().4s, \t7\().4s
trn2 \r3\().4S, \t5\().4S, \t7\().4S trn2 \r3\().4s, \t5\().4s, \t7\().4s
.endm .endm
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 .macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
trn1 \r8\().8H, \r0\().8H, \r1\().8H trn1 \r8\().8h, \r0\().8h, \r1\().8h
trn2 \r9\().8H, \r0\().8H, \r1\().8H trn2 \r9\().8h, \r0\().8h, \r1\().8h
trn1 \r1\().8H, \r2\().8H, \r3\().8H trn1 \r1\().8h, \r2\().8h, \r3\().8h
trn2 \r3\().8H, \r2\().8H, \r3\().8H trn2 \r3\().8h, \r2\().8h, \r3\().8h
trn1 \r0\().8H, \r4\().8H, \r5\().8H trn1 \r0\().8h, \r4\().8h, \r5\().8h
trn2 \r5\().8H, \r4\().8H, \r5\().8H trn2 \r5\().8h, \r4\().8h, \r5\().8h
trn1 \r2\().8H, \r6\().8H, \r7\().8H trn1 \r2\().8h, \r6\().8h, \r7\().8h
trn2 \r7\().8H, \r6\().8H, \r7\().8H trn2 \r7\().8h, \r6\().8h, \r7\().8h
trn1 \r4\().4S, \r0\().4S, \r2\().4S trn1 \r4\().4s, \r0\().4s, \r2\().4s
trn2 \r2\().4S, \r0\().4S, \r2\().4S trn2 \r2\().4s, \r0\().4s, \r2\().4s
trn1 \r6\().4S, \r5\().4S, \r7\().4S trn1 \r6\().4s, \r5\().4s, \r7\().4s
trn2 \r7\().4S, \r5\().4S, \r7\().4S trn2 \r7\().4s, \r5\().4s, \r7\().4s
trn1 \r5\().4S, \r9\().4S, \r3\().4S trn1 \r5\().4s, \r9\().4s, \r3\().4s
trn2 \r9\().4S, \r9\().4S, \r3\().4S trn2 \r9\().4s, \r9\().4s, \r3\().4s
trn1 \r3\().4S, \r8\().4S, \r1\().4S trn1 \r3\().4s, \r8\().4s, \r1\().4s
trn2 \r8\().4S, \r8\().4S, \r1\().4S trn2 \r8\().4s, \r8\().4s, \r1\().4s
trn1 \r0\().2D, \r3\().2D, \r4\().2D trn1 \r0\().2d, \r3\().2d, \r4\().2d
trn2 \r4\().2D, \r3\().2D, \r4\().2D trn2 \r4\().2d, \r3\().2d, \r4\().2d
trn1 \r1\().2D, \r5\().2D, \r6\().2D trn1 \r1\().2d, \r5\().2d, \r6\().2d
trn2 \r5\().2D, \r5\().2D, \r6\().2D trn2 \r5\().2d, \r5\().2d, \r6\().2d
trn2 \r6\().2D, \r8\().2D, \r2\().2D trn2 \r6\().2d, \r8\().2d, \r2\().2d
trn1 \r2\().2D, \r8\().2D, \r2\().2D trn1 \r2\().2d, \r8\().2d, \r2\().2d
trn1 \r3\().2D, \r9\().2D, \r7\().2D trn1 \r3\().2d, \r9\().2d, \r7\().2d
trn2 \r7\().2D, \r9\().2D, \r7\().2D trn2 \r7\().2d, \r9\().2d, \r7\().2d
.endm .endm
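// All of the transpose macros above follow the same scheme: trn1/trn2
// interleaves at the narrowest element size first, then at progressively
// wider sizes (e.g. .8h -> .4s -> .2d for transpose_8x8H), so an NxN block
// spread across N registers is transposed in log2(N) passes.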
@ -46,49 +46,49 @@ function ff_sbr_sum64x5_neon, export=1
add x3, x0, #192*4 add x3, x0, #192*4
add x4, x0, #256*4 add x4, x0, #256*4
mov x5, #64 mov x5, #64
1: ld1 {v0.4S}, [x0] 1: ld1 {v0.4s}, [x0]
ld1 {v1.4S}, [x1], #16 ld1 {v1.4s}, [x1], #16
fadd v0.4S, v0.4S, v1.4S fadd v0.4s, v0.4s, v1.4s
ld1 {v2.4S}, [x2], #16 ld1 {v2.4s}, [x2], #16
fadd v0.4S, v0.4S, v2.4S fadd v0.4s, v0.4s, v2.4s
ld1 {v3.4S}, [x3], #16 ld1 {v3.4s}, [x3], #16
fadd v0.4S, v0.4S, v3.4S fadd v0.4s, v0.4s, v3.4s
ld1 {v4.4S}, [x4], #16 ld1 {v4.4s}, [x4], #16
fadd v0.4S, v0.4S, v4.4S fadd v0.4s, v0.4s, v4.4s
st1 {v0.4S}, [x0], #16 st1 {v0.4s}, [x0], #16
subs x5, x5, #4 subs x5, x5, #4
b.gt 1b b.gt 1b
ret ret
endfunc endfunc
function ff_sbr_sum_square_neon, export=1 function ff_sbr_sum_square_neon, export=1
movi v0.4S, #0 movi v0.4s, #0
1: ld1 {v1.4S}, [x0], #16 1: ld1 {v1.4s}, [x0], #16
fmla v0.4S, v1.4S, v1.4S fmla v0.4s, v1.4s, v1.4s
subs w1, w1, #2 subs w1, w1, #2
b.gt 1b b.gt 1b
faddp v0.4S, v0.4S, v0.4S faddp v0.4s, v0.4s, v0.4s
faddp v0.4S, v0.4S, v0.4S faddp v0.4s, v0.4s, v0.4s
ret ret
endfunc endfunc
function ff_sbr_neg_odd_64_neon, export=1 function ff_sbr_neg_odd_64_neon, export=1
mov x1, x0 mov x1, x0
movi v5.4S, #1<<7, lsl #24 movi v5.4s, #1<<7, lsl #24
ld2 {v0.4S, v1.4S}, [x0], #32 ld2 {v0.4s, v1.4s}, [x0], #32
eor v1.16B, v1.16B, v5.16B eor v1.16b, v1.16b, v5.16b
ld2 {v2.4S, v3.4S}, [x0], #32 ld2 {v2.4s, v3.4s}, [x0], #32
.rept 3 .rept 3
st2 {v0.4S, v1.4S}, [x1], #32 st2 {v0.4s, v1.4s}, [x1], #32
eor v3.16B, v3.16B, v5.16B eor v3.16b, v3.16b, v5.16b
ld2 {v0.4S, v1.4S}, [x0], #32 ld2 {v0.4s, v1.4s}, [x0], #32
st2 {v2.4S, v3.4S}, [x1], #32 st2 {v2.4s, v3.4s}, [x1], #32
eor v1.16B, v1.16B, v5.16B eor v1.16b, v1.16b, v5.16b
ld2 {v2.4S, v3.4S}, [x0], #32 ld2 {v2.4s, v3.4s}, [x0], #32
.endr .endr
eor v3.16B, v3.16B, v5.16B eor v3.16b, v3.16b, v5.16b
st2 {v0.4S, v1.4S}, [x1], #32 st2 {v0.4s, v1.4s}, [x1], #32
st2 {v2.4S, v3.4S}, [x1], #32 st2 {v2.4s, v3.4s}, [x1], #32
ret ret
endfunc endfunc
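// Note on the constant used above and below: movi vN.4s, #1<<7, lsl #24
// puts 0x80000000 in every 32-bit lane, so eor-ing a float vector with it
// flips the IEEE-754 sign bit, i.e. it negates the selected (odd/imaginary)
// elements without any floating-point arithmetic.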
@ -97,26 +97,26 @@ function ff_sbr_qmf_pre_shuffle_neon, export=1
add x2, x0, #64*4 add x2, x0, #64*4
mov x3, #-16 mov x3, #-16
mov x4, #-4 mov x4, #-4
movi v6.4S, #1<<7, lsl #24 movi v6.4s, #1<<7, lsl #24
ld1 {v0.2S}, [x0], #8 ld1 {v0.2s}, [x0], #8
st1 {v0.2S}, [x2], #8 st1 {v0.2s}, [x2], #8
.rept 7 .rept 7
ld1 {v1.4S}, [x1], x3 ld1 {v1.4s}, [x1], x3
ld1 {v2.4S}, [x0], #16 ld1 {v2.4s}, [x0], #16
eor v1.16B, v1.16B, v6.16B eor v1.16b, v1.16b, v6.16b
rev64 v1.4S, v1.4S rev64 v1.4s, v1.4s
ext v1.16B, v1.16B, v1.16B, #8 ext v1.16b, v1.16b, v1.16b, #8
st2 {v1.4S, v2.4S}, [x2], #32 st2 {v1.4s, v2.4s}, [x2], #32
.endr .endr
add x1, x1, #8 add x1, x1, #8
ld1 {v1.2S}, [x1], x4 ld1 {v1.2s}, [x1], x4
ld1 {v2.2S}, [x0], #8 ld1 {v2.2s}, [x0], #8
ld1 {v1.S}[3], [x1] ld1 {v1.s}[3], [x1]
ld1 {v2.S}[2], [x0] ld1 {v2.s}[2], [x0]
eor v1.16B, v1.16B, v6.16B eor v1.16b, v1.16b, v6.16b
rev64 v1.4S, v1.4S rev64 v1.4s, v1.4s
st2 {v1.2S, v2.2S}, [x2], #16 st2 {v1.2s, v2.2s}, [x2], #16
st2 {v1.S, v2.S}[2], [x2] st2 {v1.s, v2.s}[2], [x2]
ret ret
endfunc endfunc
@ -124,13 +124,13 @@ function ff_sbr_qmf_post_shuffle_neon, export=1
add x2, x1, #60*4 add x2, x1, #60*4
mov x3, #-16 mov x3, #-16
mov x4, #32 mov x4, #32
movi v6.4S, #1<<7, lsl #24 movi v6.4s, #1<<7, lsl #24
1: ld1 {v0.4S}, [x2], x3 1: ld1 {v0.4s}, [x2], x3
ld1 {v1.4S}, [x1], #16 ld1 {v1.4s}, [x1], #16
eor v0.16B, v0.16B, v6.16B eor v0.16b, v0.16b, v6.16b
rev64 v0.4S, v0.4S rev64 v0.4s, v0.4s
ext v0.16B, v0.16B, v0.16B, #8 ext v0.16b, v0.16b, v0.16b, #8
st2 {v0.4S, v1.4S}, [x0], #32 st2 {v0.4s, v1.4s}, [x0], #32
subs x4, x4, #4 subs x4, x4, #4
b.gt 1b b.gt 1b
ret ret
@ -141,13 +141,13 @@ function ff_sbr_qmf_deint_neg_neon, export=1
add x2, x0, #60*4 add x2, x0, #60*4
mov x3, #-32 mov x3, #-32
mov x4, #32 mov x4, #32
movi v2.4S, #1<<7, lsl #24 movi v2.4s, #1<<7, lsl #24
1: ld2 {v0.4S, v1.4S}, [x1], x3 1: ld2 {v0.4s, v1.4s}, [x1], x3
eor v0.16B, v0.16B, v2.16B eor v0.16b, v0.16b, v2.16b
rev64 v1.4S, v1.4S rev64 v1.4s, v1.4s
ext v1.16B, v1.16B, v1.16B, #8 ext v1.16b, v1.16b, v1.16b, #8
st1 {v0.4S}, [x2] st1 {v0.4s}, [x2]
st1 {v1.4S}, [x0], #16 st1 {v1.4s}, [x0], #16
sub x2, x2, #16 sub x2, x2, #16
subs x4, x4, #4 subs x4, x4, #4
b.gt 1b b.gt 1b
@ -159,16 +159,16 @@ function ff_sbr_qmf_deint_bfly_neon, export=1
add x3, x0, #124*4 add x3, x0, #124*4
mov x4, #64 mov x4, #64
mov x5, #-16 mov x5, #-16
1: ld1 {v0.4S}, [x1], #16 1: ld1 {v0.4s}, [x1], #16
ld1 {v1.4S}, [x2], x5 ld1 {v1.4s}, [x2], x5
rev64 v2.4S, v0.4S rev64 v2.4s, v0.4s
ext v2.16B, v2.16B, v2.16B, #8 ext v2.16b, v2.16b, v2.16b, #8
rev64 v3.4S, v1.4S rev64 v3.4s, v1.4s
ext v3.16B, v3.16B, v3.16B, #8 ext v3.16b, v3.16b, v3.16b, #8
fadd v1.4S, v1.4S, v2.4S fadd v1.4s, v1.4s, v2.4s
fsub v0.4S, v0.4S, v3.4S fsub v0.4s, v0.4s, v3.4s
st1 {v0.4S}, [x0], #16 st1 {v0.4s}, [x0], #16
st1 {v1.4S}, [x3], x5 st1 {v1.4s}, [x3], x5
subs x4, x4, #4 subs x4, x4, #4
b.gt 1b b.gt 1b
ret ret
@ -178,32 +178,32 @@ function ff_sbr_hf_gen_neon, export=1
sxtw x4, w4 sxtw x4, w4
sxtw x5, w5 sxtw x5, w5
movrel x6, factors movrel x6, factors
ld1 {v7.4S}, [x6] ld1 {v7.4s}, [x6]
dup v1.4S, v0.S[0] dup v1.4s, v0.s[0]
mov v2.8B, v1.8B mov v2.8b, v1.8b
mov v2.S[2], v7.S[0] mov v2.s[2], v7.s[0]
mov v2.S[3], v7.S[0] mov v2.s[3], v7.s[0]
fmul v1.4S, v1.4S, v2.4S fmul v1.4s, v1.4s, v2.4s
ld1 {v0.D}[0], [x3] ld1 {v0.d}[0], [x3]
ld1 {v0.D}[1], [x2] ld1 {v0.d}[1], [x2]
fmul v0.4S, v0.4S, v1.4S fmul v0.4s, v0.4s, v1.4s
fmul v1.4S, v0.4S, v7.4S fmul v1.4s, v0.4s, v7.4s
rev64 v0.4S, v0.4S rev64 v0.4s, v0.4s
sub x7, x5, x4 sub x7, x5, x4
add x0, x0, x4, lsl #3 add x0, x0, x4, lsl #3
add x1, x1, x4, lsl #3 add x1, x1, x4, lsl #3
sub x1, x1, #16 sub x1, x1, #16
1: ld1 {v2.4S}, [x1], #16 1: ld1 {v2.4s}, [x1], #16
ld1 {v3.2S}, [x1] ld1 {v3.2s}, [x1]
fmul v4.4S, v2.4S, v1.4S fmul v4.4s, v2.4s, v1.4s
fmul v5.4S, v2.4S, v0.4S fmul v5.4s, v2.4s, v0.4s
faddp v4.4S, v4.4S, v4.4S faddp v4.4s, v4.4s, v4.4s
faddp v5.4S, v5.4S, v5.4S faddp v5.4s, v5.4s, v5.4s
faddp v4.4S, v4.4S, v4.4S faddp v4.4s, v4.4s, v4.4s
faddp v5.4S, v5.4S, v5.4S faddp v5.4s, v5.4s, v5.4s
mov v4.S[1], v5.S[0] mov v4.s[1], v5.s[0]
fadd v4.2S, v4.2S, v3.2S fadd v4.2s, v4.2s, v3.2s
st1 {v4.2S}, [x0], #8 st1 {v4.2s}, [x0], #8
sub x1, x1, #8 sub x1, x1, #8
subs x7, x7, #1 subs x7, x7, #1
b.gt 1b b.gt 1b
@ -215,10 +215,10 @@ function ff_sbr_hf_g_filt_neon, export=1
sxtw x4, w4 sxtw x4, w4
mov x5, #40*2*4 mov x5, #40*2*4
add x1, x1, x4, lsl #3 add x1, x1, x4, lsl #3
1: ld1 {v0.2S}, [x1], x5 1: ld1 {v0.2s}, [x1], x5
ld1 {v1.S}[0], [x2], #4 ld1 {v1.s}[0], [x2], #4
fmul v2.4S, v0.4S, v1.S[0] fmul v2.4s, v0.4s, v1.s[0]
st1 {v2.2S}, [x0], #8 st1 {v2.2s}, [x0], #8
subs x3, x3, #1 subs x3, x3, #1
b.gt 1b b.gt 1b
ret ret
@ -227,46 +227,46 @@ endfunc
function ff_sbr_autocorrelate_neon, export=1 function ff_sbr_autocorrelate_neon, export=1
mov x2, #38 mov x2, #38
movrel x3, factors movrel x3, factors
ld1 {v0.4S}, [x3] ld1 {v0.4s}, [x3]
movi v1.4S, #0 movi v1.4s, #0
movi v2.4S, #0 movi v2.4s, #0
movi v3.4S, #0 movi v3.4s, #0
ld1 {v4.2S}, [x0], #8 ld1 {v4.2s}, [x0], #8
ld1 {v5.2S}, [x0], #8 ld1 {v5.2s}, [x0], #8
fmul v16.2S, v4.2S, v4.2S fmul v16.2s, v4.2s, v4.2s
fmul v17.2S, v5.2S, v4.S[0] fmul v17.2s, v5.2s, v4.s[0]
fmul v18.2S, v5.2S, v4.S[1] fmul v18.2s, v5.2s, v4.s[1]
1: ld1 {v5.D}[1], [x0], #8 1: ld1 {v5.d}[1], [x0], #8
fmla v1.2S, v4.2S, v4.2S fmla v1.2s, v4.2s, v4.2s
fmla v2.4S, v5.4S, v4.S[0] fmla v2.4s, v5.4s, v4.s[0]
fmla v3.4S, v5.4S, v4.S[1] fmla v3.4s, v5.4s, v4.s[1]
mov v4.D[0], v5.D[0] mov v4.d[0], v5.d[0]
mov v5.D[0], v5.D[1] mov v5.d[0], v5.d[1]
subs x2, x2, #1 subs x2, x2, #1
b.gt 1b b.gt 1b
fmul v19.2S, v4.2S, v4.2S fmul v19.2s, v4.2s, v4.2s
fmul v20.2S, v5.2S, v4.S[0] fmul v20.2s, v5.2s, v4.s[0]
fmul v21.2S, v5.2S, v4.S[1] fmul v21.2s, v5.2s, v4.s[1]
fadd v22.4S, v2.4S, v20.4S fadd v22.4s, v2.4s, v20.4s
fsub v22.4S, v22.4S, v17.4S fsub v22.4s, v22.4s, v17.4s
fadd v23.4S, v3.4S, v21.4S fadd v23.4s, v3.4s, v21.4s
fsub v23.4S, v23.4S, v18.4S fsub v23.4s, v23.4s, v18.4s
rev64 v23.4S, v23.4S rev64 v23.4s, v23.4s
fmul v23.4S, v23.4S, v0.4S fmul v23.4s, v23.4s, v0.4s
fadd v22.4S, v22.4S, v23.4S fadd v22.4s, v22.4s, v23.4s
st1 {v22.4S}, [x1], #16 st1 {v22.4s}, [x1], #16
fadd v23.2S, v1.2S, v19.2S fadd v23.2s, v1.2s, v19.2s
fsub v23.2S, v23.2S, v16.2S fsub v23.2s, v23.2s, v16.2s
faddp v23.2S, v23.2S, v23.2S faddp v23.2s, v23.2s, v23.2s
st1 {v23.S}[0], [x1] st1 {v23.s}[0], [x1]
add x1, x1, #8 add x1, x1, #8
rev64 v3.2S, v3.2S rev64 v3.2s, v3.2s
fmul v3.2S, v3.2S, v0.2S fmul v3.2s, v3.2s, v0.2s
fadd v2.2S, v2.2S, v3.2S fadd v2.2s, v2.2s, v3.2s
st1 {v2.2S}, [x1] st1 {v2.2s}, [x1]
add x1, x1, #16 add x1, x1, #16
faddp v1.2S, v1.2S, v1.2S faddp v1.2s, v1.2s, v1.2s
st1 {v1.S}[0], [x1] st1 {v1.s}[0], [x1]
ret ret
endfunc endfunc
@ -278,25 +278,25 @@ endfunc
1: and x3, x3, #0x1ff 1: and x3, x3, #0x1ff
add x8, x7, x3, lsl #3 add x8, x7, x3, lsl #3
add x3, x3, #2 add x3, x3, #2
ld1 {v2.4S}, [x0] ld1 {v2.4s}, [x0]
ld1 {v3.2S}, [x1], #8 ld1 {v3.2s}, [x1], #8
ld1 {v4.2S}, [x2], #8 ld1 {v4.2s}, [x2], #8
ld1 {v5.4S}, [x8] ld1 {v5.4s}, [x8]
mov v6.16B, v2.16B mov v6.16b, v2.16b
zip1 v3.4S, v3.4S, v3.4S zip1 v3.4s, v3.4s, v3.4s
zip1 v4.4S, v4.4S, v4.4S zip1 v4.4s, v4.4s, v4.4s
fmla v6.4S, v1.4S, v3.4S fmla v6.4s, v1.4s, v3.4s
fmla v2.4S, v5.4S, v4.4S fmla v2.4s, v5.4s, v4.4s
fcmeq v7.4S, v3.4S, #0 fcmeq v7.4s, v3.4s, #0
bif v2.16B, v6.16B, v7.16B bif v2.16b, v6.16b, v7.16b
st1 {v2.4S}, [x0], #16 st1 {v2.4s}, [x0], #16
subs x5, x5, #2 subs x5, x5, #2
b.gt 1b b.gt 1b
.endm .endm
function ff_sbr_hf_apply_noise_0_neon, export=1 function ff_sbr_hf_apply_noise_0_neon, export=1
movrel x9, phi_noise_0 movrel x9, phi_noise_0
ld1 {v1.4S}, [x9] ld1 {v1.4s}, [x9]
apply_noise_common apply_noise_common
ret ret
endfunc endfunc
@ -305,14 +305,14 @@ function ff_sbr_hf_apply_noise_1_neon, export=1
movrel x9, phi_noise_1 movrel x9, phi_noise_1
and x4, x4, #1 and x4, x4, #1
add x9, x9, x4, lsl #4 add x9, x9, x4, lsl #4
ld1 {v1.4S}, [x9] ld1 {v1.4s}, [x9]
apply_noise_common apply_noise_common
ret ret
endfunc endfunc
function ff_sbr_hf_apply_noise_2_neon, export=1 function ff_sbr_hf_apply_noise_2_neon, export=1
movrel x9, phi_noise_2 movrel x9, phi_noise_2
ld1 {v1.4S}, [x9] ld1 {v1.4s}, [x9]
apply_noise_common apply_noise_common
ret ret
endfunc endfunc
@ -321,7 +321,7 @@ function ff_sbr_hf_apply_noise_3_neon, export=1
movrel x9, phi_noise_3 movrel x9, phi_noise_3
and x4, x4, #1 and x4, x4, #1
add x9, x9, x4, lsl #4 add x9, x9, x4, lsl #4
ld1 {v1.4S}, [x9] ld1 {v1.4s}, [x9]
apply_noise_common apply_noise_common
ret ret
endfunc endfunc
@ -54,7 +54,7 @@ endconst
prfm pldl1keep, [\data] prfm pldl1keep, [\data]
mov x10, x30 mov x10, x30
movrel x3, idct_coeff_neon movrel x3, idct_coeff_neon
ld1 {v0.2D}, [x3] ld1 {v0.2d}, [x3]
.endm .endm
.macro idct_end .macro idct_end
@ -74,146 +74,146 @@ endconst
.endm .endm
.macro idct_col4_top y1, y2, y3, y4, i, l .macro idct_col4_top y1, y2, y3, y4, i, l
smull\i v7.4S, \y3\l, z2 smull\i v7.4s, \y3\l, z2
smull\i v16.4S, \y3\l, z6 smull\i v16.4s, \y3\l, z6
smull\i v17.4S, \y2\l, z1 smull\i v17.4s, \y2\l, z1
add v19.4S, v23.4S, v7.4S add v19.4s, v23.4s, v7.4s
smull\i v18.4S, \y2\l, z3 smull\i v18.4s, \y2\l, z3
add v20.4S, v23.4S, v16.4S add v20.4s, v23.4s, v16.4s
smull\i v5.4S, \y2\l, z5 smull\i v5.4s, \y2\l, z5
sub v21.4S, v23.4S, v16.4S sub v21.4s, v23.4s, v16.4s
smull\i v6.4S, \y2\l, z7 smull\i v6.4s, \y2\l, z7
sub v22.4S, v23.4S, v7.4S sub v22.4s, v23.4s, v7.4s
smlal\i v17.4S, \y4\l, z3 smlal\i v17.4s, \y4\l, z3
smlsl\i v18.4S, \y4\l, z7 smlsl\i v18.4s, \y4\l, z7
smlsl\i v5.4S, \y4\l, z1 smlsl\i v5.4s, \y4\l, z1
smlsl\i v6.4S, \y4\l, z5 smlsl\i v6.4s, \y4\l, z5
.endm .endm
.macro idct_row4_neon y1, y2, y3, y4, pass .macro idct_row4_neon y1, y2, y3, y4, pass
ld1 {\y1\().2D,\y2\().2D}, [x2], #32 ld1 {\y1\().2d,\y2\().2d}, [x2], #32
movi v23.4S, #1<<2, lsl #8 movi v23.4s, #1<<2, lsl #8
orr v5.16B, \y1\().16B, \y2\().16B orr v5.16b, \y1\().16b, \y2\().16b
ld1 {\y3\().2D,\y4\().2D}, [x2], #32 ld1 {\y3\().2d,\y4\().2d}, [x2], #32
orr v6.16B, \y3\().16B, \y4\().16B orr v6.16b, \y3\().16b, \y4\().16b
orr v5.16B, v5.16B, v6.16B orr v5.16b, v5.16b, v6.16b
mov x3, v5.D[1] mov x3, v5.d[1]
smlal v23.4S, \y1\().4H, z4 smlal v23.4s, \y1\().4h, z4
idct_col4_top \y1, \y2, \y3, \y4, 1, .4H idct_col4_top \y1, \y2, \y3, \y4, 1, .4h
cmp x3, #0 cmp x3, #0
b.eq \pass\()f b.eq \pass\()f
smull2 v7.4S, \y1\().8H, z4 smull2 v7.4s, \y1\().8h, z4
smlal2 v17.4S, \y2\().8H, z5 smlal2 v17.4s, \y2\().8h, z5
smlsl2 v18.4S, \y2\().8H, z1 smlsl2 v18.4s, \y2\().8h, z1
smull2 v16.4S, \y3\().8H, z2 smull2 v16.4s, \y3\().8h, z2
smlal2 v5.4S, \y2\().8H, z7 smlal2 v5.4s, \y2\().8h, z7
add v19.4S, v19.4S, v7.4S add v19.4s, v19.4s, v7.4s
sub v20.4S, v20.4S, v7.4S sub v20.4s, v20.4s, v7.4s
sub v21.4S, v21.4S, v7.4S sub v21.4s, v21.4s, v7.4s
add v22.4S, v22.4S, v7.4S add v22.4s, v22.4s, v7.4s
smlal2 v6.4S, \y2\().8H, z3 smlal2 v6.4s, \y2\().8h, z3
smull2 v7.4S, \y3\().8H, z6 smull2 v7.4s, \y3\().8h, z6
smlal2 v17.4S, \y4\().8H, z7 smlal2 v17.4s, \y4\().8h, z7
smlsl2 v18.4S, \y4\().8H, z5 smlsl2 v18.4s, \y4\().8h, z5
smlal2 v5.4S, \y4\().8H, z3 smlal2 v5.4s, \y4\().8h, z3
smlsl2 v6.4S, \y4\().8H, z1 smlsl2 v6.4s, \y4\().8h, z1
add v19.4S, v19.4S, v7.4S add v19.4s, v19.4s, v7.4s
sub v20.4S, v20.4S, v16.4S sub v20.4s, v20.4s, v16.4s
add v21.4S, v21.4S, v16.4S add v21.4s, v21.4s, v16.4s
sub v22.4S, v22.4S, v7.4S sub v22.4s, v22.4s, v7.4s
\pass: add \y3\().4S, v19.4S, v17.4S \pass: add \y3\().4S, v19.4S, v17.4S
add \y4\().4S, v20.4S, v18.4S add \y4\().4s, v20.4s, v18.4s
shrn \y1\().4H, \y3\().4S, #ROW_SHIFT shrn \y1\().4h, \y3\().4s, #ROW_SHIFT
shrn \y2\().4H, \y4\().4S, #ROW_SHIFT shrn \y2\().4h, \y4\().4s, #ROW_SHIFT
add v7.4S, v21.4S, v5.4S add v7.4s, v21.4s, v5.4s
add v16.4S, v22.4S, v6.4S add v16.4s, v22.4s, v6.4s
shrn \y3\().4H, v7.4S, #ROW_SHIFT shrn \y3\().4h, v7.4s, #ROW_SHIFT
shrn \y4\().4H, v16.4S, #ROW_SHIFT shrn \y4\().4h, v16.4s, #ROW_SHIFT
sub v22.4S, v22.4S, v6.4S sub v22.4s, v22.4s, v6.4s
sub v19.4S, v19.4S, v17.4S sub v19.4s, v19.4s, v17.4s
sub v21.4S, v21.4S, v5.4S sub v21.4s, v21.4s, v5.4s
shrn2 \y1\().8H, v22.4S, #ROW_SHIFT shrn2 \y1\().8h, v22.4s, #ROW_SHIFT
sub v20.4S, v20.4S, v18.4S sub v20.4s, v20.4s, v18.4s
shrn2 \y2\().8H, v21.4S, #ROW_SHIFT shrn2 \y2\().8h, v21.4s, #ROW_SHIFT
shrn2 \y3\().8H, v20.4S, #ROW_SHIFT shrn2 \y3\().8h, v20.4s, #ROW_SHIFT
shrn2 \y4\().8H, v19.4S, #ROW_SHIFT shrn2 \y4\().8h, v19.4s, #ROW_SHIFT
trn1 v16.8H, \y1\().8H, \y2\().8H trn1 v16.8h, \y1\().8h, \y2\().8h
trn2 v17.8H, \y1\().8H, \y2\().8H trn2 v17.8h, \y1\().8h, \y2\().8h
trn1 v18.8H, \y3\().8H, \y4\().8H trn1 v18.8h, \y3\().8h, \y4\().8h
trn2 v19.8H, \y3\().8H, \y4\().8H trn2 v19.8h, \y3\().8h, \y4\().8h
trn1 \y1\().4S, v16.4S, v18.4S trn1 \y1\().4s, v16.4s, v18.4s
trn1 \y2\().4S, v17.4S, v19.4S trn1 \y2\().4s, v17.4s, v19.4s
trn2 \y3\().4S, v16.4S, v18.4S trn2 \y3\().4s, v16.4s, v18.4s
trn2 \y4\().4S, v17.4S, v19.4S trn2 \y4\().4s, v17.4s, v19.4s
.endm .endm
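// idct_row4_neon ORs all four input rows together and tests the upper
// 64 bits (coefficient columns 4-7); when they are zero (the common case
// for sparse coefficient blocks) it branches forward to \pass and skips
// the whole smull2/smlal2 half of the row pass.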
.macro declare_idct_col4_neon i, l .macro declare_idct_col4_neon i, l
function idct_col4_neon\i function idct_col4_neon\i
dup v23.4H, z4c dup v23.4h, z4c
.if \i == 1 .if \i == 1
add v23.4H, v23.4H, v24.4H add v23.4h, v23.4h, v24.4h
.else .else
mov v5.D[0], v24.D[1] mov v5.d[0], v24.d[1]
add v23.4H, v23.4H, v5.4H add v23.4h, v23.4h, v5.4h
.endif .endif
smull v23.4S, v23.4H, z4 smull v23.4s, v23.4h, z4
idct_col4_top v24, v25, v26, v27, \i, \l idct_col4_top v24, v25, v26, v27, \i, \l
mov x4, v28.D[\i - 1] mov x4, v28.d[\i - 1]
mov x5, v29.D[\i - 1] mov x5, v29.d[\i - 1]
cmp x4, #0 cmp x4, #0
b.eq 1f b.eq 1f
smull\i v7.4S, v28\l, z4 smull\i v7.4s, v28\l, z4
add v19.4S, v19.4S, v7.4S add v19.4s, v19.4s, v7.4s
sub v20.4S, v20.4S, v7.4S sub v20.4s, v20.4s, v7.4s
sub v21.4S, v21.4S, v7.4S sub v21.4s, v21.4s, v7.4s
add v22.4S, v22.4S, v7.4S add v22.4s, v22.4s, v7.4s
1: mov x4, v30.D[\i - 1] 1: mov x4, v30.d[\i - 1]
cmp x5, #0 cmp x5, #0
b.eq 2f b.eq 2f
smlal\i v17.4S, v29\l, z5 smlal\i v17.4s, v29\l, z5
smlsl\i v18.4S, v29\l, z1 smlsl\i v18.4s, v29\l, z1
smlal\i v5.4S, v29\l, z7 smlal\i v5.4s, v29\l, z7
smlal\i v6.4S, v29\l, z3 smlal\i v6.4s, v29\l, z3
2: mov x5, v31.D[\i - 1] 2: mov x5, v31.d[\i - 1]
cmp x4, #0 cmp x4, #0
b.eq 3f b.eq 3f
smull\i v7.4S, v30\l, z6 smull\i v7.4s, v30\l, z6
smull\i v16.4S, v30\l, z2 smull\i v16.4s, v30\l, z2
add v19.4S, v19.4S, v7.4S add v19.4s, v19.4s, v7.4s
sub v22.4S, v22.4S, v7.4S sub v22.4s, v22.4s, v7.4s
sub v20.4S, v20.4S, v16.4S sub v20.4s, v20.4s, v16.4s
add v21.4S, v21.4S, v16.4S add v21.4s, v21.4s, v16.4s
3: cmp x5, #0 3: cmp x5, #0
b.eq 4f b.eq 4f
smlal\i v17.4S, v31\l, z7 smlal\i v17.4s, v31\l, z7
smlsl\i v18.4S, v31\l, z5 smlsl\i v18.4s, v31\l, z5
smlal\i v5.4S, v31\l, z3 smlal\i v5.4s, v31\l, z3
smlsl\i v6.4S, v31\l, z1 smlsl\i v6.4s, v31\l, z1
4: addhn v7.4H, v19.4S, v17.4S 4: addhn v7.4h, v19.4s, v17.4s
addhn2 v7.8H, v20.4S, v18.4S addhn2 v7.8h, v20.4s, v18.4s
subhn v18.4H, v20.4S, v18.4S subhn v18.4h, v20.4s, v18.4s
subhn2 v18.8H, v19.4S, v17.4S subhn2 v18.8h, v19.4s, v17.4s
addhn v16.4H, v21.4S, v5.4S addhn v16.4h, v21.4s, v5.4s
addhn2 v16.8H, v22.4S, v6.4S addhn2 v16.8h, v22.4s, v6.4s
subhn v17.4H, v22.4S, v6.4S subhn v17.4h, v22.4s, v6.4s
subhn2 v17.8H, v21.4S, v5.4S subhn2 v17.8h, v21.4s, v5.4s
ret ret
endfunc endfunc
@ -229,33 +229,33 @@ function ff_simple_idct_put_neon, export=1
idct_row4_neon v28, v29, v30, v31, 2 idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1 bl idct_col4_neon1
sqshrun v1.8B, v7.8H, #COL_SHIFT-16 sqshrun v1.8b, v7.8h, #COL_SHIFT-16
sqshrun2 v1.16B, v16.8H, #COL_SHIFT-16 sqshrun2 v1.16b, v16.8h, #COL_SHIFT-16
sqshrun v3.8B, v17.8H, #COL_SHIFT-16 sqshrun v3.8b, v17.8h, #COL_SHIFT-16
sqshrun2 v3.16B, v18.8H, #COL_SHIFT-16 sqshrun2 v3.16b, v18.8h, #COL_SHIFT-16
bl idct_col4_neon2 bl idct_col4_neon2
sqshrun v2.8B, v7.8H, #COL_SHIFT-16 sqshrun v2.8b, v7.8h, #COL_SHIFT-16
sqshrun2 v2.16B, v16.8H, #COL_SHIFT-16 sqshrun2 v2.16b, v16.8h, #COL_SHIFT-16
sqshrun v4.8B, v17.8H, #COL_SHIFT-16 sqshrun v4.8b, v17.8h, #COL_SHIFT-16
sqshrun2 v4.16B, v18.8H, #COL_SHIFT-16 sqshrun2 v4.16b, v18.8h, #COL_SHIFT-16
zip1 v16.4S, v1.4S, v2.4S zip1 v16.4s, v1.4s, v2.4s
zip2 v17.4S, v1.4S, v2.4S zip2 v17.4s, v1.4s, v2.4s
st1 {v16.D}[0], [x0], x1 st1 {v16.d}[0], [x0], x1
st1 {v16.D}[1], [x0], x1 st1 {v16.d}[1], [x0], x1
zip1 v18.4S, v3.4S, v4.4S zip1 v18.4s, v3.4s, v4.4s
zip2 v19.4S, v3.4S, v4.4S zip2 v19.4s, v3.4s, v4.4s
st1 {v17.D}[0], [x0], x1 st1 {v17.d}[0], [x0], x1
st1 {v17.D}[1], [x0], x1 st1 {v17.d}[1], [x0], x1
st1 {v18.D}[0], [x0], x1 st1 {v18.d}[0], [x0], x1
st1 {v18.D}[1], [x0], x1 st1 {v18.d}[1], [x0], x1
st1 {v19.D}[0], [x0], x1 st1 {v19.d}[0], [x0], x1
st1 {v19.D}[1], [x0], x1 st1 {v19.d}[1], [x0], x1
idct_end idct_end
endfunc endfunc
@ -267,59 +267,59 @@ function ff_simple_idct_add_neon, export=1
idct_row4_neon v28, v29, v30, v31, 2 idct_row4_neon v28, v29, v30, v31, 2
bl idct_col4_neon1 bl idct_col4_neon1
sshr v1.8H, v7.8H, #COL_SHIFT-16 sshr v1.8h, v7.8h, #COL_SHIFT-16
sshr v2.8H, v16.8H, #COL_SHIFT-16 sshr v2.8h, v16.8h, #COL_SHIFT-16
sshr v3.8H, v17.8H, #COL_SHIFT-16 sshr v3.8h, v17.8h, #COL_SHIFT-16
sshr v4.8H, v18.8H, #COL_SHIFT-16 sshr v4.8h, v18.8h, #COL_SHIFT-16
bl idct_col4_neon2 bl idct_col4_neon2
sshr v7.8H, v7.8H, #COL_SHIFT-16 sshr v7.8h, v7.8h, #COL_SHIFT-16
sshr v16.8H, v16.8H, #COL_SHIFT-16 sshr v16.8h, v16.8h, #COL_SHIFT-16
sshr v17.8H, v17.8H, #COL_SHIFT-16 sshr v17.8h, v17.8h, #COL_SHIFT-16
sshr v18.8H, v18.8H, #COL_SHIFT-16 sshr v18.8h, v18.8h, #COL_SHIFT-16
mov x9, x0 mov x9, x0
ld1 {v19.D}[0], [x0], x1 ld1 {v19.d}[0], [x0], x1
zip1 v23.2D, v1.2D, v7.2D zip1 v23.2d, v1.2d, v7.2d
zip2 v24.2D, v1.2D, v7.2D zip2 v24.2d, v1.2d, v7.2d
ld1 {v19.D}[1], [x0], x1 ld1 {v19.d}[1], [x0], x1
zip1 v25.2D, v2.2D, v16.2D zip1 v25.2d, v2.2d, v16.2d
zip2 v26.2D, v2.2D, v16.2D zip2 v26.2d, v2.2d, v16.2d
ld1 {v20.D}[0], [x0], x1 ld1 {v20.d}[0], [x0], x1
zip1 v27.2D, v3.2D, v17.2D zip1 v27.2d, v3.2d, v17.2d
zip2 v28.2D, v3.2D, v17.2D zip2 v28.2d, v3.2d, v17.2d
ld1 {v20.D}[1], [x0], x1 ld1 {v20.d}[1], [x0], x1
zip1 v29.2D, v4.2D, v18.2D zip1 v29.2d, v4.2d, v18.2d
zip2 v30.2D, v4.2D, v18.2D zip2 v30.2d, v4.2d, v18.2d
ld1 {v21.D}[0], [x0], x1 ld1 {v21.d}[0], [x0], x1
uaddw v23.8H, v23.8H, v19.8B uaddw v23.8h, v23.8h, v19.8b
uaddw2 v24.8H, v24.8H, v19.16B uaddw2 v24.8h, v24.8h, v19.16b
ld1 {v21.D}[1], [x0], x1 ld1 {v21.d}[1], [x0], x1
sqxtun v23.8B, v23.8H sqxtun v23.8b, v23.8h
sqxtun2 v23.16B, v24.8H sqxtun2 v23.16b, v24.8h
ld1 {v22.D}[0], [x0], x1 ld1 {v22.d}[0], [x0], x1
uaddw v24.8H, v25.8H, v20.8B uaddw v24.8h, v25.8h, v20.8b
uaddw2 v25.8H, v26.8H, v20.16B uaddw2 v25.8h, v26.8h, v20.16b
ld1 {v22.D}[1], [x0], x1 ld1 {v22.d}[1], [x0], x1
sqxtun v24.8B, v24.8H sqxtun v24.8b, v24.8h
sqxtun2 v24.16B, v25.8H sqxtun2 v24.16b, v25.8h
st1 {v23.D}[0], [x9], x1 st1 {v23.d}[0], [x9], x1
uaddw v25.8H, v27.8H, v21.8B uaddw v25.8h, v27.8h, v21.8b
uaddw2 v26.8H, v28.8H, v21.16B uaddw2 v26.8h, v28.8h, v21.16b
st1 {v23.D}[1], [x9], x1 st1 {v23.d}[1], [x9], x1
sqxtun v25.8B, v25.8H sqxtun v25.8b, v25.8h
sqxtun2 v25.16B, v26.8H sqxtun2 v25.16b, v26.8h
st1 {v24.D}[0], [x9], x1 st1 {v24.d}[0], [x9], x1
uaddw v26.8H, v29.8H, v22.8B uaddw v26.8h, v29.8h, v22.8b
uaddw2 v27.8H, v30.8H, v22.16B uaddw2 v27.8h, v30.8h, v22.16b
st1 {v24.D}[1], [x9], x1 st1 {v24.d}[1], [x9], x1
sqxtun v26.8B, v26.8H sqxtun v26.8b, v26.8h
sqxtun2 v26.16B, v27.8H sqxtun2 v26.16b, v27.8h
st1 {v25.D}[0], [x9], x1 st1 {v25.d}[0], [x9], x1
st1 {v25.D}[1], [x9], x1 st1 {v25.d}[1], [x9], x1
st1 {v26.D}[0], [x9], x1 st1 {v26.d}[0], [x9], x1
st1 {v26.D}[1], [x9], x1 st1 {v26.d}[1], [x9], x1
idct_end idct_end
endfunc endfunc
@ -333,30 +333,30 @@ function ff_simple_idct_neon, export=1
sub x2, x2, #128 sub x2, x2, #128
bl idct_col4_neon1 bl idct_col4_neon1
sshr v1.8H, v7.8H, #COL_SHIFT-16 sshr v1.8h, v7.8h, #COL_SHIFT-16
sshr v2.8H, v16.8H, #COL_SHIFT-16 sshr v2.8h, v16.8h, #COL_SHIFT-16
sshr v3.8H, v17.8H, #COL_SHIFT-16 sshr v3.8h, v17.8h, #COL_SHIFT-16
sshr v4.8H, v18.8H, #COL_SHIFT-16 sshr v4.8h, v18.8h, #COL_SHIFT-16
bl idct_col4_neon2 bl idct_col4_neon2
sshr v7.8H, v7.8H, #COL_SHIFT-16 sshr v7.8h, v7.8h, #COL_SHIFT-16
sshr v16.8H, v16.8H, #COL_SHIFT-16 sshr v16.8h, v16.8h, #COL_SHIFT-16
sshr v17.8H, v17.8H, #COL_SHIFT-16 sshr v17.8h, v17.8h, #COL_SHIFT-16
sshr v18.8H, v18.8H, #COL_SHIFT-16 sshr v18.8h, v18.8h, #COL_SHIFT-16
zip1 v23.2D, v1.2D, v7.2D zip1 v23.2d, v1.2d, v7.2d
zip2 v24.2D, v1.2D, v7.2D zip2 v24.2d, v1.2d, v7.2d
st1 {v23.2D,v24.2D}, [x2], #32 st1 {v23.2d,v24.2d}, [x2], #32
zip1 v25.2D, v2.2D, v16.2D zip1 v25.2d, v2.2d, v16.2d
zip2 v26.2D, v2.2D, v16.2D zip2 v26.2d, v2.2d, v16.2d
st1 {v25.2D,v26.2D}, [x2], #32 st1 {v25.2d,v26.2d}, [x2], #32
zip1 v27.2D, v3.2D, v17.2D zip1 v27.2d, v3.2d, v17.2d
zip2 v28.2D, v3.2D, v17.2D zip2 v28.2d, v3.2d, v17.2d
st1 {v27.2D,v28.2D}, [x2], #32 st1 {v27.2d,v28.2d}, [x2], #32
zip1 v29.2D, v4.2D, v18.2D zip1 v29.2d, v4.2d, v18.2d
zip2 v30.2D, v4.2D, v18.2D zip2 v30.2d, v4.2d, v18.2d
st1 {v29.2D,v30.2D}, [x2], #32 st1 {v29.2d,v30.2d}, [x2], #32
idct_end idct_end
endfunc endfunc
@ -22,19 +22,19 @@
// acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D} // acc_sum_store(ABCD) = {X+A, X+A+B, X+A+B+C, X+A+B+C+D}
.macro acc_sum_store x, xb .macro acc_sum_store x, xb
dup v24.4S, v24.S[3] // ...X -> XXXX dup v24.4s, v24.s[3] // ...X -> XXXX
ext v25.16B, v26.16B, \xb, #12 // ext(0000,ABCD,12)=0ABC ext v25.16b, v26.16b, \xb, #12 // ext(0000,ABCD,12)=0ABC
add v24.4S, v24.4S, \x // XXXX+ABCD={X+A,X+B,X+C,X+D} add v24.4s, v24.4s, \x // XXXX+ABCD={X+A,X+B,X+C,X+D}
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B,X+D+C} (+0ABC) add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B,X+D+C} (+0ABC)
ext v25.16B, v26.16B, v25.16B, #12 // ext(0000,0ABC,12)=00AB ext v25.16b, v26.16b, v25.16b, #12 // ext(0000,0ABC,12)=00AB
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B+A,X+D+C+B} (+00AB) add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B+A,X+D+C+B} (+00AB)
ext v25.16B, v26.16B, v25.16B, #12 // ext(0000,00AB,12)=000A ext v25.16b, v26.16b, v25.16b, #12 // ext(0000,00AB,12)=000A
add v24.4S, v24.4S, v25.4S // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A) add v24.4s, v24.4s, v25.4s // {X+A,X+B+A,X+C+B+A,X+D+C+B+A} (+000A)
st1 {v24.4S}, [x0], #16 // write 4x32-bit final values st1 {v24.4s}, [x0], #16 // write 4x32-bit final values
.endm .endm
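
As a rough scalar picture of the running-sum step the acc_sum_store macro above implements (a sketch for illustration only, not FFmpeg's reference code; the function name and types are made up): given the previous running sum X, kept in the last lane of v24, and four new 32-bit increments A..D, it writes the four inclusive prefix sums {X+A, X+A+B, X+A+B+C, X+A+B+C+D} to dst.

    #include <stdint.h>

    /* Sketch of one acc_sum_store step: prev is the running sum ("X"),
     * inc[0..3] are the four new increments ("ABCD"); the four prefix
     * sums are written out and the updated running sum is returned. */
    static uint32_t acc_sum_store_sketch(uint32_t *dst, uint32_t prev,
                                         const uint32_t inc[4])
    {
        for (int i = 0; i < 4; i++) {
            prev += inc[i];
            dst[i] = prev;
        }
        return prev;
    }

The NEON version reaches the same result without a per-lane serial dependency by adding progressively shifted copies of ABCD (the three ext/add pairs above).
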
function ff_compute_safe_ssd_integral_image_neon, export=1 function ff_compute_safe_ssd_integral_image_neon, export=1
movi v26.4S, #0 // used as zero for the "rotations" in acc_sum_store movi v26.4s, #0 // used as zero for the "rotations" in acc_sum_store
sub x3, x3, w6, UXTW // s1 padding (s1_linesize - w) sub x3, x3, w6, UXTW // s1 padding (s1_linesize - w)
sub x5, x5, w6, UXTW // s2 padding (s2_linesize - w) sub x5, x5, w6, UXTW // s2 padding (s2_linesize - w)
sub x9, x0, w1, UXTW #2 // dst_top sub x9, x0, w1, UXTW #2 // dst_top
@ -43,31 +43,31 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
1: mov w10, w6 // width copy for each line 1: mov w10, w6 // width copy for each line
sub x0, x0, #16 // beginning of the dst line minus 4 sums sub x0, x0, #16 // beginning of the dst line minus 4 sums
sub x8, x9, #4 // dst_top-1 sub x8, x9, #4 // dst_top-1
ld1 {v24.4S}, [x0], #16 // load ...X (contextual last sums) ld1 {v24.4s}, [x0], #16 // load ...X (contextual last sums)
2: ld1 {v0.16B}, [x2], #16 // s1[x + 0..15] 2: ld1 {v0.16b}, [x2], #16 // s1[x + 0..15]
ld1 {v1.16B}, [x4], #16 // s2[x + 0..15] ld1 {v1.16b}, [x4], #16 // s2[x + 0..15]
ld1 {v16.4S,v17.4S}, [x8], #32 // dst_top[x + 0..7 - 1] ld1 {v16.4s,v17.4s}, [x8], #32 // dst_top[x + 0..7 - 1]
usubl v2.8H, v0.8B, v1.8B // d[x + 0..7] = s1[x + 0..7] - s2[x + 0..7] usubl v2.8h, v0.8b, v1.8b // d[x + 0..7] = s1[x + 0..7] - s2[x + 0..7]
usubl2 v3.8H, v0.16B, v1.16B // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15] usubl2 v3.8h, v0.16b, v1.16b // d[x + 8..15] = s1[x + 8..15] - s2[x + 8..15]
ld1 {v18.4S,v19.4S}, [x8], #32 // dst_top[x + 8..15 - 1] ld1 {v18.4s,v19.4s}, [x8], #32 // dst_top[x + 8..15 - 1]
smull v4.4S, v2.4H, v2.4H // d[x + 0..3]^2 smull v4.4s, v2.4h, v2.4h // d[x + 0..3]^2
smull2 v5.4S, v2.8H, v2.8H // d[x + 4..7]^2 smull2 v5.4s, v2.8h, v2.8h // d[x + 4..7]^2
ld1 {v20.4S,v21.4S}, [x9], #32 // dst_top[x + 0..7] ld1 {v20.4s,v21.4s}, [x9], #32 // dst_top[x + 0..7]
smull v6.4S, v3.4H, v3.4H // d[x + 8..11]^2 smull v6.4s, v3.4h, v3.4h // d[x + 8..11]^2
smull2 v7.4S, v3.8H, v3.8H // d[x + 12..15]^2 smull2 v7.4s, v3.8h, v3.8h // d[x + 12..15]^2
ld1 {v22.4S,v23.4S}, [x9], #32 // dst_top[x + 8..15] ld1 {v22.4s,v23.4s}, [x9], #32 // dst_top[x + 8..15]
sub v0.4S, v20.4S, v16.4S // dst_top[x + 0..3] - dst_top[x + 0..3 - 1] sub v0.4s, v20.4s, v16.4s // dst_top[x + 0..3] - dst_top[x + 0..3 - 1]
sub v1.4S, v21.4S, v17.4S // dst_top[x + 4..7] - dst_top[x + 4..7 - 1] sub v1.4s, v21.4s, v17.4s // dst_top[x + 4..7] - dst_top[x + 4..7 - 1]
add v0.4S, v0.4S, v4.4S // + d[x + 0..3]^2 add v0.4s, v0.4s, v4.4s // + d[x + 0..3]^2
add v1.4S, v1.4S, v5.4S // + d[x + 4..7]^2 add v1.4s, v1.4s, v5.4s // + d[x + 4..7]^2
sub v2.4S, v22.4S, v18.4S // dst_top[x + 8..11] - dst_top[x + 8..11 - 1] sub v2.4s, v22.4s, v18.4s // dst_top[x + 8..11] - dst_top[x + 8..11 - 1]
sub v3.4S, v23.4S, v19.4S // dst_top[x + 12..15] - dst_top[x + 12..15 - 1] sub v3.4s, v23.4s, v19.4s // dst_top[x + 12..15] - dst_top[x + 12..15 - 1]
add v2.4S, v2.4S, v6.4S // + d[x + 8..11]^2 add v2.4s, v2.4s, v6.4s // + d[x + 8..11]^2
add v3.4S, v3.4S, v7.4S // + d[x + 12..15]^2 add v3.4s, v3.4s, v7.4s // + d[x + 12..15]^2
acc_sum_store v0.4S, v0.16B // accumulate and store dst[ 0..3] acc_sum_store v0.4s, v0.16b // accumulate and store dst[ 0..3]
acc_sum_store v1.4S, v1.16B // accumulate and store dst[ 4..7] acc_sum_store v1.4s, v1.16b // accumulate and store dst[ 4..7]
acc_sum_store v2.4S, v2.16B // accumulate and store dst[ 8..11] acc_sum_store v2.4s, v2.16b // accumulate and store dst[ 8..11]
acc_sum_store v3.4S, v3.16B // accumulate and store dst[12..15] acc_sum_store v3.4s, v3.16b // accumulate and store dst[12..15]
subs w10, w10, #16 // width dec subs w10, w10, #16 // width dec
        b.ne            2b                          // loop until next line        b.ne            2b                          // loop until next line
add x2, x2, x3 // skip to next line (s1) add x2, x2, x3 // skip to next line (s1)

@ -25,16 +25,16 @@
function ff_vector_fmul_neon, export=1 function ff_vector_fmul_neon, export=1
1: subs w3, w3, #16 1: subs w3, w3, #16
ld1 {v0.4S, v1.4S}, [x1], #32 ld1 {v0.4s, v1.4s}, [x1], #32
ld1 {v2.4S, v3.4S}, [x1], #32 ld1 {v2.4s, v3.4s}, [x1], #32
ld1 {v4.4S, v5.4S}, [x2], #32 ld1 {v4.4s, v5.4s}, [x2], #32
ld1 {v6.4S, v7.4S}, [x2], #32 ld1 {v6.4s, v7.4s}, [x2], #32
fmul v16.4S, v0.4S, v4.4S fmul v16.4s, v0.4s, v4.4s
fmul v17.4S, v1.4S, v5.4S fmul v17.4s, v1.4s, v5.4s
fmul v18.4S, v2.4S, v6.4S fmul v18.4s, v2.4s, v6.4s
fmul v19.4S, v3.4S, v7.4S fmul v19.4s, v3.4s, v7.4s
st1 {v16.4S, v17.4S}, [x0], #32 st1 {v16.4s, v17.4s}, [x0], #32
st1 {v18.4S, v19.4S}, [x0], #32 st1 {v18.4s, v19.4s}, [x0], #32
b.ne 1b b.ne 1b
ret ret
endfunc endfunc
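
For reference, the loop above is a plain element-wise product, 16 floats per iteration. A minimal scalar sketch (illustrative signature, assuming len is a multiple of 16 as the NEON loop requires):

    static void vector_fmul_sketch(float *dst, const float *src0,
                                   const float *src1, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[i];
    }
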
@ -42,16 +42,16 @@ endfunc
function ff_vector_fmac_scalar_neon, export=1 function ff_vector_fmac_scalar_neon, export=1
mov x3, #-32 mov x3, #-32
1: subs w2, w2, #16 1: subs w2, w2, #16
ld1 {v16.4S, v17.4S}, [x0], #32 ld1 {v16.4s, v17.4s}, [x0], #32
ld1 {v18.4S, v19.4S}, [x0], x3 ld1 {v18.4s, v19.4s}, [x0], x3
ld1 {v4.4S, v5.4S}, [x1], #32 ld1 {v4.4s, v5.4s}, [x1], #32
ld1 {v6.4S, v7.4S}, [x1], #32 ld1 {v6.4s, v7.4s}, [x1], #32
fmla v16.4S, v4.4S, v0.S[0] fmla v16.4s, v4.4s, v0.s[0]
fmla v17.4S, v5.4S, v0.S[0] fmla v17.4s, v5.4s, v0.s[0]
fmla v18.4S, v6.4S, v0.S[0] fmla v18.4s, v6.4s, v0.s[0]
fmla v19.4S, v7.4S, v0.S[0] fmla v19.4s, v7.4s, v0.s[0]
st1 {v16.4S, v17.4S}, [x0], #32 st1 {v16.4s, v17.4s}, [x0], #32
st1 {v18.4S, v19.4S}, [x0], #32 st1 {v18.4s, v19.4s}, [x0], #32
b.ne 1b b.ne 1b
ret ret
endfunc endfunc
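
The loop above multiply-accumulates a whole vector by one coefficient; the scalar lives in v0.s[0] and is broadcast by the indexed fmla. A hedged scalar sketch with illustrative names:

    static void vector_fmac_scalar_sketch(float *dst, const float *src,
                                          float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] += src[i] * mul;
    }
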
@ -59,43 +59,43 @@ endfunc
function ff_vector_fmul_scalar_neon, export=1 function ff_vector_fmul_scalar_neon, export=1
mov w4, #15 mov w4, #15
bics w3, w2, w4 bics w3, w2, w4
dup v16.4S, v0.S[0] dup v16.4s, v0.s[0]
b.eq 3f b.eq 3f
ld1 {v0.4S, v1.4S}, [x1], #32 ld1 {v0.4s, v1.4s}, [x1], #32
1: subs w3, w3, #16 1: subs w3, w3, #16
fmul v0.4S, v0.4S, v16.4S fmul v0.4s, v0.4s, v16.4s
ld1 {v2.4S, v3.4S}, [x1], #32 ld1 {v2.4s, v3.4s}, [x1], #32
fmul v1.4S, v1.4S, v16.4S fmul v1.4s, v1.4s, v16.4s
fmul v2.4S, v2.4S, v16.4S fmul v2.4s, v2.4s, v16.4s
st1 {v0.4S, v1.4S}, [x0], #32 st1 {v0.4s, v1.4s}, [x0], #32
fmul v3.4S, v3.4S, v16.4S fmul v3.4s, v3.4s, v16.4s
b.eq 2f b.eq 2f
ld1 {v0.4S, v1.4S}, [x1], #32 ld1 {v0.4s, v1.4s}, [x1], #32
st1 {v2.4S, v3.4S}, [x0], #32 st1 {v2.4s, v3.4s}, [x0], #32
b 1b b 1b
2: ands w2, w2, #15 2: ands w2, w2, #15
st1 {v2.4S, v3.4S}, [x0], #32 st1 {v2.4s, v3.4s}, [x0], #32
b.eq 4f b.eq 4f
3: ld1 {v0.4S}, [x1], #16 3: ld1 {v0.4s}, [x1], #16
fmul v0.4S, v0.4S, v16.4S fmul v0.4s, v0.4s, v16.4s
st1 {v0.4S}, [x0], #16 st1 {v0.4s}, [x0], #16
subs w2, w2, #4 subs w2, w2, #4
b.gt 3b b.gt 3b
4: ret 4: ret
endfunc endfunc
function ff_vector_dmul_scalar_neon, export=1 function ff_vector_dmul_scalar_neon, export=1
dup v16.2D, v0.D[0] dup v16.2d, v0.d[0]
ld1 {v0.2D, v1.2D}, [x1], #32 ld1 {v0.2d, v1.2d}, [x1], #32
1: subs w2, w2, #8 1: subs w2, w2, #8
fmul v0.2D, v0.2D, v16.2D fmul v0.2d, v0.2d, v16.2d
ld1 {v2.2D, v3.2D}, [x1], #32 ld1 {v2.2d, v3.2d}, [x1], #32
fmul v1.2D, v1.2D, v16.2D fmul v1.2d, v1.2d, v16.2d
fmul v2.2D, v2.2D, v16.2D fmul v2.2d, v2.2d, v16.2d
st1 {v0.2D, v1.2D}, [x0], #32 st1 {v0.2d, v1.2d}, [x0], #32
fmul v3.2D, v3.2D, v16.2D fmul v3.2d, v3.2d, v16.2d
ld1 {v0.2D, v1.2D}, [x1], #32 ld1 {v0.2d, v1.2d}, [x1], #32
st1 {v2.2D, v3.2D}, [x0], #32 st1 {v2.2d, v3.2d}, [x0], #32
b.gt 1b b.gt 1b
ret ret
endfunc endfunc
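
The two routines above are the non-accumulating counterparts in single and double precision: every element is scaled by one coefficient. Sketch (names and signatures illustrative only):

    static void vector_fmul_scalar_sketch(float *dst, const float *src,
                                          float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }

    static void vector_dmul_scalar_sketch(double *dst, const double *src,
                                          double mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }
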
@ -108,49 +108,49 @@ function ff_vector_fmul_window_neon, export=1
add x6, x3, x5, lsl #3 // win + 8 * (len - 2) add x6, x3, x5, lsl #3 // win + 8 * (len - 2)
add x5, x0, x5, lsl #3 // dst + 8 * (len - 2) add x5, x0, x5, lsl #3 // dst + 8 * (len - 2)
mov x7, #-16 mov x7, #-16
ld1 {v0.4S}, [x1], #16 // s0 ld1 {v0.4s}, [x1], #16 // s0
ld1 {v2.4S}, [x3], #16 // wi ld1 {v2.4s}, [x3], #16 // wi
ld1 {v1.4S}, [x2], x7 // s1 ld1 {v1.4s}, [x2], x7 // s1
1: ld1 {v3.4S}, [x6], x7 // wj 1: ld1 {v3.4s}, [x6], x7 // wj
subs x4, x4, #4 subs x4, x4, #4
fmul v17.4S, v0.4S, v2.4S // s0 * wi fmul v17.4s, v0.4s, v2.4s // s0 * wi
rev64 v4.4S, v1.4S rev64 v4.4s, v1.4s
rev64 v5.4S, v3.4S rev64 v5.4s, v3.4s
rev64 v17.4S, v17.4S rev64 v17.4s, v17.4s
ext v4.16B, v4.16B, v4.16B, #8 // s1_r ext v4.16b, v4.16b, v4.16b, #8 // s1_r
ext v5.16B, v5.16B, v5.16B, #8 // wj_r ext v5.16b, v5.16b, v5.16b, #8 // wj_r
ext v17.16B, v17.16B, v17.16B, #8 // (s0 * wi)_rev ext v17.16b, v17.16b, v17.16b, #8 // (s0 * wi)_rev
fmul v16.4S, v0.4S, v5.4S // s0 * wj_r fmul v16.4s, v0.4s, v5.4s // s0 * wj_r
fmla v17.4S, v1.4S, v3.4S // (s0 * wi)_rev + s1 * wj fmla v17.4s, v1.4s, v3.4s // (s0 * wi)_rev + s1 * wj
b.eq 2f b.eq 2f
ld1 {v0.4S}, [x1], #16 ld1 {v0.4s}, [x1], #16
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
st1 {v17.4S}, [x5], x7 st1 {v17.4s}, [x5], x7
ld1 {v2.4S}, [x3], #16 ld1 {v2.4s}, [x3], #16
ld1 {v1.4S}, [x2], x7 ld1 {v1.4s}, [x2], x7
st1 {v16.4S}, [x0], #16 st1 {v16.4s}, [x0], #16
b 1b b 1b
2: 2:
fmls v16.4S, v4.4S, v2.4S // s0 * wj_r - s1_r * wi fmls v16.4s, v4.4s, v2.4s // s0 * wj_r - s1_r * wi
st1 {v17.4S}, [x5], x7 st1 {v17.4s}, [x5], x7
st1 {v16.4S}, [x0], #16 st1 {v16.4s}, [x0], #16
ret ret
endfunc endfunc
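
Reading the comments above (s0 * wj_r - s1_r * wi for the front half, (s0 * wi)_rev + s1 * wj for the mirrored back half), the routine applies a window to two overlapping buffers, writing the output front-to-back and back-to-front at the same time. A scalar sketch under the assumption that dst and win hold 2*len elements while src0 and src1 hold len each (the indexing convention is mine, not taken from the source):

    static void vector_fmul_window_sketch(float *dst, const float *src0,
                                          const float *src1, const float *win,
                                          int len)
    {
        for (int i = 0, j = 2 * len - 1; i < len; i++, j--) {
            float s0 = src0[i];            /* read forwards            */
            float s1 = src1[len - 1 - i];  /* read backwards           */
            float wi = win[i];             /* window, forwards         */
            float wj = win[j];             /* window, backwards        */
            dst[i] = s0 * wj - s1 * wi;    /* front half of the output */
            dst[j] = s0 * wi + s1 * wj;    /* mirrored back half       */
        }
    }
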
function ff_vector_fmul_add_neon, export=1 function ff_vector_fmul_add_neon, export=1
ld1 {v0.4S, v1.4S}, [x1], #32 ld1 {v0.4s, v1.4s}, [x1], #32
ld1 {v2.4S, v3.4S}, [x2], #32 ld1 {v2.4s, v3.4s}, [x2], #32
ld1 {v4.4S, v5.4S}, [x3], #32 ld1 {v4.4s, v5.4s}, [x3], #32
1: subs w4, w4, #8 1: subs w4, w4, #8
fmla v4.4S, v0.4S, v2.4S fmla v4.4s, v0.4s, v2.4s
fmla v5.4S, v1.4S, v3.4S fmla v5.4s, v1.4s, v3.4s
b.eq 2f b.eq 2f
ld1 {v0.4S, v1.4S}, [x1], #32 ld1 {v0.4s, v1.4s}, [x1], #32
ld1 {v2.4S, v3.4S}, [x2], #32 ld1 {v2.4s, v3.4s}, [x2], #32
st1 {v4.4S, v5.4S}, [x0], #32 st1 {v4.4s, v5.4s}, [x0], #32
ld1 {v4.4S, v5.4S}, [x3], #32 ld1 {v4.4s, v5.4s}, [x3], #32
b 1b b 1b
2: st1 {v4.4S, v5.4S}, [x0], #32 2: st1 {v4.4s, v5.4s}, [x0], #32
ret ret
endfunc endfunc
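
The loop above fuses an element-wise multiply with the addition of a third input. Scalar sketch (illustrative names):

    static void vector_fmul_add_sketch(float *dst, const float *src0,
                                       const float *src1, const float *src2,
                                       int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[i] + src2[i];
    }
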
@ -159,44 +159,44 @@ function ff_vector_fmul_reverse_neon, export=1
add x2, x2, x3, lsl #2 add x2, x2, x3, lsl #2
sub x2, x2, #32 sub x2, x2, #32
mov x4, #-32 mov x4, #-32
ld1 {v2.4S, v3.4S}, [x2], x4 ld1 {v2.4s, v3.4s}, [x2], x4
ld1 {v0.4S, v1.4S}, [x1], #32 ld1 {v0.4s, v1.4s}, [x1], #32
1: subs x3, x3, #8 1: subs x3, x3, #8
rev64 v3.4S, v3.4S rev64 v3.4s, v3.4s
rev64 v2.4S, v2.4S rev64 v2.4s, v2.4s
ext v3.16B, v3.16B, v3.16B, #8 ext v3.16b, v3.16b, v3.16b, #8
ext v2.16B, v2.16B, v2.16B, #8 ext v2.16b, v2.16b, v2.16b, #8
fmul v16.4S, v0.4S, v3.4S fmul v16.4s, v0.4s, v3.4s
fmul v17.4S, v1.4S, v2.4S fmul v17.4s, v1.4s, v2.4s
b.eq 2f b.eq 2f
ld1 {v2.4S, v3.4S}, [x2], x4 ld1 {v2.4s, v3.4s}, [x2], x4
ld1 {v0.4S, v1.4S}, [x1], #32 ld1 {v0.4s, v1.4s}, [x1], #32
st1 {v16.4S, v17.4S}, [x0], #32 st1 {v16.4s, v17.4s}, [x0], #32
b 1b b 1b
2: st1 {v16.4S, v17.4S}, [x0], #32 2: st1 {v16.4s, v17.4s}, [x0], #32
ret ret
endfunc endfunc
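
Here the second operand is consumed back-to-front; the rev64/ext pair above reverses each loaded group of four floats. Scalar sketch:

    static void vector_fmul_reverse_sketch(float *dst, const float *src0,
                                           const float *src1, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[len - 1 - i];
    }
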
function ff_butterflies_float_neon, export=1 function ff_butterflies_float_neon, export=1
1: ld1 {v0.4S}, [x0] 1: ld1 {v0.4s}, [x0]
ld1 {v1.4S}, [x1] ld1 {v1.4s}, [x1]
subs w2, w2, #4 subs w2, w2, #4
fsub v2.4S, v0.4S, v1.4S fsub v2.4s, v0.4s, v1.4s
fadd v3.4S, v0.4S, v1.4S fadd v3.4s, v0.4s, v1.4s
st1 {v2.4S}, [x1], #16 st1 {v2.4s}, [x1], #16
st1 {v3.4S}, [x0], #16 st1 {v3.4s}, [x0], #16
b.gt 1b b.gt 1b
ret ret
endfunc endfunc
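
The butterfly above rewrites both inputs in place: sums land back in the first buffer, differences in the second. Scalar sketch:

    static void butterflies_float_sketch(float *v1, float *v2, int len)
    {
        for (int i = 0; i < len; i++) {
            float a = v1[i], b = v2[i];
            v1[i] = a + b;
            v2[i] = a - b;
        }
    }
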
function ff_scalarproduct_float_neon, export=1 function ff_scalarproduct_float_neon, export=1
movi v2.4S, #0 movi v2.4s, #0
1: ld1 {v0.4S}, [x0], #16 1: ld1 {v0.4s}, [x0], #16
ld1 {v1.4S}, [x1], #16 ld1 {v1.4s}, [x1], #16
subs w2, w2, #4 subs w2, w2, #4
fmla v2.4S, v0.4S, v1.4S fmla v2.4s, v0.4s, v1.4s
b.gt 1b b.gt 1b
faddp v0.4S, v2.4S, v2.4S faddp v0.4s, v2.4s, v2.4s
faddp s0, v0.2S faddp s0, v0.2s
ret ret
endfunc endfunc
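
This is a straight dot product; note that the NEON version keeps four partial sums in the lanes of v2 and only folds them with faddp at the end, so the floating-point summation order differs from a naive scalar loop. Scalar sketch:

    static float scalarproduct_float_sketch(const float *v1, const float *v2,
                                            int len)
    {
        float sum = 0.0f;
        for (int i = 0; i < len; i++)
            sum += v1[i] * v2[i];
        return sum;
    }
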

@ -21,57 +21,57 @@
#include "libavutil/aarch64/asm.S" #include "libavutil/aarch64/asm.S"
function ff_resample_common_apply_filter_x4_float_neon, export=1 function ff_resample_common_apply_filter_x4_float_neon, export=1
movi v0.4S, #0 // accumulator movi v0.4s, #0 // accumulator
1: ld1 {v1.4S}, [x1], #16 // src[0..3] 1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4S}, [x2], #16 // filter[0..3] ld1 {v2.4s}, [x2], #16 // filter[0..3]
fmla v0.4S, v1.4S, v2.4S // accumulator += src[0..3] * filter[0..3] fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4 subs w3, w3, #4 // filter_length -= 4
        b.gt            1b                          // loop until filter_length is consumed        b.gt            1b                          // loop until filter_length is consumed
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator st1 {v0.s}[0], [x0], #4 // write accumulator
ret ret
endfunc endfunc
function ff_resample_common_apply_filter_x8_float_neon, export=1 function ff_resample_common_apply_filter_x8_float_neon, export=1
movi v0.4S, #0 // accumulator movi v0.4s, #0 // accumulator
1: ld1 {v1.4S}, [x1], #16 // src[0..3] 1: ld1 {v1.4s}, [x1], #16 // src[0..3]
ld1 {v2.4S}, [x2], #16 // filter[0..3] ld1 {v2.4s}, [x2], #16 // filter[0..3]
ld1 {v3.4S}, [x1], #16 // src[4..7] ld1 {v3.4s}, [x1], #16 // src[4..7]
ld1 {v4.4S}, [x2], #16 // filter[4..7] ld1 {v4.4s}, [x2], #16 // filter[4..7]
fmla v0.4S, v1.4S, v2.4S // accumulator += src[0..3] * filter[0..3] fmla v0.4s, v1.4s, v2.4s // accumulator += src[0..3] * filter[0..3]
fmla v0.4S, v3.4S, v4.4S // accumulator += src[4..7] * filter[4..7] fmla v0.4s, v3.4s, v4.4s // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8 subs w3, w3, #8 // filter_length -= 8
        b.gt            1b                          // loop until filter_length is consumed        b.gt            1b                          // loop until filter_length is consumed
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
faddp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values faddp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator st1 {v0.s}[0], [x0], #4 // write accumulator
ret ret
endfunc endfunc
function ff_resample_common_apply_filter_x4_s16_neon, export=1 function ff_resample_common_apply_filter_x4_s16_neon, export=1
movi v0.4S, #0 // accumulator movi v0.4s, #0 // accumulator
1: ld1 {v1.4H}, [x1], #8 // src[0..3] 1: ld1 {v1.4h}, [x1], #8 // src[0..3]
ld1 {v2.4H}, [x2], #8 // filter[0..3] ld1 {v2.4h}, [x2], #8 // filter[0..3]
smlal v0.4S, v1.4H, v2.4H // accumulator += src[0..3] * filter[0..3] smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
subs w3, w3, #4 // filter_length -= 4 subs w3, w3, #4 // filter_length -= 4
        b.gt            1b                          // loop until filter_length is consumed        b.gt            1b                          // loop until filter_length is consumed
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator st1 {v0.s}[0], [x0], #4 // write accumulator
ret ret
endfunc endfunc
function ff_resample_common_apply_filter_x8_s16_neon, export=1 function ff_resample_common_apply_filter_x8_s16_neon, export=1
movi v0.4S, #0 // accumulator movi v0.4s, #0 // accumulator
1: ld1 {v1.8H}, [x1], #16 // src[0..7] 1: ld1 {v1.8h}, [x1], #16 // src[0..7]
ld1 {v2.8H}, [x2], #16 // filter[0..7] ld1 {v2.8h}, [x2], #16 // filter[0..7]
smlal v0.4S, v1.4H, v2.4H // accumulator += src[0..3] * filter[0..3] smlal v0.4s, v1.4h, v2.4h // accumulator += src[0..3] * filter[0..3]
smlal2 v0.4S, v1.8H, v2.8H // accumulator += src[4..7] * filter[4..7] smlal2 v0.4s, v1.8h, v2.8h // accumulator += src[4..7] * filter[4..7]
subs w3, w3, #8 // filter_length -= 8 subs w3, w3, #8 // filter_length -= 8
        b.gt            1b                          // loop until filter_length is consumed        b.gt            1b                          // loop until filter_length is consumed
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
addp v0.4S, v0.4S, v0.4S // pair adding of the 4x32-bit accumulated values addp v0.4s, v0.4s, v0.4s // pair adding of the 4x32-bit accumulated values
st1 {v0.S}[0], [x0], #4 // write accumulator st1 {v0.s}[0], [x0], #4 // write accumulator
ret ret
endfunc endfunc
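
All four kernels above share one idea: a dot product of src against filter, with the single result written through the first pointer (a float accumulator for the float variants, a 32-bit integer for the s16 ones). Sketch with illustrative signatures, describing only what the stores above do:

    #include <stdint.h>

    static void apply_filter_float_sketch(float *acc, const float *src,
                                          const float *filter, int len)
    {
        float sum = 0.0f;
        for (int i = 0; i < len; i++)
            sum += src[i] * filter[i];
        *acc = sum;
    }

    static void apply_filter_s16_sketch(int32_t *acc, const int16_t *src,
                                        const int16_t *filter, int len)
    {
        int32_t sum = 0;
        for (int i = 0; i < len; i++)
            sum += src[i] * filter[i];
        *acc = sum;
    }
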

@ -50,43 +50,43 @@ function ff_hscale8to15_X8_neon, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2 add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2 add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0]) movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1]) movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2]) movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3]) movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0] add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w0, UXTW // srcp + filterPos[1] add x8, x3, w0, UXTW // srcp + filterPos[1]
add x0, x3, w11, UXTW // srcp + filterPos[2] add x0, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3] add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter mov w15, w6 // filterSize counter
2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}] 2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}] ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v4.8H, v4.8B // unpack part 1 to 16-bit uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}] ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v6.8H, v6.8B // unpack part 2 to 16-bit uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
uxtl v16.8H, v16.8B // unpack part 3 to 16-bit uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}] ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v18.8H, v18.8B // unpack part 4 to 16-bit uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4 subs w2, w2, #4 // dstW -= 4
        sqshrn          v0.4H, v0.4S, #7            // shift and clip the 4x16-bit final values        sqshrn          v0.4h, v0.4s, #7            // shift and clip the 4x16-bit final values
st1 {v0.4H}, [x1], #8 // write to destination part0123 st1 {v0.4h}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line b.gt 1b // loop until end of line
ret ret
endfunc endfunc
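
The kernel above evaluates one horizontal scaling filter per output pixel: 8-bit source samples are widened to 16 bits, multiplied by 16-bit filter taps, accumulated at 32 bits, then shifted by 7 and saturated on the way down to 16 bits (which is what sqshrn does). A scalar sketch; the parameter list is illustrative and omits arguments the real entry point may take:

    #include <stdint.h>

    static void hscale8to15_sketch(int16_t *dst, int dstW, const uint8_t *src,
                                   const int16_t *filter,
                                   const int32_t *filterPos, int filterSize)
    {
        for (int i = 0; i < dstW; i++) {
            int32_t val = 0;
            for (int j = 0; j < filterSize; j++)
                val += src[filterPos[i] + j] * filter[i * filterSize + j];
            val >>= 7;                       /* sqshrn #7: shift ...        */
            if (val >  32767) val =  32767;  /* ... and saturate to 16 bits */
            if (val < -32768) val = -32768;
            dst[i] = (int16_t)val;
        }
    }
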
@ -245,7 +245,7 @@ function ff_hscale8to15_4_neon, export=1
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] } stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
1: 1:
ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] // transpose 8 bytes each from src into 4 registers ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers
// load 8 values from filterPos to be used as offsets into src // load 8 values from filterPos to be used as offsets into src
ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration ldp w8, w9, [x5] // filterPos[idx + 0][0..3], [idx + 1][0..3], next iteration
@ -253,74 +253,74 @@ function ff_hscale8to15_4_neon, export=1
ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration ldp w12, w13, [x5, #16] // filterPos[idx + 4][0..3], [idx + 5][0..3], next iteration
ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration ldp w14, w15, [x5, #24] // filterPos[idx + 6][0..3], [idx + 7][0..3], next iteration
movi v0.2D, #0 // Clear madd accumulator for idx 0..3 movi v0.2d, #0 // Clear madd accumulator for idx 0..3
movi v5.2D, #0 // Clear madd accumulator for idx 4..7 movi v5.2d, #0 // Clear madd accumulator for idx 4..7
ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
add x5, x5, #32 // advance filterPos add x5, x5, #32 // advance filterPos
// interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy // interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
        uxtl            v16.8H, v16.8B              // unsigned extend long, convert src data to 16-bit        uxtl            v16.8h, v16.8b              // unsigned extend long, convert src data to 16-bit
        uxtl            v17.8H, v17.8B              // unsigned extend long, convert src data to 16-bit        uxtl            v17.8h, v17.8b              // unsigned extend long, convert src data to 16-bit
ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]], next iteration ldr w8, [x3, w8, UXTW] // src[filterPos[idx + 0]], next iteration
ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]], next iteration ldr w9, [x3, w9, UXTW] // src[filterPos[idx + 1]], next iteration
        uxtl            v18.8H, v18.8B              // unsigned extend long, convert src data to 16-bit        uxtl            v18.8h, v18.8b              // unsigned extend long, convert src data to 16-bit
        uxtl            v19.8H, v19.8B              // unsigned extend long, convert src data to 16-bit        uxtl            v19.8h, v19.8b              // unsigned extend long, convert src data to 16-bit
ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]], next iteration ldr w10, [x3, w10, UXTW] // src[filterPos[idx + 2]], next iteration
ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]], next iteration ldr w11, [x3, w11, UXTW] // src[filterPos[idx + 3]], next iteration
smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3 smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3 smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]], next iteration ldr w12, [x3, w12, UXTW] // src[filterPos[idx + 4]], next iteration
ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]], next iteration ldr w13, [x3, w13, UXTW] // src[filterPos[idx + 5]], next iteration
smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3 smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3 smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]], next iteration ldr w14, [x3, w14, UXTW] // src[filterPos[idx + 6]], next iteration
ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]], next iteration ldr w15, [x3, w15, UXTW] // src[filterPos[idx + 7]], next iteration
smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7 smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7 smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] } stp w8, w9, [sp] // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] } stp w10, w11, [sp, #8] // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7 smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7 smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] } stp w12, w13, [sp, #16] // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] } stp w14, w15, [sp, #24] // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
sub w2, w2, #8 // dstW -= 8 sub w2, w2, #8 // dstW -= 8
        sqshrn          v0.4H, v0.4S, #7            // shift and clip the 4x16-bit final values        sqshrn          v0.4h, v0.4s, #7            // shift and clip the 4x16-bit final values
        sqshrn          v1.4H, v5.4S, #7            // shift and clip the 4x16-bit final values        sqshrn          v1.4h, v5.4s, #7            // shift and clip the 4x16-bit final values
st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7] st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
cmp w2, #16 // continue on main loop if there are at least 16 iterations left cmp w2, #16 // continue on main loop if there are at least 16 iterations left
b.ge 1b b.ge 1b
// last full iteration // last full iteration
ld4 {v16.8B, v17.8B, v18.8B, v19.8B}, [sp] ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
ld4 {v1.8H, v2.8H, v3.8H, v4.8H}, [x4], #64 // load filter idx + 0..7 ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
movi v0.2D, #0 // Clear madd accumulator for idx 0..3 movi v0.2d, #0 // Clear madd accumulator for idx 0..3
movi v5.2D, #0 // Clear madd accumulator for idx 4..7 movi v5.2d, #0 // Clear madd accumulator for idx 4..7
        uxtl            v16.8H, v16.8B              // unsigned extend long, convert src data to 16-bit        uxtl            v16.8h, v16.8b              // unsigned extend long, convert src data to 16-bit
        uxtl            v17.8H, v17.8B              // unsigned extend long, convert src data to 16-bit        uxtl            v17.8h, v17.8b              // unsigned extend long, convert src data to 16-bit
        uxtl            v18.8H, v18.8B              // unsigned extend long, convert src data to 16-bit        uxtl            v18.8h, v18.8b              // unsigned extend long, convert src data to 16-bit
        uxtl            v19.8H, v19.8B              // unsigned extend long, convert src data to 16-bit        uxtl            v19.8h, v19.8b              // unsigned extend long, convert src data to 16-bit
smlal v0.4S, v1.4H, v16.4H // multiply accumulate inner loop j = 0, idx = 0..3 smlal v0.4s, v1.4h, v16.4h // multiply accumulate inner loop j = 0, idx = 0..3
smlal v0.4S, v2.4H, v17.4H // multiply accumulate inner loop j = 1, idx = 0..3 smlal v0.4s, v2.4h, v17.4h // multiply accumulate inner loop j = 1, idx = 0..3
smlal v0.4S, v3.4H, v18.4H // multiply accumulate inner loop j = 2, idx = 0..3 smlal v0.4s, v3.4h, v18.4h // multiply accumulate inner loop j = 2, idx = 0..3
smlal v0.4S, v4.4H, v19.4H // multiply accumulate inner loop j = 3, idx = 0..3 smlal v0.4s, v4.4h, v19.4h // multiply accumulate inner loop j = 3, idx = 0..3
smlal2 v5.4S, v1.8H, v16.8H // multiply accumulate inner loop j = 0, idx = 4..7 smlal2 v5.4s, v1.8h, v16.8h // multiply accumulate inner loop j = 0, idx = 4..7
smlal2 v5.4S, v2.8H, v17.8H // multiply accumulate inner loop j = 1, idx = 4..7 smlal2 v5.4s, v2.8h, v17.8h // multiply accumulate inner loop j = 1, idx = 4..7
smlal2 v5.4S, v3.8H, v18.8H // multiply accumulate inner loop j = 2, idx = 4..7 smlal2 v5.4s, v3.8h, v18.8h // multiply accumulate inner loop j = 2, idx = 4..7
smlal2 v5.4S, v4.8H, v19.8H // multiply accumulate inner loop j = 3, idx = 4..7 smlal2 v5.4s, v4.8h, v19.8h // multiply accumulate inner loop j = 3, idx = 4..7
subs w2, w2, #8 // dstW -= 8 subs w2, w2, #8 // dstW -= 8
        sqshrn          v0.4H, v0.4S, #7            // shift and clip the 4x16-bit final values        sqshrn          v0.4h, v0.4s, #7            // shift and clip the 4x16-bit final values
        sqshrn          v1.4H, v5.4S, #7            // shift and clip the 4x16-bit final values        sqshrn          v1.4h, v5.4s, #7            // shift and clip the 4x16-bit final values
st1 {v0.4H, v1.4H}, [x1], #16 // write to dst[idx + 0..7] st1 {v0.4h, v1.4h}, [x1], #16 // write to dst[idx + 0..7]
cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section cbnz w2, 2f // if >0 iterations remain, jump to the wrap up section
@ -332,15 +332,15 @@ function ff_hscale8to15_4_neon, export=1
// load src // load src
ldr w8, [x5], #4 // filterPos[i] ldr w8, [x5], #4 // filterPos[i]
add x9, x3, w8, UXTW // calculate the address for src load add x9, x3, w8, UXTW // calculate the address for src load
ld1 {v5.S}[0], [x9] // src[filterPos[i] + 0..3] ld1 {v5.s}[0], [x9] // src[filterPos[i] + 0..3]
// load filter // load filter
ld1 {v6.4H}, [x4], #8 // filter[filterSize * i + 0..3] ld1 {v6.4h}, [x4], #8 // filter[filterSize * i + 0..3]
        uxtl            v5.8H, v5.8B                // unsigned extend long, convert src data to 16-bit        uxtl            v5.8h, v5.8b                // unsigned extend long, convert src data to 16-bit
smull v0.4S, v5.4H, v6.4H // 4 iterations of src[...] * filter[...] smull v0.4s, v5.4h, v6.4h // 4 iterations of src[...] * filter[...]
addv s0, v0.4S // add up products of src and filter values addv s0, v0.4s // add up products of src and filter values
        sqshrn          h0, s0, #7                  // shift and clip the 16-bit final value        sqshrn          h0, s0, #7                  // shift and clip the 16-bit final value
st1 {v0.H}[0], [x1], #2 // dst[i] = ... st1 {v0.h}[0], [x1], #2 // dst[i] = ...
sub w2, w2, #1 // dstW-- sub w2, w2, #1 // dstW--
cbnz w2, 2b cbnz w2, 2b
@ -445,12 +445,12 @@ function ff_hscale8to19_4_neon, export=1
smull v5.4s, v0.4h, v28.4h smull v5.4s, v0.4h, v28.4h
smull2 v6.4s, v0.8h, v28.8h smull2 v6.4s, v0.8h, v28.8h
uxtl v2.8h, v2.8b uxtl v2.8h, v2.8b
smlal v5.4s, v1.4h, v29.4H smlal v5.4s, v1.4h, v29.4h
smlal2 v6.4s, v1.8h, v29.8H smlal2 v6.4s, v1.8h, v29.8h
uxtl v3.8h, v3.8b uxtl v3.8h, v3.8b
smlal v5.4s, v2.4h, v30.4H smlal v5.4s, v2.4h, v30.4h
smlal2 v6.4s, v2.8h, v30.8H smlal2 v6.4s, v2.8h, v30.8h
smlal v5.4s, v3.4h, v31.4H smlal v5.4s, v3.4h, v31.4h
smlal2 v6.4s, v3.8h, v31.8h smlal2 v6.4s, v3.8h, v31.8h
sshr v5.4s, v5.4s, #3 sshr v5.4s, v5.4s, #3
@ -472,8 +472,8 @@ function ff_hscale8to19_4_neon, export=1
ld1 {v0.s}[0], [x9] // load 4 * uint8_t* into one single ld1 {v0.s}[0], [x9] // load 4 * uint8_t* into one single
ld1 {v31.4h}, [x4], #8 ld1 {v31.4h}, [x4], #8
uxtl v0.8h, v0.8b uxtl v0.8h, v0.8b
smull v5.4s, v0.4h, v31.4H smull v5.4s, v0.4h, v31.4h
saddlv d0, v5.4S saddlv d0, v5.4s
sqshrn s0, d0, #3 sqshrn s0, d0, #3
smin v0.4s, v0.4s, v18.4s smin v0.4s, v0.4s, v18.4s
st1 {v0.s}[0], [x1], #4 st1 {v0.s}[0], [x1], #4
@ -499,42 +499,42 @@ function ff_hscale8to19_X8_neon, export=1
ldr w11, [x5], #4 // filterPos[idx + 2] ldr w11, [x5], #4 // filterPos[idx + 2]
add x4, x13, x7 // filter3 = filter2 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2
ldr w9, [x5], #4 // filterPos[idx + 3] ldr w9, [x5], #4 // filterPos[idx + 3]
movi v0.2D, #0 // val sum part 1 (for dst[0]) movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1]) movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2]) movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3]) movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0] add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w0, UXTW // srcp + filterPos[1] add x8, x3, w0, UXTW // srcp + filterPos[1]
add x0, x3, w11, UXTW // srcp + filterPos[2] add x0, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3] add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter mov w15, w6 // filterSize counter
2: ld1 {v4.8B}, [x17], #8 // srcp[filterPos[0] + {0..7}] 2: ld1 {v4.8b}, [x17], #8 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
uxtl v4.8H, v4.8B // unpack part 1 to 16-bit uxtl v4.8h, v4.8b // unpack part 1 to 16-bit
smlal v0.4S, v4.4H, v5.4H // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}] smlal v0.4s, v4.4h, v5.4h // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
ld1 {v6.8B}, [x8], #8 // srcp[filterPos[1] + {0..7}] ld1 {v6.8b}, [x8], #8 // srcp[filterPos[1] + {0..7}]
smlal2 v0.4S, v4.8H, v5.8H // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}] smlal2 v0.4s, v4.8h, v5.8h // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
ld1 {v16.8B}, [x0], #8 // srcp[filterPos[2] + {0..7}] ld1 {v16.8b}, [x0], #8 // srcp[filterPos[2] + {0..7}]
uxtl v6.8H, v6.8B // unpack part 2 to 16-bit uxtl v6.8h, v6.8b // unpack part 2 to 16-bit
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v16.8H, v16.8B // unpack part 3 to 16-bit uxtl v16.8h, v16.8b // unpack part 3 to 16-bit
smlal v1.4S, v6.4H, v7.4H // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] smlal v1.4s, v6.4h, v7.4h // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v18.8B}, [x11], #8 // srcp[filterPos[3] + {0..7}] ld1 {v18.8b}, [x11], #8 // srcp[filterPos[3] + {0..7}]
smlal v2.4S, v16.4H, v17.4H // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] smlal v2.4s, v16.4h, v17.4h // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
smlal2 v2.4S, v16.8H, v17.8H // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] smlal2 v2.4s, v16.8h, v17.8h // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
uxtl v18.8H, v18.8B // unpack part 4 to 16-bit uxtl v18.8h, v18.8b // unpack part 4 to 16-bit
smlal2 v1.4S, v6.8H, v7.8H // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] smlal2 v1.4s, v6.8h, v7.8h // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
smlal v3.4S, v18.4H, v19.4H // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] smlal v3.4s, v18.4h, v19.4h // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
subs w15, w15, #8 // j -= 8: processed 8/filterSize subs w15, w15, #8 // j -= 8: processed 8/filterSize
smlal2 v3.4S, v18.8H, v19.8H // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] smlal2 v3.4s, v18.8h, v19.8h // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4 subs w2, w2, #4 // dstW -= 4
        sshr            v0.4s, v0.4S, #3            // shift the 4x32-bit final values        sshr            v0.4s, v0.4s, #3            // shift the 4x32-bit final values
smin v0.4s, v0.4s, v20.4s smin v0.4s, v0.4s, v20.4s
st1 {v0.4s}, [x1], #16 // write to destination part0123 st1 {v0.4s}, [x1], #16 // write to destination part0123
b.gt 1b // loop until end of line b.gt 1b // loop until end of line
@ -588,16 +588,16 @@ function ff_hscale8to19_X4_neon, export=1
smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0 smlal2 v16.4s, v4.8h, v31.8h // multiplication of upper half for idx 0
ldr d6, [x10], #8 // load src values for idx 2 ldr d6, [x10], #8 // load src values for idx 2
ldr q29, [x14, x16] // load filter values for idx 2 ldr q29, [x14, x16] // load filter values for idx 2
smlal v17.4s, v5.4h, v30.4H // multiplication of lower half for idx 1 smlal v17.4s, v5.4h, v30.4h // multiplication of lower half for idx 1
ldr d7, [x11], #8 // load src values for idx 3 ldr d7, [x11], #8 // load src values for idx 3
smlal2 v17.4s, v5.8h, v30.8H // multiplication of upper half for idx 1 smlal2 v17.4s, v5.8h, v30.8h // multiplication of upper half for idx 1
        uxtl            v6.8h, v6.8B                // extend type to match the filter's size        uxtl            v6.8h, v6.8b                // extend type to match the filter's size
ldr q28, [x15, x16] // load filter values for idx 3 ldr q28, [x15, x16] // load filter values for idx 3
smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2 smlal v18.4s, v6.4h, v29.4h // multiplication of lower half for idx 2
uxtl v7.8h, v7.8B uxtl v7.8h, v7.8b
smlal2 v18.4s, v6.8h, v29.8H // multiplication of upper half for idx 2 smlal2 v18.4s, v6.8h, v29.8h // multiplication of upper half for idx 2
sub w0, w0, #8 sub w0, w0, #8
smlal v19.4s, v7.4h, v28.4H // multiplication of lower half for idx 3 smlal v19.4s, v7.4h, v28.4h // multiplication of lower half for idx 3
cmp w0, #8 cmp w0, #8
smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3 smlal2 v19.4s, v7.8h, v28.8h // multiplication of upper half for idx 3
add x16, x16, #16 // advance filter values indexing add x16, x16, #16 // advance filter values indexing
@ -618,11 +618,11 @@ function ff_hscale8to19_X4_neon, export=1
        uxtl            v5.8h, v5.8b                // extend type to match the filter's size        uxtl            v5.8h, v5.8b                // extend type to match the filter's size
ldr s6, [x10] // load src values for idx 2 ldr s6, [x10] // load src values for idx 2
smlal v17.4s, v5.4h, v30.4h smlal v17.4s, v5.4h, v30.4h
uxtl v6.8h, v6.8B // extend type to match the filter's size uxtl v6.8h, v6.8b // extend type to match the filter's size
ldr d29, [x14, x17] // load filter values for idx 2 ldr d29, [x14, x17] // load filter values for idx 2
ldr s7, [x11] // load src values for idx 3 ldr s7, [x11] // load src values for idx 3
addp v16.4s, v16.4s, v17.4s addp v16.4s, v16.4s, v17.4s
uxtl v7.8h, v7.8B uxtl v7.8h, v7.8b
ldr d28, [x15, x17] // load filter values for idx 3 ldr d28, [x15, x17] // load filter values for idx 3
smlal v18.4s, v6.4h, v29.4h smlal v18.4s, v6.4h, v29.4h
smlal v19.4s, v7.4h, v28.4h smlal v19.4s, v7.4h, v28.4h
@ -700,31 +700,31 @@ function ff_hscale16to15_4_neon_asm, export=1
        // Extending to 32 bits is necessary, as uint16_t values can't        // Extending to 32 bits is necessary, as uint16_t values can't
// be represented as int16_t without type promotion. // be represented as int16_t without type promotion.
uxtl v26.4s, v0.4h uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s mul v6.4s, v0.4s, v28.4s
sxtl v27.4s, v29.4H sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h uxtl2 v0.4s, v1.8h
mla v5.4s, v27.4s, v26.4s mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v29.8H sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h uxtl v26.4s, v2.4h
mla v6.4s, v28.4s, v0.4s mla v6.4s, v28.4s, v0.4s
sxtl v27.4s, v30.4H sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h uxtl2 v0.4s, v2.8h
mla v5.4s, v27.4s, v26.4s mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v30.8H sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h uxtl v26.4s, v3.4h
mla v6.4s, v28.4s, v0.4s mla v6.4s, v28.4s, v0.4s
sxtl v27.4s, v31.4H sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h uxtl2 v0.4s, v3.8h
mla v5.4s, v27.4s, v26.4s mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v31.8H sxtl2 v28.4s, v31.8h
sub w2, w2, #8 sub w2, w2, #8
mla v6.4s, v28.4s, v0.4s mla v6.4s, v28.4s, v0.4s
@ -775,31 +775,31 @@ function ff_hscale16to15_4_neon_asm, export=1
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
uxtl v26.4s, v0.4h uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s mul v6.4s, v0.4s, v28.4s
sxtl v27.4s, v29.4H sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h uxtl2 v0.4s, v1.8h
mla v5.4s, v26.4s, v27.4s mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v29.8H sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h uxtl v26.4s, v2.4h
mla v6.4s, v0.4s, v28.4s mla v6.4s, v0.4s, v28.4s
sxtl v27.4s, v30.4H sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h uxtl2 v0.4s, v2.8h
mla v5.4s, v26.4s, v27.4s mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v30.8H sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h uxtl v26.4s, v3.4h
mla v6.4s, v0.4s, v28.4s mla v6.4s, v0.4s, v28.4s
sxtl v27.4s, v31.4H sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h uxtl2 v0.4s, v3.8h
mla v5.4s, v26.4s, v27.4s mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v31.8H sxtl2 v28.4s, v31.8h
subs w2, w2, #8 subs w2, w2, #8
mla v6.4s, v0.4s, v28.4s mla v6.4s, v0.4s, v28.4s
@ -807,7 +807,7 @@ function ff_hscale16to15_4_neon_asm, export=1
sshl v6.4s, v6.4s, v17.4s sshl v6.4s, v6.4s, v17.4s
smin v5.4s, v5.4s, v18.4s smin v5.4s, v5.4s, v18.4s
smin v6.4s, v6.4s, v18.4s smin v6.4s, v6.4s, v18.4s
xtn v5.4h, v5.4S xtn v5.4h, v5.4s
xtn2 v5.8h, v6.4s xtn2 v5.8h, v6.4s
st1 {v5.8h}, [x1], #16 st1 {v5.8h}, [x1], #16
@ -826,7 +826,7 @@ function ff_hscale16to15_4_neon_asm, export=1
uxtl v0.4s, v0.4h uxtl v0.4s, v0.4h
sxtl v31.4s, v31.4h sxtl v31.4s, v31.4h
mul v5.4s, v0.4s, v31.4s mul v5.4s, v0.4s, v31.4s
addv s0, v5.4S addv s0, v5.4s
sshl v0.4s, v0.4s, v17.4s sshl v0.4s, v0.4s, v17.4s
smin v0.4s, v0.4s, v18.4s smin v0.4s, v0.4s, v18.4s
st1 {v0.h}[0], [x1], #2 st1 {v0.h}[0], [x1], #2
@ -865,58 +865,58 @@ function ff_hscale16to15_X8_neon_asm, export=1
add x12, x16, x7 // filter1 = filter0 + filterSize*2 add x12, x16, x7 // filter1 = filter0 + filterSize*2
add x13, x12, x7 // filter2 = filter1 + filterSize*2 add x13, x12, x7 // filter2 = filter1 + filterSize*2
add x4, x13, x7 // filter3 = filter2 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0]) movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1]) movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2]) movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3]) movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0] add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w10, UXTW // srcp + filterPos[1] add x8, x3, w10, UXTW // srcp + filterPos[1]
add x10, x3, w11, UXTW // srcp + filterPos[2] add x10, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3] add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter mov w15, w6 // filterSize counter
2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}] 2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}] ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5 mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
        sxtl            v27.4s, v7.4H               // extend filter lower half        sxtl            v27.4s, v7.4h               // extend filter lower half
uxtl2 v6.4s, v6.8H // extend srcp upper half uxtl2 v6.4s, v6.8h // extend srcp upper half
sxtl2 v7.4s, v7.8h // extend filter upper half sxtl2 v7.4s, v7.8h // extend filter upper half
ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}] ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4H // extend srcp lower half uxtl v22.4s, v16.4h // extend srcp lower half
sxtl v23.4s, v17.4H // extend filter lower half sxtl v23.4s, v17.4h // extend filter lower half
uxtl2 v16.4s, v16.8H // extend srcp upper half uxtl2 v16.4s, v16.8h // extend srcp upper half
sxtl2 v17.4s, v17.8h // extend filter upper half sxtl2 v17.4s, v17.8h // extend filter upper half
mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}] ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v28.4s, v18.4H // extend srcp lower half uxtl v28.4s, v18.4h // extend srcp lower half
sxtl v29.4s, v19.4H // extend filter lower half sxtl v29.4s, v19.4h // extend filter lower half
uxtl2 v18.4s, v18.8H // extend srcp upper half uxtl2 v18.4s, v18.8h // extend srcp upper half
sxtl2 v19.4s, v19.8h // extend filter upper half sxtl2 v19.4s, v19.8h // extend filter upper half
mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4 subs w2, w2, #4 // dstW -= 4
        sshl            v0.4s, v0.4s, v21.4s        // shift left (effectively right, as shift is negative); overflow expected        sshl            v0.4s, v0.4s, v21.4s        // shift left (effectively right, as shift is negative); overflow expected
smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl) smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
xtn v0.4h, v0.4s // narrow down to 16 bits xtn v0.4h, v0.4s // narrow down to 16 bits
st1 {v0.4H}, [x1], #8 // write to destination part0123 st1 {v0.4h}, [x1], #8 // write to destination part0123
b.gt 1b // loop until end of line b.gt 1b // loop until end of line
ret ret
endfunc endfunc
@ -1108,31 +1108,31 @@ function ff_hscale16to19_4_neon_asm, export=1
        // Extending to 32 bits is necessary, as uint16_t values can't        // Extending to 32 bits is necessary, as uint16_t values can't
// be represented as int16_t without type promotion. // be represented as int16_t without type promotion.
uxtl v26.4s, v0.4h uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s mul v6.4s, v0.4s, v28.4s
sxtl v27.4s, v29.4H sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h uxtl2 v0.4s, v1.8h
mla v5.4s, v27.4s, v26.4s mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v29.8H sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h uxtl v26.4s, v2.4h
mla v6.4s, v28.4s, v0.4s mla v6.4s, v28.4s, v0.4s
sxtl v27.4s, v30.4H sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h uxtl2 v0.4s, v2.8h
mla v5.4s, v27.4s, v26.4s mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v30.8H sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h uxtl v26.4s, v3.4h
mla v6.4s, v28.4s, v0.4s mla v6.4s, v28.4s, v0.4s
sxtl v27.4s, v31.4H sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h uxtl2 v0.4s, v3.8h
mla v5.4s, v27.4s, v26.4s mla v5.4s, v27.4s, v26.4s
sxtl2 v28.4s, v31.8H sxtl2 v28.4s, v31.8h
sub w2, w2, #8 sub w2, w2, #8
mla v6.4s, v28.4s, v0.4s mla v6.4s, v28.4s, v0.4s
@ -1181,31 +1181,31 @@ function ff_hscale16to19_4_neon_asm, export=1
ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
uxtl v26.4s, v0.4h uxtl v26.4s, v0.4h
sxtl v27.4s, v28.4H sxtl v27.4s, v28.4h
uxtl2 v0.4s, v0.8h uxtl2 v0.4s, v0.8h
mul v5.4s, v26.4s, v27.4s mul v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v28.8H sxtl2 v28.4s, v28.8h
uxtl v26.4s, v1.4h uxtl v26.4s, v1.4h
mul v6.4s, v0.4s, v28.4s mul v6.4s, v0.4s, v28.4s
sxtl v27.4s, v29.4H sxtl v27.4s, v29.4h
uxtl2 v0.4s, v1.8h uxtl2 v0.4s, v1.8h
mla v5.4s, v26.4s, v27.4s mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v29.8H sxtl2 v28.4s, v29.8h
uxtl v26.4s, v2.4h uxtl v26.4s, v2.4h
mla v6.4s, v0.4s, v28.4s mla v6.4s, v0.4s, v28.4s
sxtl v27.4s, v30.4H sxtl v27.4s, v30.4h
uxtl2 v0.4s, v2.8h uxtl2 v0.4s, v2.8h
mla v5.4s, v26.4s, v27.4s mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v30.8H sxtl2 v28.4s, v30.8h
uxtl v26.4s, v3.4h uxtl v26.4s, v3.4h
mla v6.4s, v0.4s, v28.4s mla v6.4s, v0.4s, v28.4s
sxtl v27.4s, v31.4H sxtl v27.4s, v31.4h
uxtl2 v0.4s, v3.8h uxtl2 v0.4s, v3.8h
mla v5.4s, v26.4s, v27.4s mla v5.4s, v26.4s, v27.4s
sxtl2 v28.4s, v31.8H sxtl2 v28.4s, v31.8h
subs w2, w2, #8 subs w2, w2, #8
mla v6.4s, v0.4s, v28.4s mla v6.4s, v0.4s, v28.4s
@ -1232,7 +1232,7 @@ function ff_hscale16to19_4_neon_asm, export=1
sxtl v31.4s, v31.4h sxtl v31.4s, v31.4h
subs w2, w2, #1 subs w2, w2, #1
mul v5.4s, v0.4s, v31.4s mul v5.4s, v0.4s, v31.4s
addv s0, v5.4S addv s0, v5.4s
sshl v0.4s, v0.4s, v17.4s sshl v0.4s, v0.4s, v17.4s
smin v0.4s, v0.4s, v18.4s smin v0.4s, v0.4s, v18.4s
st1 {v0.s}[0], [x1], #4 st1 {v0.s}[0], [x1], #4
@ -1270,52 +1270,52 @@ function ff_hscale16to19_X8_neon_asm, export=1
add x13, x12, x7 // filter2 = filter1 + filterSize*2 add x13, x12, x7 // filter2 = filter1 + filterSize*2
lsl w10, w10, #1 lsl w10, w10, #1
add x4, x13, x7 // filter3 = filter2 + filterSize*2 add x4, x13, x7 // filter3 = filter2 + filterSize*2
movi v0.2D, #0 // val sum part 1 (for dst[0]) movi v0.2d, #0 // val sum part 1 (for dst[0])
movi v1.2D, #0 // val sum part 2 (for dst[1]) movi v1.2d, #0 // val sum part 2 (for dst[1])
movi v2.2D, #0 // val sum part 3 (for dst[2]) movi v2.2d, #0 // val sum part 3 (for dst[2])
movi v3.2D, #0 // val sum part 4 (for dst[3]) movi v3.2d, #0 // val sum part 4 (for dst[3])
add x17, x3, w8, UXTW // srcp + filterPos[0] add x17, x3, w8, UXTW // srcp + filterPos[0]
add x8, x3, w10, UXTW // srcp + filterPos[1] add x8, x3, w10, UXTW // srcp + filterPos[1]
add x10, x3, w11, UXTW // srcp + filterPos[2] add x10, x3, w11, UXTW // srcp + filterPos[2]
add x11, x3, w9, UXTW // srcp + filterPos[3] add x11, x3, w9, UXTW // srcp + filterPos[3]
mov w15, w6 // filterSize counter mov w15, w6 // filterSize counter
2: ld1 {v4.8H}, [x17], #16 // srcp[filterPos[0] + {0..7}] 2: ld1 {v4.8h}, [x17], #16 // srcp[filterPos[0] + {0..7}]
ld1 {v5.8H}, [x16], #16 // load 8x16-bit filter values, part 1 ld1 {v5.8h}, [x16], #16 // load 8x16-bit filter values, part 1
ld1 {v6.8H}, [x8], #16 // srcp[filterPos[1] + {0..7}] ld1 {v6.8h}, [x8], #16 // srcp[filterPos[1] + {0..7}]
ld1 {v7.8H}, [x12], #16 // load 8x16-bit at filter+filterSize ld1 {v7.8h}, [x12], #16 // load 8x16-bit at filter+filterSize
uxtl v24.4s, v4.4H // extend srcp lower half to 32 bits to preserve sign uxtl v24.4s, v4.4h // extend srcp lower half to 32 bits to preserve sign
sxtl v25.4s, v5.4H // extend filter lower half to 32 bits to match srcp size sxtl v25.4s, v5.4h // extend filter lower half to 32 bits to match srcp size
uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits uxtl2 v4.4s, v4.8h // extend srcp upper half to 32 bits
mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5 mla v0.4s, v24.4s, v25.4s // multiply accumulate lower half of v4 * v5
sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits sxtl2 v5.4s, v5.8h // extend filter upper half to 32 bits
uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits uxtl v26.4s, v6.4h // extend srcp lower half to 32 bits
mla v0.4S, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5 mla v0.4s, v4.4s, v5.4s // multiply accumulate upper half of v4 * v5
        sxtl            v27.4s, v7.4H               // extend filter lower half        sxtl            v27.4s, v7.4h               // extend filter lower half
uxtl2 v6.4s, v6.8H // extend srcp upper half uxtl2 v6.4s, v6.8h // extend srcp upper half
sxtl2 v7.4s, v7.8h // extend filter upper half sxtl2 v7.4s, v7.8h // extend filter upper half
ld1 {v16.8H}, [x10], #16 // srcp[filterPos[2] + {0..7}] ld1 {v16.8h}, [x10], #16 // srcp[filterPos[2] + {0..7}]
mla v1.4S, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}] mla v1.4s, v26.4s, v27.4s // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
ld1 {v17.8H}, [x13], #16 // load 8x16-bit at filter+2*filterSize ld1 {v17.8h}, [x13], #16 // load 8x16-bit at filter+2*filterSize
uxtl v22.4s, v16.4H // extend srcp lower half uxtl v22.4s, v16.4h // extend srcp lower half
sxtl v23.4s, v17.4H // extend filter lower half sxtl v23.4s, v17.4h // extend filter lower half
uxtl2 v16.4s, v16.8H // extend srcp upper half uxtl2 v16.4s, v16.8h // extend srcp upper half
sxtl2 v17.4s, v17.8h // extend filter upper half sxtl2 v17.4s, v17.8h // extend filter upper half
mla v2.4S, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}] mla v2.4s, v22.4s, v23.4s // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
mla v2.4S, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}] mla v2.4s, v16.4s, v17.4s // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
ld1 {v18.8H}, [x11], #16 // srcp[filterPos[3] + {0..7}] ld1 {v18.8h}, [x11], #16 // srcp[filterPos[3] + {0..7}]
mla v1.4S, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}] mla v1.4s, v6.4s, v7.4s // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
ld1 {v19.8H}, [x4], #16 // load 8x16-bit at filter+3*filterSize ld1 {v19.8h}, [x4], #16 // load 8x16-bit at filter+3*filterSize
subs w15, w15, #8 // j -= 8: processed 8/filterSize subs w15, w15, #8 // j -= 8: processed 8/filterSize
uxtl v28.4s, v18.4H // extend srcp lower half uxtl v28.4s, v18.4h // extend srcp lower half
sxtl v29.4s, v19.4H // extend filter lower half sxtl v29.4s, v19.4h // extend filter lower half
uxtl2 v18.4s, v18.8H // extend srcp upper half uxtl2 v18.4s, v18.8h // extend srcp upper half
sxtl2 v19.4s, v19.8h // extend filter upper half sxtl2 v19.4s, v19.8h // extend filter upper half
mla v3.4S, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}] mla v3.4s, v28.4s, v29.4s // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
mla v3.4S, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}] mla v3.4s, v18.4s, v19.4s // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
b.gt 2b // inner loop if filterSize not consumed completely b.gt 2b // inner loop if filterSize not consumed completely
addp v0.4S, v0.4S, v1.4S // part01 horizontal pair adding addp v0.4s, v0.4s, v1.4s // part01 horizontal pair adding
addp v2.4S, v2.4S, v3.4S // part23 horizontal pair adding addp v2.4s, v2.4s, v3.4s // part23 horizontal pair adding
addp v0.4S, v0.4S, v2.4S // part0123 horizontal pair adding addp v0.4s, v0.4s, v2.4s // part0123 horizontal pair adding
subs w2, w2, #4 // dstW -= 4 subs w2, w2, #4 // dstW -= 4
                sshl            v0.4s, v0.4s, v21.4s        // shift left by a negative amount (effectively a right shift); overflow expected                sshl            v0.4s, v0.4s, v21.4s        // shift left by a negative amount (effectively a right shift); overflow expected
smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl) smin v0.4s, v0.4s, v20.4s // apply min (do not use sqshl)
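
For reference, the inner product this hscale hunk implements can be read off the comments: four outputs are produced per iteration, each as a sum of srcp[filterPos[i] + j] * filter[i*filterSize + j], shifted and clamped. A rough C sketch follows; the helper name, and the shift/clamp parameters (kept in v21 and v20/v18, set up outside the quoted lines), are illustrative assumptions, not taken from this diff.

    #include <stdint.h>

    /* Illustrative model of the X8 horizontal-scale loop above. The NEON code
     * accumulates in 32 bits and tolerates overflow on the shift; the clamp is
     * presumably the 19-bit limit, given the function name. */
    static void hscale16to19_ref(int32_t *dst, int dstW,
                                 const uint16_t *src, const int32_t *filterPos,
                                 const int16_t *filter, int filterSize,
                                 int shift, int32_t max_val)
    {
        for (int i = 0; i < dstW; i++) {
            const uint16_t *srcp = src + filterPos[i];
            int64_t val = 0;
            for (int j = 0; j < filterSize; j++)   /* 8 taps per NEON iteration */
                val += (int64_t)srcp[j] * filter[i * filterSize + j];
            val >>= shift;                         /* sshl by a negative amount */
            if (val > max_val)                     /* smin: apply the max       */
                val = max_val;
            dst[i] = (int32_t)val;
        }
    }
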

@ -29,13 +29,13 @@ function ff_yuv2planeX_8_neon, export=1
// x5 - const uint8_t *dither, // x5 - const uint8_t *dither,
// w6 - int offset // w6 - int offset
ld1 {v0.8B}, [x5] // load 8x8-bit dither ld1 {v0.8b}, [x5] // load 8x8-bit dither
and w6, w6, #7 and w6, w6, #7
cbz w6, 1f // check if offsetting present cbz w6, 1f // check if offsetting present
ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
1: uxtl v0.8H, v0.8B // extend dither to 16-bit 1: uxtl v0.8h, v0.8b // extend dither to 16-bit
ushll v1.4S, v0.4H, #12 // extend dither to 32-bit with left shift by 12 (part 1) ushll v1.4s, v0.4h, #12 // extend dither to 32-bit with left shift by 12 (part 1)
ushll2 v2.4S, v0.8H, #12 // extend dither to 32-bit with left shift by 12 (part 2) ushll2 v2.4s, v0.8h, #12 // extend dither to 32-bit with left shift by 12 (part 2)
cmp w1, #8 // if filterSize == 8, branch to specialized version cmp w1, #8 // if filterSize == 8, branch to specialized version
b.eq 6f b.eq 6f
cmp w1, #4 // if filterSize == 4, branch to specialized version cmp w1, #4 // if filterSize == 4, branch to specialized version
@ -48,8 +48,8 @@ function ff_yuv2planeX_8_neon, export=1
mov x7, #0 // i = 0 mov x7, #0 // i = 0
tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version tbnz w1, #0, 4f // if filterSize % 2 != 0 branch to specialized version
// fs % 2 == 0 // fs % 2 == 0
2: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value 2: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
mov w8, w1 // tmpfilterSize = filterSize mov w8, w1 // tmpfilterSize = filterSize
mov x9, x2 // srcp = src mov x9, x2 // srcp = src
mov x10, x0 // filterp = filter mov x10, x0 // filterp = filter
@ -57,12 +57,12 @@ function ff_yuv2planeX_8_neon, export=1
ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1] ldr s7, [x10], #4 // read 2x16-bit coeff X and Y at filter[j] and filter[j+1]
add x11, x11, x7, lsl #1 // &src[j ][i] add x11, x11, x7, lsl #1 // &src[j ][i]
add x12, x12, x7, lsl #1 // &src[j+1][i] add x12, x12, x7, lsl #1 // &src[j+1][i]
ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
ld1 {v6.8H}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P ld1 {v6.8h}, [x12] // read 8x16-bit @ src[j+1][i + {0..7}]: I,J,K,L,M,N,O,P
smlal v3.4S, v5.4H, v7.H[0] // val0 += {A,B,C,D} * X smlal v3.4s, v5.4h, v7.h[0] // val0 += {A,B,C,D} * X
smlal2 v4.4S, v5.8H, v7.H[0] // val1 += {E,F,G,H} * X smlal2 v4.4s, v5.8h, v7.h[0] // val1 += {E,F,G,H} * X
smlal v3.4S, v6.4H, v7.H[1] // val0 += {I,J,K,L} * Y smlal v3.4s, v6.4h, v7.h[1] // val0 += {I,J,K,L} * Y
smlal2 v4.4S, v6.8H, v7.H[1] // val1 += {M,N,O,P} * Y smlal2 v4.4s, v6.8h, v7.h[1] // val1 += {M,N,O,P} * Y
subs w8, w8, #2 // tmpfilterSize -= 2 subs w8, w8, #2 // tmpfilterSize -= 2
b.gt 3b // loop until filterSize consumed b.gt 3b // loop until filterSize consumed
@ -77,17 +77,17 @@ function ff_yuv2planeX_8_neon, export=1
// If filter size is odd (most likely == 1), then use this section. // If filter size is odd (most likely == 1), then use this section.
// fs % 2 != 0 // fs % 2 != 0
4: mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value 4: mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
mov w8, w1 // tmpfilterSize = filterSize mov w8, w1 // tmpfilterSize = filterSize
mov x9, x2 // srcp = src mov x9, x2 // srcp = src
mov x10, x0 // filterp = filter mov x10, x0 // filterp = filter
5: ldr x11, [x9], #8 // get 1 pointer: src[j] 5: ldr x11, [x9], #8 // get 1 pointer: src[j]
                ldr             h6, [x10], #2               // read 1x16-bit coeff X at filter[j]                ldr             h6, [x10], #2               // read 1x16-bit coeff X at filter[j]
add x11, x11, x7, lsl #1 // &src[j ][i] add x11, x11, x7, lsl #1 // &src[j ][i]
ld1 {v5.8H}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H ld1 {v5.8h}, [x11] // read 8x16-bit @ src[j ][i + {0..7}]: A,B,C,D,E,F,G,H
smlal v3.4S, v5.4H, v6.H[0] // val0 += {A,B,C,D} * X smlal v3.4s, v5.4h, v6.h[0] // val0 += {A,B,C,D} * X
smlal2 v4.4S, v5.8H, v6.H[0] // val1 += {E,F,G,H} * X smlal2 v4.4s, v5.8h, v6.h[0] // val1 += {E,F,G,H} * X
                subs            w8, w8, #1                  // tmpfilterSize -= 1                subs            w8, w8, #1                  // tmpfilterSize -= 1
b.gt 5b // loop until filterSize consumed b.gt 5b // loop until filterSize consumed
@ -107,36 +107,36 @@ function ff_yuv2planeX_8_neon, export=1
ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7] ldp x12, x13, [x2, #48] // load 2 pointers: src[j+6] and src[j+7]
// load 8x16-bit values for filter[j], where j=0..7 // load 8x16-bit values for filter[j], where j=0..7
ld1 {v6.8H}, [x0] ld1 {v6.8h}, [x0]
7: 7:
mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}] ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
ld1 {v28.8H}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}] ld1 {v28.8h}, [x10], #16 // load 8x16-bit values for src[j + 4][i + {0..7}]
ld1 {v29.8H}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}] ld1 {v29.8h}, [x11], #16 // load 8x16-bit values for src[j + 5][i + {0..7}]
ld1 {v30.8H}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}] ld1 {v30.8h}, [x12], #16 // load 8x16-bit values for src[j + 6][i + {0..7}]
ld1 {v31.8H}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}] ld1 {v31.8h}, [x13], #16 // load 8x16-bit values for src[j + 7][i + {0..7}]
smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0] smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0] smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1] smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1] smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2] smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2] smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3] smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3] smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
smlal v3.4S, v28.4H, v6.H[4] // val0 += src[4][i + {0..3}] * filter[4] smlal v3.4s, v28.4h, v6.h[4] // val0 += src[4][i + {0..3}] * filter[4]
smlal2 v4.4S, v28.8H, v6.H[4] // val1 += src[4][i + {4..7}] * filter[4] smlal2 v4.4s, v28.8h, v6.h[4] // val1 += src[4][i + {4..7}] * filter[4]
smlal v3.4S, v29.4H, v6.H[5] // val0 += src[5][i + {0..3}] * filter[5] smlal v3.4s, v29.4h, v6.h[5] // val0 += src[5][i + {0..3}] * filter[5]
smlal2 v4.4S, v29.8H, v6.H[5] // val1 += src[5][i + {4..7}] * filter[5] smlal2 v4.4s, v29.8h, v6.h[5] // val1 += src[5][i + {4..7}] * filter[5]
smlal v3.4S, v30.4H, v6.H[6] // val0 += src[6][i + {0..3}] * filter[6] smlal v3.4s, v30.4h, v6.h[6] // val0 += src[6][i + {0..3}] * filter[6]
smlal2 v4.4S, v30.8H, v6.H[6] // val1 += src[6][i + {4..7}] * filter[6] smlal2 v4.4s, v30.8h, v6.h[6] // val1 += src[6][i + {4..7}] * filter[6]
smlal v3.4S, v31.4H, v6.H[7] // val0 += src[7][i + {0..3}] * filter[7] smlal v3.4s, v31.4h, v6.h[7] // val0 += src[7][i + {0..3}] * filter[7]
smlal2 v4.4S, v31.8H, v6.H[7] // val1 += src[7][i + {4..7}] * filter[7] smlal2 v4.4s, v31.8h, v6.h[7] // val1 += src[7][i + {4..7}] * filter[7]
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@ -151,24 +151,24 @@ function ff_yuv2planeX_8_neon, export=1
ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3] ldp x7, x9, [x2, #16] // load 2 pointers: src[j+2] and src[j+3]
// load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes // load 4x16-bit values for filter[j], where j=0..3 and replicated across lanes
ld1 {v6.4H}, [x0] ld1 {v6.4h}, [x0]
9: 9:
mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
ld1 {v26.8H}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}] ld1 {v26.8h}, [x7], #16 // load 8x16-bit values for src[j + 2][i + {0..7}]
ld1 {v27.8H}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}] ld1 {v27.8h}, [x9], #16 // load 8x16-bit values for src[j + 3][i + {0..7}]
smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0] smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0] smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1] smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1] smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
smlal v3.4S, v26.4H, v6.H[2] // val0 += src[2][i + {0..3}] * filter[2] smlal v3.4s, v26.4h, v6.h[2] // val0 += src[2][i + {0..3}] * filter[2]
smlal2 v4.4S, v26.8H, v6.H[2] // val1 += src[2][i + {4..7}] * filter[2] smlal2 v4.4s, v26.8h, v6.h[2] // val1 += src[2][i + {4..7}] * filter[2]
smlal v3.4S, v27.4H, v6.H[3] // val0 += src[3][i + {0..3}] * filter[3] smlal v3.4s, v27.4h, v6.h[3] // val0 += src[3][i + {0..3}] * filter[3]
smlal2 v4.4S, v27.8H, v6.H[3] // val1 += src[3][i + {4..7}] * filter[3] smlal2 v4.4s, v27.8h, v6.h[3] // val1 += src[3][i + {4..7}] * filter[3]
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
@ -184,16 +184,16 @@ function ff_yuv2planeX_8_neon, export=1
// load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes // load 2x16-bit values for filter[j], where j=0..1 and replicated across lanes
ldr s6, [x0] ldr s6, [x0]
11: 11:
mov v3.16B, v1.16B // initialize accumulator part 1 with dithering value mov v3.16b, v1.16b // initialize accumulator part 1 with dithering value
mov v4.16B, v2.16B // initialize accumulator part 2 with dithering value mov v4.16b, v2.16b // initialize accumulator part 2 with dithering value
ld1 {v24.8H}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}] ld1 {v24.8h}, [x5], #16 // load 8x16-bit values for src[j + 0][i + {0..7}]
ld1 {v25.8H}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}] ld1 {v25.8h}, [x6], #16 // load 8x16-bit values for src[j + 1][i + {0..7}]
smlal v3.4S, v24.4H, v6.H[0] // val0 += src[0][i + {0..3}] * filter[0] smlal v3.4s, v24.4h, v6.h[0] // val0 += src[0][i + {0..3}] * filter[0]
smlal2 v4.4S, v24.8H, v6.H[0] // val1 += src[0][i + {4..7}] * filter[0] smlal2 v4.4s, v24.8h, v6.h[0] // val1 += src[0][i + {4..7}] * filter[0]
smlal v3.4S, v25.4H, v6.H[1] // val0 += src[1][i + {0..3}] * filter[1] smlal v3.4s, v25.4h, v6.h[1] // val0 += src[1][i + {0..3}] * filter[1]
smlal2 v4.4S, v25.8H, v6.H[1] // val1 += src[1][i + {4..7}] * filter[1] smlal2 v4.4s, v25.8h, v6.h[1] // val1 += src[1][i + {4..7}] * filter[1]
sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16) sqshrun v3.4h, v3.4s, #16 // clip16(val0>>16)
sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16) sqshrun2 v3.8h, v4.4s, #16 // clip16(val1>>16)
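
All the ff_yuv2planeX_8_neon variants above follow the same accumulation pattern spelled out in the comments: seed each 32-bit accumulator with the (rotated) dither value shifted left by 12, add src[j][i] * filter[j] for every input line, then clip(val >> 16). A small C sketch of that pattern, inferred from those comments (illustrative only; the final narrowing to the 8-bit destination happens after the quoted lines):

    #include <stdint.h>

    /* Illustrative model of one output sample's accumulation in yuv2planeX. */
    static uint16_t yuv2planeX_acc_ref(const int16_t *filter, int filterSize,
                                       const int16_t **src, int i,
                                       const uint8_t *dither, int offset)
    {
        int32_t val = dither[(i + offset) & 7] << 12;  /* ushll #12 on the dither  */
        for (int j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];              /* smlal / smlal2           */
        val >>= 16;                                    /* sqshrun #16: clip(val>>16) */
        if (val < 0)     val = 0;
        if (val > 65535) val = 65535;
        return (uint16_t)val;
    }
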
@ -210,11 +210,11 @@ function ff_yuv2plane1_8_neon, export=1
// w2 - int dstW, // w2 - int dstW,
// x3 - const uint8_t *dither, // x3 - const uint8_t *dither,
// w4 - int offset // w4 - int offset
ld1 {v0.8B}, [x3] // load 8x8-bit dither ld1 {v0.8b}, [x3] // load 8x8-bit dither
and w4, w4, #7 and w4, w4, #7
cbz w4, 1f // check if offsetting present cbz w4, 1f // check if offsetting present
ext v0.8B, v0.8B, v0.8B, #3 // honor offsetting which can be 0 or 3 only ext v0.8b, v0.8b, v0.8b, #3 // honor offsetting which can be 0 or 3 only
1:              uxtl            v0.8H, v0.8B                // extend dither to 16-bit 1:              uxtl            v0.8h, v0.8b                // extend dither to 16-bit
uxtl v1.4s, v0.4h uxtl v1.4s, v0.4h
uxtl2 v2.4s, v0.8h uxtl2 v2.4s, v0.8h
2: 2:
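
The dither setup shared by ff_yuv2planeX_8_neon and ff_yuv2plane1_8_neon above is the same: load 8 dither bytes, rotate them by the offset (which the comments say can only be 0 or 3), then widen them to seed the accumulators. A minimal sketch, with the rotation expressed per element and a generic shift parameter standing in for the ushll #12 used in yuv2planeX (names are illustrative):

    #include <stdint.h>

    /* Illustrative model of the dither load/rotate/widen sequence above. */
    static void load_dither_ref(const uint8_t *dither, int offset,
                                int32_t out[8], int shift)
    {
        offset &= 7;                                      /* and wN, wN, #7          */
        for (int k = 0; k < 8; k++)                       /* ext ..., #3 if nonzero  */
            out[k] = dither[(k + offset) & 7] << shift;   /* uxtl / ushll widening   */
    }
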

@ -33,9 +33,9 @@
.macro load_args_nv12 .macro load_args_nv12
ldr x8, [sp] // table ldr x8, [sp] // table
load_yoff_ycoeff 8, 16 // y_offset, y_coeff load_yoff_ycoeff 8, 16 // y_offset, y_coeff
ld1 {v1.1D}, [x8] ld1 {v1.1d}, [x8]
dup v0.8H, w10 dup v0.8h, w10
dup v3.8H, w9 dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY) sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0 // w7 = linesizeC - width (paddingC) sub w7, w7, w0 // w7 = linesizeC - width (paddingC)
@ -51,9 +51,9 @@
ldr w14, [sp, #8] // linesizeV ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff load_yoff_ycoeff 24, 32 // y_offset, y_coeff
ld1 {v1.1D}, [x8] ld1 {v1.1d}, [x8]
dup v0.8H, w10 dup v0.8h, w10
dup v3.8H, w9 dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY) sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU) sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@ -67,9 +67,9 @@
ldr w14, [sp, #8] // linesizeV ldr w14, [sp, #8] // linesizeV
ldr x8, [sp, #16] // table ldr x8, [sp, #16] // table
load_yoff_ycoeff 24, 32 // y_offset, y_coeff load_yoff_ycoeff 24, 32 // y_offset, y_coeff
ld1 {v1.1D}, [x8] ld1 {v1.1d}, [x8]
dup v0.8H, w10 dup v0.8h, w10
dup v3.8H, w9 dup v3.8h, w9
sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding)
sub w5, w5, w0 // w5 = linesizeY - width (paddingY) sub w5, w5, w0 // w5 = linesizeY - width (paddingY)
sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU) sub w7, w7, w0, lsr #1 // w7 = linesizeU - width / 2 (paddingU)
@ -77,22 +77,22 @@
.endm .endm
.macro load_chroma_nv12 .macro load_chroma_nv12
ld2 {v16.8B, v17.8B}, [x6], #16 ld2 {v16.8b, v17.8b}, [x6], #16
ushll v18.8H, v16.8B, #3 ushll v18.8h, v16.8b, #3
ushll v19.8H, v17.8B, #3 ushll v19.8h, v17.8b, #3
.endm .endm
.macro load_chroma_nv21 .macro load_chroma_nv21
ld2 {v16.8B, v17.8B}, [x6], #16 ld2 {v16.8b, v17.8b}, [x6], #16
ushll v19.8H, v16.8B, #3 ushll v19.8h, v16.8b, #3
ushll v18.8H, v17.8B, #3 ushll v18.8h, v17.8b, #3
.endm .endm
.macro load_chroma_yuv420p .macro load_chroma_yuv420p
ld1 {v16.8B}, [ x6], #8 ld1 {v16.8b}, [ x6], #8
ld1 {v17.8B}, [x13], #8 ld1 {v17.8b}, [x13], #8
ushll v18.8H, v16.8B, #3 ushll v18.8h, v16.8b, #3
ushll v19.8H, v17.8B, #3 ushll v19.8h, v17.8b, #3
.endm .endm
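
The load_chroma_* macros differ only in how U and V reach v18/v19: the NV12/NV21 cases deinterleave one packed plane with ld2 (U and V swapped for NV21), while the planar cases read two separate planes; in every case the bytes are widened and pre-scaled by 1 << 3 via ushll #3. A sketch of the NV12 case (illustrative; the reference helper is not part of this diff), and see the remaining macro below for the yuv422p variant:

    #include <stdint.h>

    /* Illustrative model of load_chroma_nv12: deinterleave 8 U/V byte pairs
     * and widen them pre-scaled by 8. */
    static void load_chroma_nv12_ref(const uint8_t *uv, int16_t U[8], int16_t V[8])
    {
        for (int k = 0; k < 8; k++) {
            U[k] = uv[2 * k + 0] << 3;   /* v18 = U * (1<<3) */
            V[k] = uv[2 * k + 1] << 3;   /* v19 = V * (1<<3) */
        }
    }
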
.macro load_chroma_yuv422p .macro load_chroma_yuv422p
@ -123,18 +123,18 @@
.endm .endm
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2 .macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
add v20.8H, v26.8H, v20.8H // Y1 + R1 add v20.8h, v26.8h, v20.8h // Y1 + R1
add v21.8H, v27.8H, v21.8H // Y2 + R2 add v21.8h, v27.8h, v21.8h // Y2 + R2
add v22.8H, v26.8H, v22.8H // Y1 + G1 add v22.8h, v26.8h, v22.8h // Y1 + G1
add v23.8H, v27.8H, v23.8H // Y2 + G2 add v23.8h, v27.8h, v23.8h // Y2 + G2
add v24.8H, v26.8H, v24.8H // Y1 + B1 add v24.8h, v26.8h, v24.8h // Y1 + B1
add v25.8H, v27.8H, v25.8H // Y2 + B2 add v25.8h, v27.8h, v25.8h // Y2 + B2
sqrshrun \r1, v20.8H, #1 // clip_u8((Y1 + R1) >> 1) sqrshrun \r1, v20.8h, #1 // clip_u8((Y1 + R1) >> 1)
        sqrshrun        \r2, v21.8H, #1                     // clip_u8((Y2 + R2) >> 1)        sqrshrun        \r2, v21.8h, #1                     // clip_u8((Y2 + R2) >> 1)
sqrshrun \g1, v22.8H, #1 // clip_u8((Y1 + G1) >> 1) sqrshrun \g1, v22.8h, #1 // clip_u8((Y1 + G1) >> 1)
        sqrshrun        \g2, v23.8H, #1                     // clip_u8((Y2 + G2) >> 1)        sqrshrun        \g2, v23.8h, #1                     // clip_u8((Y2 + G2) >> 1)
sqrshrun \b1, v24.8H, #1 // clip_u8((Y1 + B1) >> 1) sqrshrun \b1, v24.8h, #1 // clip_u8((Y1 + B1) >> 1)
        sqrshrun        \b2, v25.8H, #1                     // clip_u8((Y2 + B2) >> 1)        sqrshrun        \b2, v25.8h, #1                     // clip_u8((Y2 + B2) >> 1)
movi \a1, #255 movi \a1, #255
movi \a2, #255 movi \a2, #255
.endm .endm
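
The compute_rgba macro above is the final combine step: each channel is clip_u8((Y' + C + 1) >> 1), with the alpha lanes forced to 255. Combined with the chroma centering and luma scaling in the following hunk, the per-pixel math reads roughly as below. This is a C sketch inferred from the comments; the helper names and scalar coefficients (mirroring the v2r/u2g/v2g/u2b, y_offset and y_coeff labels) are assumptions, and sqdmulh's saturation is omitted.

    #include <stdint.h>

    /* (2*a*b) >> 16, i.e. the high half of a doubling multiply, without saturation. */
    static inline int sqdmulh16(int a, int b) { return (2 * a * b) >> 16; }
    static inline uint8_t clip_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    /* Illustrative per-pixel model of the NEON yuv->rgba path above. */
    static void yuv2rgba_pixel(uint8_t y, uint8_t u, uint8_t v,
                               int v2r, int u2g, int v2g, int u2b,
                               int y_offset, int y_coeff,
                               uint8_t *r, uint8_t *g, uint8_t *b, uint8_t *a)
    {
        int U = (u << 3) - (128 << 3);                 /* U*(1<<3) - 128*(1<<3)         */
        int V = (v << 3) - (128 << 3);
        int R = sqdmulh16(V, v2r);                     /* V * v2r                       */
        int G = sqdmulh16(U, u2g) + sqdmulh16(V, v2g); /* U * u2g + V * v2g             */
        int B = sqdmulh16(U, u2b);                     /* U * u2b                       */
        int Y = sqdmulh16((y << 3) - y_offset, y_coeff);
        *r = clip_u8((Y + R + 1) >> 1);                /* sqrshrun #1: rounded >>1, clip */
        *g = clip_u8((Y + G + 1) >> 1);
        *b = clip_u8((Y + B + 1) >> 1);
        *a = 255;                                      /* movi #255                      */
    }
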
@ -146,47 +146,47 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1
1: 1:
mov w8, w0 // w8 = width mov w8, w0 // w8 = width
2: 2:
movi v5.8H, #4, lsl #8 // 128 * (1<<3) movi v5.8h, #4, lsl #8 // 128 * (1<<3)
load_chroma_\ifmt load_chroma_\ifmt
sub v18.8H, v18.8H, v5.8H // U*(1<<3) - 128*(1<<3) sub v18.8h, v18.8h, v5.8h // U*(1<<3) - 128*(1<<3)
sub v19.8H, v19.8H, v5.8H // V*(1<<3) - 128*(1<<3) sub v19.8h, v19.8h, v5.8h // V*(1<<3) - 128*(1<<3)
sqdmulh v20.8H, v19.8H, v1.H[0] // V * v2r (R) sqdmulh v20.8h, v19.8h, v1.h[0] // V * v2r (R)
sqdmulh v22.8H, v18.8H, v1.H[1] // U * u2g sqdmulh v22.8h, v18.8h, v1.h[1] // U * u2g
sqdmulh v19.8H, v19.8H, v1.H[2] // V * v2g sqdmulh v19.8h, v19.8h, v1.h[2] // V * v2g
add v22.8H, v22.8H, v19.8H // U * u2g + V * v2g (G) add v22.8h, v22.8h, v19.8h // U * u2g + V * v2g (G)
sqdmulh v24.8H, v18.8H, v1.H[3] // U * u2b (B) sqdmulh v24.8h, v18.8h, v1.h[3] // U * u2b (B)
zip2 v21.8H, v20.8H, v20.8H // R2 zip2 v21.8h, v20.8h, v20.8h // R2
zip1 v20.8H, v20.8H, v20.8H // R1 zip1 v20.8h, v20.8h, v20.8h // R1
zip2 v23.8H, v22.8H, v22.8H // G2 zip2 v23.8h, v22.8h, v22.8h // G2
zip1 v22.8H, v22.8H, v22.8H // G1 zip1 v22.8h, v22.8h, v22.8h // G1
zip2 v25.8H, v24.8H, v24.8H // B2 zip2 v25.8h, v24.8h, v24.8h // B2
zip1 v24.8H, v24.8H, v24.8H // B1 zip1 v24.8h, v24.8h, v24.8h // B1
ld1 {v2.16B}, [x4], #16 // load luma ld1 {v2.16b}, [x4], #16 // load luma
ushll v26.8H, v2.8B, #3 // Y1*(1<<3) ushll v26.8h, v2.8b, #3 // Y1*(1<<3)
ushll2 v27.8H, v2.16B, #3 // Y2*(1<<3) ushll2 v27.8h, v2.16b, #3 // Y2*(1<<3)
sub v26.8H, v26.8H, v3.8H // Y1*(1<<3) - y_offset sub v26.8h, v26.8h, v3.8h // Y1*(1<<3) - y_offset
sub v27.8H, v27.8H, v3.8H // Y2*(1<<3) - y_offset sub v27.8h, v27.8h, v3.8h // Y2*(1<<3) - y_offset
sqdmulh v26.8H, v26.8H, v0.8H // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15 sqdmulh v26.8h, v26.8h, v0.8h // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
sqdmulh v27.8H, v27.8H, v0.8H // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15 sqdmulh v27.8h, v27.8h, v0.8h // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
.ifc \ofmt,argb // 1 2 3 0 .ifc \ofmt,argb // 1 2 3 0
compute_rgba v5.8B,v6.8B,v7.8B,v4.8B, v17.8B,v18.8B,v19.8B,v16.8B compute_rgba v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif .endif
.ifc \ofmt,rgba // 0 1 2 3 .ifc \ofmt,rgba // 0 1 2 3
compute_rgba v4.8B,v5.8B,v6.8B,v7.8B, v16.8B,v17.8B,v18.8B,v19.8B compute_rgba v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif .endif
.ifc \ofmt,abgr // 3 2 1 0 .ifc \ofmt,abgr // 3 2 1 0
compute_rgba v7.8B,v6.8B,v5.8B,v4.8B, v19.8B,v18.8B,v17.8B,v16.8B compute_rgba v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif .endif
.ifc \ofmt,bgra // 2 1 0 3 .ifc \ofmt,bgra // 2 1 0 3
compute_rgba v6.8B,v5.8B,v4.8B,v7.8B, v18.8B,v17.8B,v16.8B,v19.8B compute_rgba v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif .endif
st4 { v4.8B, v5.8B, v6.8B, v7.8B}, [x2], #32 st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
st4 {v16.8B,v17.8B,v18.8B,v19.8B}, [x2], #32 st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
subs w8, w8, #16 // width -= 16 subs w8, w8, #16 // width -= 16
b.gt 2b b.gt 2b
add x2, x2, w3, SXTW // dst += padding add x2, x2, w3, SXTW // dst += padding