ARM: fix corner-case overflow in H.264 weighted prediction
Originally committed as revision 17657 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
5fa61b26f7
commit
fe7f149ed8
@ -1539,77 +1539,75 @@ function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
|
|||||||
|
|
||||||
@ Weighted prediction
|
@ Weighted prediction
|
||||||
|
|
||||||
.macro weight_16 mac
|
.macro weight_16 add
|
||||||
vdup.8 d0, r3
|
vdup.8 d0, r3
|
||||||
vmov q2, q8
|
|
||||||
vmov q3, q8
|
|
||||||
1: subs ip, ip, #2
|
1: subs ip, ip, #2
|
||||||
vld1.8 {d20-d21},[r0,:128], r1
|
vld1.8 {d20-d21},[r0,:128], r1
|
||||||
\mac q2, d0, d20
|
vmull.u8 q2, d0, d20
|
||||||
pld [r0]
|
pld [r0]
|
||||||
\mac q3, d0, d21
|
vmull.u8 q3, d0, d21
|
||||||
vmov q12, q8
|
|
||||||
vld1.8 {d28-d29},[r0,:128], r1
|
vld1.8 {d28-d29},[r0,:128], r1
|
||||||
vmov q13, q8
|
vmull.u8 q12, d0, d28
|
||||||
\mac q12, d0, d28
|
|
||||||
pld [r0]
|
pld [r0]
|
||||||
\mac q13, d0, d29
|
vmull.u8 q13, d0, d29
|
||||||
vshl.s16 q2, q2, q9
|
\add q2, q8, q2
|
||||||
vshl.s16 q3, q3, q9
|
vrshl.s16 q2, q2, q9
|
||||||
|
\add q3, q8, q3
|
||||||
|
vrshl.s16 q3, q3, q9
|
||||||
vqmovun.s16 d4, q2
|
vqmovun.s16 d4, q2
|
||||||
vqmovun.s16 d5, q3
|
vqmovun.s16 d5, q3
|
||||||
vshl.s16 q12, q12, q9
|
\add q12, q8, q12
|
||||||
vshl.s16 q13, q13, q9
|
vrshl.s16 q12, q12, q9
|
||||||
|
\add q13, q8, q13
|
||||||
|
vrshl.s16 q13, q13, q9
|
||||||
vqmovun.s16 d24, q12
|
vqmovun.s16 d24, q12
|
||||||
vqmovun.s16 d25, q13
|
vqmovun.s16 d25, q13
|
||||||
vmov q3, q8
|
|
||||||
vst1.8 {d4- d5}, [r4,:128], r1
|
vst1.8 {d4- d5}, [r4,:128], r1
|
||||||
vmov q2, q8
|
|
||||||
vst1.8 {d24-d25},[r4,:128], r1
|
vst1.8 {d24-d25},[r4,:128], r1
|
||||||
bne 1b
|
bne 1b
|
||||||
pop {r4, pc}
|
pop {r4, pc}
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro weight_8 mac
|
.macro weight_8 add
|
||||||
vdup.8 d0, r3
|
vdup.8 d0, r3
|
||||||
vmov q1, q8
|
|
||||||
vmov q10, q8
|
|
||||||
1: subs ip, ip, #2
|
1: subs ip, ip, #2
|
||||||
vld1.8 {d4},[r0,:64], r1
|
vld1.8 {d4},[r0,:64], r1
|
||||||
\mac q1, d0, d4
|
vmull.u8 q1, d0, d4
|
||||||
pld [r0]
|
pld [r0]
|
||||||
vld1.8 {d6},[r0,:64], r1
|
vld1.8 {d6},[r0,:64], r1
|
||||||
\mac q10, d0, d6
|
vmull.u8 q10, d0, d6
|
||||||
|
\add q1, q8, q1
|
||||||
pld [r0]
|
pld [r0]
|
||||||
vshl.s16 q1, q1, q9
|
vrshl.s16 q1, q1, q9
|
||||||
vqmovun.s16 d2, q1
|
vqmovun.s16 d2, q1
|
||||||
vshl.s16 q10, q10, q9
|
\add q10, q8, q10
|
||||||
|
vrshl.s16 q10, q10, q9
|
||||||
vqmovun.s16 d4, q10
|
vqmovun.s16 d4, q10
|
||||||
vmov q10, q8
|
|
||||||
vst1.8 {d2},[r4,:64], r1
|
vst1.8 {d2},[r4,:64], r1
|
||||||
vmov q1, q8
|
|
||||||
vst1.8 {d4},[r4,:64], r1
|
vst1.8 {d4},[r4,:64], r1
|
||||||
bne 1b
|
bne 1b
|
||||||
pop {r4, pc}
|
pop {r4, pc}
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro weight_4 mac
|
.macro weight_4 add
|
||||||
vdup.8 d0, r3
|
vdup.8 d0, r3
|
||||||
vmov q1, q8
|
vmov q1, q8
|
||||||
vmov q10, q8
|
vmov q10, q8
|
||||||
1: subs ip, ip, #4
|
1: subs ip, ip, #4
|
||||||
vld1.32 {d4[0]},[r0,:32], r1
|
vld1.32 {d4[0]},[r0,:32], r1
|
||||||
vld1.32 {d4[1]},[r0,:32], r1
|
vld1.32 {d4[1]},[r0,:32], r1
|
||||||
\mac q1, d0, d4
|
vmull.u8 q1, d0, d4
|
||||||
pld [r0]
|
pld [r0]
|
||||||
blt 2f
|
blt 2f
|
||||||
vld1.32 {d6[0]},[r0,:32], r1
|
vld1.32 {d6[0]},[r0,:32], r1
|
||||||
vld1.32 {d6[1]},[r0,:32], r1
|
vld1.32 {d6[1]},[r0,:32], r1
|
||||||
\mac q10, d0, d6
|
vmull.u8 q10, d0, d6
|
||||||
pld [r0]
|
pld [r0]
|
||||||
vshl.s16 q1, q1, q9
|
\add q1, q8, q1
|
||||||
|
vrshl.s16 q1, q1, q9
|
||||||
vqmovun.s16 d2, q1
|
vqmovun.s16 d2, q1
|
||||||
vshl.s16 q10, q10, q9
|
\add q10, q8, q10
|
||||||
|
vrshl.s16 q10, q10, q9
|
||||||
vqmovun.s16 d4, q10
|
vqmovun.s16 d4, q10
|
||||||
vmov q10, q8
|
vmov q10, q8
|
||||||
vst1.32 {d2[0]},[r4,:32], r1
|
vst1.32 {d2[0]},[r4,:32], r1
|
||||||
@ -1619,7 +1617,8 @@ function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
|
|||||||
vst1.32 {d4[1]},[r4,:32], r1
|
vst1.32 {d4[1]},[r4,:32], r1
|
||||||
bne 1b
|
bne 1b
|
||||||
pop {r4, pc}
|
pop {r4, pc}
|
||||||
2: vshl.s16 q1, q1, q9
|
2: \add q1, q8, q1
|
||||||
|
vrshl.s16 q1, q1, q9
|
||||||
vqmovun.s16 d2, q1
|
vqmovun.s16 d2, q1
|
||||||
vst1.32 {d2[0]},[r4,:32], r1
|
vst1.32 {d2[0]},[r4,:32], r1
|
||||||
vst1.32 {d2[1]},[r4,:32], r1
|
vst1.32 {d2[1]},[r4,:32], r1
|
||||||
@ -1630,19 +1629,25 @@ function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
|
|||||||
function weight_h264_pixels_\w\()_neon
|
function weight_h264_pixels_\w\()_neon
|
||||||
push {r4, lr}
|
push {r4, lr}
|
||||||
ldr r4, [sp, #8]
|
ldr r4, [sp, #8]
|
||||||
vdup.16 q9, r2
|
cmp r2, #1
|
||||||
mov lr, #1
|
|
||||||
lsl r4, r4, r2
|
lsl r4, r4, r2
|
||||||
subs r2, r2, #1
|
|
||||||
vneg.s16 q9, q9
|
|
||||||
addge r4, r4, lr, lsl r2
|
|
||||||
cmp r3, #0
|
|
||||||
vdup.16 q8, r4
|
vdup.16 q8, r4
|
||||||
mov r4, r0
|
mov r4, r0
|
||||||
|
ble 20f
|
||||||
|
rsb lr, r2, #1
|
||||||
|
vdup.16 q9, lr
|
||||||
|
cmp r3, #0
|
||||||
blt 10f
|
blt 10f
|
||||||
weight_\w vmlal.u8
|
weight_\w vhadd.s16
|
||||||
10: rsb r3, r3, #0
|
10: rsb r3, r3, #0
|
||||||
weight_\w vmlsl.u8
|
weight_\w vhsub.s16
|
||||||
|
20: rsb lr, r2, #0
|
||||||
|
vdup.16 q9, lr
|
||||||
|
cmp r3, #0
|
||||||
|
blt 10f
|
||||||
|
weight_\w vadd.s16
|
||||||
|
10: rsb r3, r3, #0
|
||||||
|
weight_\w vsub.s16
|
||||||
.endfunc
|
.endfunc
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user